diff --git a/.github/workflows/docs_and_style.yml b/.github/workflows/docs_and_style.yml index a64e4d7f2..3354f50fd 100644 --- a/.github/workflows/docs_and_style.yml +++ b/.github/workflows/docs_and_style.yml @@ -24,6 +24,23 @@ jobs: - name: Run app regression tests run: node test/app/run-app-regression-tests.cjs + backend-docs-drift: + # The backend reference doc (docs/dev/backends-reference.md) is generated from + # the self-describing backend descriptors. Build lemond, regenerate, and fail + # if the committed doc is stale — the same guarantee a lint provides. + runs-on: ubuntu-latest + concurrency: + group: ${{ github.workflow }}-backend-docs-${{ github.ref }} + cancel-in-progress: true + steps: + - uses: actions/checkout@v5 + - name: Configure and install build dependencies + run: ./setup.sh + - name: Build lemond + run: cmake --build --preset default --target lemond + - name: Check backend reference docs are up to date + run: python3 docs/tools/gen_backend_boilerplate.py --check + markdown-link-check: runs-on: ubuntu-latest concurrency: diff --git a/CMakeLists.txt b/CMakeLists.txt index 70c3bf352..2b7dee8a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -607,15 +607,6 @@ set(SOURCES_CORE src/cpp/server/utils/wmi_helper.cpp src/cpp/server/utils/network_beacon.cpp src/cpp/server/utils/tcp_jsonl_client.cpp - src/cpp/server/backends/cloud_server.cpp - src/cpp/server/backends/llamacpp_server.cpp - src/cpp/server/backends/fastflowlm_server.cpp - src/cpp/server/backends/ryzenaiserver.cpp - src/cpp/server/backends/whisper_server.cpp - src/cpp/server/backends/moonshine_server.cpp - src/cpp/server/backends/kokoro_server.cpp - src/cpp/server/backends/sd_server.cpp - src/cpp/server/backends/vllm_server.cpp src/cpp/server/backends/backend_utils.cpp src/cpp/server/backend_manager.cpp src/cpp/server/ollama_api.cpp @@ -647,6 +638,83 @@ elseif(UNIX) list(APPEND SOURCES_CORE src/cpp/server/utils/platform/process_unix.cpp) endif() +# 
============================================================ +# Self-describing backends registry +# ============================================================ +# The authoritative backend list. Each entry is "<recipe>|<stem>": +# recipe - the recipe string used in server_models.json (may contain dashes) +# stem - identifier-safe name and folder. Each backend lives in its own +# folder, shipping (in namespace lemon::backends::<stem>): +# include/lemon/backends/<stem>/<stem>.h inline const descriptor (CLI-safe data) +# include/lemon/backends/<stem>/<stem>_server.h WrappedServer subclass + create() decl +# server/backends/<stem>/<stem>_server.cpp implementation + create() def +# +# Adding a backend is one line here plus that folder. The foreach below compiles +# the server source and regenerates the registry headers, which bind each +# descriptor to its create(). Because this list is a tracked input, editing it +# forces regeneration on the next build (a file(GLOB) would silently miss a +# newly added backend). The descriptor is a header-only inline const, so it links +# into both the lemonade CLI and lemond; only lemond links the server sources. +set(LEMON_BACKENDS + # "<recipe>|<stem>" + "llamacpp|llamacpp" + "whispercpp|whispercpp" + "moonshine|moonshine" + "kokoro|kokoro" + "sd-cpp|sdcpp" + "flm|fastflowlm" + "ryzenai-llm|ryzenai" + "vllm|vllm" + "cloud|cloud" +) + +set(LEMON_DESCRIPTOR_INCLUDES "") +set(LEMON_DESCRIPTOR_ENTRIES "") +set(LEMON_FACTORY_INCLUDES "") +set(LEMON_FACTORY_ENTRIES "") +# The data registry (descriptors, header-only) links into both binaries; the +# factory registry + per-backend server sources are server-only. +# Absolute paths so the CLI subdirectory can reuse LEMON_BACKEND_DESCRIPTOR_SOURCES. 
+set(LEMON_BACKEND_DESCRIPTOR_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_descriptor_registry.cpp) +set(LEMON_BACKEND_FACTORY_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_registry.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_ops.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/hf_cache_util.cpp) +foreach(_backend_entry ${LEMON_BACKENDS}) + string(REPLACE "|" ";" _backend_parts "${_backend_entry}") + list(GET _backend_parts 1 _backend_stem) + # The descriptor is header-only (no source). Compile every .cpp in the + # backend's folder (server class + any backend-private helpers like GGUF + # parsing) — CONFIGURE_DEPENDS re-globs when a file is added/removed so a new + # helper in a folder needs no CMake edit. (The backend LIST is still explicit + # above so a whole new backend is never silently missed.) + file(GLOB _backend_srcs CONFIGURE_DEPENDS + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}/*.cpp) + list(APPEND LEMON_BACKEND_FACTORY_SOURCES ${_backend_srcs}) + string(APPEND LEMON_DESCRIPTOR_INCLUDES + "#include \"lemon/backends/${_backend_stem}/${_backend_stem}.h\"\n") + string(APPEND LEMON_DESCRIPTOR_ENTRIES + " &lemon::backends::${_backend_stem}::descriptor,\n") + string(APPEND LEMON_FACTORY_INCLUDES + "#include \"lemon/backends/${_backend_stem}/${_backend_stem}_server.h\"\n") + string(APPEND LEMON_FACTORY_ENTRIES + " { &lemon::backends::${_backend_stem}::descriptor, &lemon::backends::${_backend_stem}::create, lemon::backends::${_backend_stem}::spec(), lemon::backends::${_backend_stem}::ops() },\n") +endforeach() + +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_descriptors_generated.h.in + ${CMAKE_CURRENT_BINARY_DIR}/include/backend_descriptors_generated.h + @ONLY) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_factories_generated.h.in + 
${CMAKE_CURRENT_BINARY_DIR}/include/backend_factories_generated.h + @ONLY) + +# lemond gets both descriptor data and factories; the CLI gets only the data +# (see src/cpp/cli/CMakeLists.txt, which reuses LEMON_BACKEND_DESCRIPTOR_SOURCES). +list(APPEND SOURCES_CORE ${LEMON_BACKEND_DESCRIPTOR_SOURCES} ${LEMON_BACKEND_FACTORY_SOURCES}) + # ============================================================ # Server core OBJECT library (shared by lemond and Lemonade.exe) # ============================================================ diff --git a/README.md b/README.md index 38d9db6fe..2175b846e 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,7 @@ Use `lemonade pull` or the built-in **Model Manager** to download models. You ca Lemonade supports multiple inference engines for LLM, speech, TTS, and image generation, and each has its own backend and hardware requirements. + @@ -137,14 +138,14 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen - - - + + + - - - + + + @@ -152,49 +153,54 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen - - + + - - - + + + - - - + + + - + - + - + - - + + + + + + + - + @@ -202,28 +208,33 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen - + + + + + + - - + + - - - - - + + + - - + + + + @@ -231,13 +242,24 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen + + + + + + + + + +
Text generation llamacppvulkanx86_64 CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)Windows, Linuxsystemx86_64/ARM64 CPU, GPULinux
rocmSupported AMD ROCm iGPU/dGPU families*Windows, LinuxmetalApple Silicon GPUmacOS
cudaWindows, Linux
cpux86_64 CPU; ARM64 CPU (Linux)vulkanx86_64 CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux) Windows, Linux
metalApple Silicon GPUmacOSrocmSupported AMD ROCm iGPU/dGPU families*Windows, Linux
systemx86_64/ARM64 CPU, GPULinuxcpux86_64 CPU; ARM64 CPU (Linux)Windows, Linux
flmflm npu XDNA2 NPU Windows, Linux
ryzenai-llmryzenai-llm npu XDNA2 NPU Windows
vllm (experimental)vllm (experimental) rocm Strix Halo iGPU (gfx1151) Linux
Speech-to-textwhispercppSpeech-to-textwhispercpp npu XDNA2 NPU Windows
rocmSupported AMD ROCm iGPU/dGPU families*Windows, Linux
vulkan x86_64 CPULinuxWindows, Linux
cpuWindows, Linux
moonshinemetalApple Silicon GPUmacOS
moonshine cpu x86_64/arm64 CPU Windows, Linux, macOS
Text-to-speechkokoroText-to-speechkokoro cpu x86_64 CPU Windows, Linux
Image generationsd-cpprocmSupported AMD ROCm iGPU/dGPU families*Windows, LinuxmetalApple Silicon GPUmacOS
vulkanVulkan-capable GPUsImage generationsd-cpprocmSupported AMD ROCm iGPU/dGPU families* Windows, Linux
NVIDIA GPUs (Turing or newer)** Linux
vulkanVulkan-capable GPUsWindows, Linux
cpu x86_64 CPU Windows, Linux
metalApple Silicon GPUmacOS
+ To check exactly which recipes/backends are supported on your own machine, run: diff --git a/docs/assets/models.js b/docs/assets/models.js index 5bb604006..d9814cccb 100644 --- a/docs/assets/models.js +++ b/docs/assets/models.js @@ -2,25 +2,25 @@ const GITHUB_REPO = 'lemonade-sdk/lemonade'; const TAGS_URL = `https://api.github.com/repos/${GITHUB_REPO}/tags?per_page=100`; const RAW_BASE = 'https://raw.githubusercontent.com/lemonade-sdk/lemonade'; +/* BEGIN GENERATED: models-js-recipes */ const RECIPE_PRIORITY = [ 'llamacpp', 'ryzenai-llm', 'flm', 'whispercpp', 'sd-cpp', - 'oga-hybrid', - 'oga-npu', - 'oga-cpu', 'kokoro' ]; const RECIPE_DISPLAY_NAMES = { llamacpp: 'llama.cpp GPU', - 'ryzenai-llm': 'Ryzen AI SW NPU', - flm: 'FastFlowLM NPU', whispercpp: 'whisper.cpp', - 'sd-cpp': 'stable-diffusion.cpp' + 'sd-cpp': 'stable-diffusion.cpp', + flm: 'FastFlowLM NPU', + 'ryzenai-llm': 'Ryzen AI SW NPU', + vllm: 'vLLM ROCm (experimental)' }; +/* END GENERATED: models-js-recipes */ const state = { tag: null, diff --git a/docs/dev/adding-a-backend.md b/docs/dev/adding-a-backend.md new file mode 100644 index 000000000..7699f97e6 --- /dev/null +++ b/docs/dev/adding-a-backend.md @@ -0,0 +1,151 @@ +# Adding a backend + +Lemonade backends are **self-describing**. A backend declares *what it is* in a +plain-data **descriptor** and implements *how it runs* in a **server class**, and +both live together in the backend's own folder. A registry collects every +descriptor, and the router, the CLI, `/system-info`, and the generated docs all +read it — so there are no scattered `if (recipe == "...")` sites to update. 
+ +Adding a backend is **one folder plus three small appends**: + +| You edit | What goes there | +|----------|-----------------| +| `CMakeLists.txt` → `LEMON_BACKENDS` | **one line**: `"<recipe>\|<stem>"` | +| `src/cpp/include/lemon/backends/<stem>/<stem>.h` | the descriptor (header-only `inline const`) | +| `src/cpp/include/lemon/backends/<stem>/<stem>_server.h` | the `WrappedServer` subclass + `create()` declaration | +| `src/cpp/server/backends/<stem>/<stem>_server.cpp` | the implementation + `create()` definition | +| `src/cpp/resources/backend_versions.json` | version pin(s) — skip if there's no downloaded binary (e.g. cloud) | +| `src/cpp/resources/server_models.json` | the models | + +No router edits, no CLI edits, no doc edits, no support-matrix edits. + +Everything for one backend lives in `lemon::backends::<stem>`. The descriptor is +header-only so it links into **both** the `lemonade` CLI and `lemond`; the server +class and `create()` are server-only (compiled into `lemond`). + +## The descriptor — `<stem>/<stem>.h` + +Plain data. The single object the registry, CLI, `/system-info`, and docs all read. + +```cpp +#pragma once +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { namespace backends { namespace myrecipe { + +inline const BackendDescriptor descriptor = { + /*recipe*/ "myrecipe", + /*display_name*/ "My Backend", + /*binary*/ "my-server", // "" = no subprocess (e.g. 
cloud) + /*config_section*/ "myrecipe", // defaults to recipe + /*default_device*/ DEVICE_GPU, + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ false, // true auto-exposes "<recipe>_backend" + "--<stem>" + /*uses_ctx_size*/ true, // opt in to the shared ctx_size option + /*dynamic_models*/ false, // true = models discovered at runtime (cloud) + /*options*/ { // backend-specific knobs (common ones are automatic) + {"myrecipe_args", "--myrecipe-args", "", "ARGS", "Custom args to pass", "My Options"}, + }, + /*support*/ { // OS / device families ({} = no local gating) + {"myrecipe", "cpu", {"linux", "windows"}, {{"cpu", {"x86_64"}}}}, + }, + /*default_labels*/ {}, // labels injected when a model omits them + /*required_checkpoints*/ {"main"}, // unconditional files; conditional ones checked in load() +}; + +}}} // namespace lemon::backends::myrecipe +``` + +`SlotPolicy` controls accelerator sharing: `Standard` (counts toward LRU slots), +`ExclusiveNpu` (evicts all NPU servers first), `CoexistByType` (one per model +type), `Unmetered` (never counted, never auto-evicted — cloud). + +## The server class + factory — `<stem>/<stem>_server.{h,cpp}` + +The server class is a `WrappedServer` subclass. Implement `load()`, `unload()`, +and only the capability interfaces you serve (`ITranscriptionServer`, +`IImageServer`, `ITextToSpeechServer`, …). `WrappedServer` provides default +"unsupported" `chat_completion`/`completion`/`responses`, so a non-chat backend +does not stub them. Alongside it, a free `create()` builds the instance. 
+ +`<stem>_server.h`: + +```cpp +#pragma once +#include "lemon/backends/backend_registry.h" // BackendContext +#include "lemon/wrapped_server.h" + +namespace lemon { namespace backends { + +class MyServer : public WrappedServer, public ICompletionServer { + // load(), unload(), the capability methods you serve … +}; + +namespace myrecipe { +std::unique_ptr<WrappedServer> create(const BackendContext& ctx); // server-only +} + +}} // namespace lemon::backends +``` + +`<stem>_server.cpp`: + +```cpp +#include "lemon/backends/myrecipe/myrecipe_server.h" +// … MyServer method definitions … + +namespace lemon { namespace backends { namespace myrecipe { +std::unique_ptr<WrappedServer> create(const BackendContext& ctx) { + return std::make_unique<MyServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} +}}} // namespace lemon::backends::myrecipe +``` + +## Register it: one line + +```cmake +set(LEMON_BACKENDS + ... + "myrecipe|myrecipe" # "<recipe>|<stem>" +) +``` + +The `foreach` in `CMakeLists.txt` compiles `<stem>/<stem>_server.cpp` and +regenerates the registry headers, binding `<stem>::descriptor` to `<stem>::create`. + +## What you get for free + +- **Standard options:** `merge_args`, `auto_evict`, `evict_idle_timeout`, + `downsize_idle_timeout`, `evict_weight_factor`, `pinned`. `ctx_size` is opt-in + via `uses_ctx_size`. +- **Generated CLI flags** for every descriptor option with a `cli_flag`, plus + `--<stem>` when `selectable_backend = true`. +- **Install/download** via the backend's `BackendSpec` (binary + install params). +- **`/system-info`** `recipes` entry (display name, options schema, support matrix). +- **Generated docs** — your backend appears automatically in + [`backends-reference.md`](backends-reference.md), the README "Supported + Configurations" matrix, and the multi-model NPU-exclusivity list. A CI job + (`backend-docs-drift`) fails if the committed docs are stale. The descriptor's + `modality`, `experimental`, `web_display_name`, and each support row's + `device_summary` supply the editorial bits the matrix needs. 
+ +## Escape hatches + +| Need | Hook | +|------|------| +| Device depends on the chosen backend variant (whisper npu vs cpu) | override `WrappedServer::effective_device(opts)` | +| Eviction rule depends on the variant | override `WrappedServer::effective_slot_policy(opts)` | +| Availability decided at runtime (cloud creds) | override `WrappedServer::availability()` | +| Conditional / grouped checkpoints (sd-cpp flux, whisper npu_cache) | validate in `load()`; list only unconditional files in `required_checkpoints` | +| Custom per-model fields without editing `ModelInfo` | read `model_info.extra("my_field", fallback)` (populated from unknown `server_models.json` keys) | +| Models supplied at runtime, not from `server_models.json` | set `dynamic_models = true` and provide them in the class (see cloud's `discover_models()`) | +| Per-create setup before load (ryzenai `set_model_path`) | do it in `create()` | + +## The simplest end-to-end example + +**Moonshine** is the minimal case: a single descriptor option, no backend +selection, CPU-only, one capability interface. See +`src/cpp/server/backends/moonshine/` and `include/lemon/backends/moonshine/`. + +> Note: collections (`collection.omni`) are orchestrator-driven, not +> `WrappedServer` subprocesses, and are the one explicit exception to this model. 
diff --git a/docs/dev/backends-reference.md b/docs/dev/backends-reference.md new file mode 100644 index 000000000..3993fe8fe --- /dev/null +++ b/docs/dev/backends-reference.md @@ -0,0 +1,328 @@ +# Backend reference + + + +## Backends + + +| Recipe | Name | Selectable backend | Uses ctx_size | Backends | +|--------|------|--------------------|---------------|----------| +| `flm` | FastFlowLM NPU | no | yes | npu | +| `kokoro` | Kokoro | no | no | cpu, metal | +| `llamacpp` | Llama.cpp GPU | yes | yes | cpu, cuda, metal, rocm, system, vulkan | +| `moonshine` | Moonshine | no | no | cpu | +| `ryzenai-llm` | Ryzen AI LLM | no | yes | npu | +| `sd-cpp` | StableDiffusion.cpp | yes | no | cpu, cuda, metal, rocm, vulkan | +| `vllm` | vLLM ROCm (experimental) | yes | yes | rocm | +| `whispercpp` | Whisper.cpp | yes | no | cpu, metal, npu, rocm, vulkan | + + +## Support matrix + + +| Recipe | Backend | OS | Device families | +|--------|---------|----|-----------------| +| `flm` | npu | linux, windows | amd_npu (XDNA2) | +| `kokoro` | cpu | linux, windows | cpu (x86_64) | +| `kokoro` | metal | macos | metal | +| `llamacpp` | system | linux | cpu (arm64, x86_64) | +| `llamacpp` | metal | macos | metal | +| `llamacpp` | cuda | linux, windows | nvidia_gpu (sm_100, sm_120, sm_121, sm_75, sm_80, sm_86, sm_89, sm_90) | +| `llamacpp` | vulkan | linux, windows | amd_gpu; cpu (arm64, x86_64) | +| `llamacpp` | rocm | linux, windows | amd_gpu (gfx103X, gfx110X, gfx1150, gfx1151, gfx1152, gfx120X) | +| `llamacpp` | cpu | linux, windows | cpu (arm64, x86_64) | +| `moonshine` | cpu | windows | cpu (x86_64) | +| `moonshine` | cpu | linux | cpu (arm64, x86_64) | +| `moonshine` | cpu | macos | cpu (arm64) | +| `ryzenai-llm` | npu | windows | amd_npu (XDNA2) | +| `sd-cpp` | rocm | linux, windows | amd_gpu (gfx103X, gfx110X, gfx1150, gfx1151, gfx1152, gfx120X) | +| `sd-cpp` | cuda | linux | nvidia_gpu (sm_100, sm_120, sm_121, sm_75, sm_80, sm_86, sm_89, sm_90) | +| `sd-cpp` | vulkan | linux, 
windows | amd_gpu; cpu (x86_64); nvidia_gpu | +| `sd-cpp` | cpu | linux, windows | cpu (x86_64) | +| `sd-cpp` | metal | macos | metal | +| `vllm` | rocm | linux | amd_gpu (gfx110X, gfx1150, gfx1151, gfx120X) | +| `whispercpp` | npu | windows | amd_npu (XDNA2) | +| `whispercpp` | rocm | linux, windows | amd_gpu (gfx110X, gfx1150, gfx1151, gfx120X) | +| `whispercpp` | vulkan | linux, windows | amd_gpu; cpu (x86_64) | +| `whispercpp` | cpu | linux, windows | cpu (x86_64) | +| `whispercpp` | metal | macos | metal | + + +## Recipe options + + +#### `llamacpp` — Llama.cpp GPU + +| Option | CLI flag | Type | Default | Description | +|--------|----------|------|---------|-------------| +| `ctx_size` | `--ctx-size` | SIZE | -1 | Context size for the model | +| `llamacpp_backend` | `--llamacpp` | BACKEND | "" | LlamaCpp backend to use | +| `llamacpp_device` | `--llamacpp-device` | DEVICES | "" | Comma-separated list of accelerator devices to use (e.g. Vulkan0) | +| `llamacpp_args` | `--llamacpp-args` | ARGS | "" | Custom arguments to pass to llama-server | + +#### `moonshine` — Moonshine + +| Option | CLI flag | Type | Default | Description | +|--------|----------|------|---------|-------------| +| `moonshine_args` | `--moonshine-args` | ARGS | "" | Custom arguments to pass to moonshine-server | + +#### `sd-cpp` — StableDiffusion.cpp + +| Option | CLI flag | Type | Default | Description | +|--------|----------|------|---------|-------------| +| `sd-cpp_backend` | `--sdcpp` | BACKEND | "" | SD.cpp backend to use | +| `sdcpp_args` | `--sdcpp-args` | ARGS | "" | Custom arguments to pass to sd-server (must not conflict with managed args) | +| `steps` | — | SIZE | 20 | Number of diffusion steps | +| `cfg_scale` | — | SIZE | 7.0 | Classifier-free guidance scale | +| `width` | — | SIZE | 512 | Output image width | +| `height` | — | SIZE | 512 | Output image height | +| `sampling_method` | — | ARGS | "" | Sampling method | +| `flow_shift` | — | SIZE | 0.0 | Flow shift | + +#### 
`vllm` — vLLM ROCm (experimental) + +| Option | CLI flag | Type | Default | Description | +|--------|----------|------|---------|-------------| +| `ctx_size` | `--ctx-size` | SIZE | -1 | Context size for the model | +| `vllm_backend` | `--vllm` | BACKEND | "" | vLLM backend to use | +| `vllm_args` | `--vllm-args` | ARGS | "" | Custom arguments to pass to vllm-server | + +#### `whispercpp` — Whisper.cpp + +| Option | CLI flag | Type | Default | Description | +|--------|----------|------|---------|-------------| +| `whispercpp_backend` | `--whispercpp` | BACKEND | "" | WhisperCpp backend to use | +| `whispercpp_args` | `--whispercpp-args` | ARGS | "" | Custom arguments to pass to whisper-server | + + +## Models + + +#### `collection.omni` — collection.omni (4 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `LMX-Omni-5.5B-Lite` | 9.3 | — | +| `LMX-Omni-52B-Halo` | 44.77 | — | +| `Lite Collection` | | — | +| `Ultra Collection` | | — | + +#### `kokoro` — Kokoro (1 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `kokoro-v1` | 0.354 | tts | + +#### `llamacpp` — Llama.cpp GPU (77 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `Bonsai-1.7B-gguf` | 0.25 | llamacpp | +| `Bonsai-4B-gguf` | 0.572 | llamacpp | +| `Bonsai-8B-gguf` | 1.16 | llamacpp | +| `Cogito-v2-llama-109B-MoE-GGUF` | 65.4 | vision | +| `DeepSeek-Qwen3-8B-GGUF` | 5.25 | reasoning | +| `Devstral-Small-2507-GGUF` | 14.3 | coding, tool-calling | +| `GLM-4.5-Air-UD-Q4K-XL-GGUF` | 67.7 | reasoning | +| `GLM-4.7-Flash-GGUF` | 17.5 | tool-calling | +| `Gemma-3-4b-it-GGUF` | 3.34 | vision | +| `Gemma-4-12B-it-GGUF` | 7.29 | tool-calling, vision, llamacpp | +| `Gemma-4-12B-it-MTP-GGUF` | 7.75 | tool-calling, llamacpp, vision, mtp | +| `Gemma-4-26B-A4B-it-GGUF` | 18.1 | hot, tool-calling, vision, llamacpp | +| `Gemma-4-26B-A4B-it-MTP-GGUF` | 18.5 | hot, tool-calling, vision, llamacpp, mtp | +| `Gemma-4-31B-it-GGUF` | 19.5 | hot, 
tool-calling, vision, llamacpp | +| `Gemma-4-31B-it-MTP-GGUF` | 20.0 | hot, tool-calling, vision, llamacpp, mtp | +| `Gemma-4-E2B-it-GGUF` | 4.09 | tool-calling, vision, llamacpp | +| `Gemma-4-E4B-it-GGUF` | 5.97 | tool-calling, vision, llamacpp | +| `Jan-nano-128k-GGUF` | 2.5 | — | +| `Jan-v1-4B-GGUF` | 2.5 | — | +| `LFM2-1.2B-GGUF` | 0.731 | — | +| `LFM2-24B-A2B-GGUF` | 14.4 | — | +| `LFM2-8B-A1B-GGUF` | 5.04 | — | +| `LFM2.5-1.2B-Instruct-GGUF` | 0.731 | — | +| `LFM2.5-8B-A1B` | 5.16 | — | +| `Llama-3.2-1B-Instruct-GGUF` | 0.834 | — | +| `Llama-3.2-3B-Instruct-GGUF` | 2.06 | — | +| `Llama-4-Scout-17B-16E-Instruct-GGUF` | 63.2 | vision | +| `Ministral-3-3B-Instruct-2512-GGUF` | 2.99 | vision | +| `Nemotron-3-Nano-30B-A3B-GGUF` | 22.8 | — | +| `Phi-4-mini-instruct-GGUF` | 2.49 | — | +| `Playable1-GGUF` | 4.68 | coding | +| `PromptBridge-0.6b-Alpha-GGUF` | 0.397 | — | +| `Qwen2.5-Coder-32B-Instruct-GGUF` | 19.9 | coding | +| `Qwen2.5-Omni-3B-GGUF` | 4.73 | vision, chat-transcription | +| `Qwen2.5-Omni-7B-GGUF` | 7.33 | vision, chat-transcription | +| `Qwen2.5-VL-3B-Instruct-GGUF` | 3.27 | vision | +| `Qwen2.5-VL-7B-Instruct-GGUF` | 6.04 | vision | +| `Qwen3-0.6B-GGUF` | 0.38 | reasoning | +| `Qwen3-1.7B-GGUF` | 1.06 | reasoning | +| `Qwen3-14B-GGUF` | 8.54 | reasoning | +| `Qwen3-30B-A3B-GGUF` | 17.4 | reasoning | +| `Qwen3-30B-A3B-Instruct-2507-GGUF` | 17.4 | tool-calling | +| `Qwen3-4B-GGUF` | 2.38 | reasoning | +| `Qwen3-4B-Instruct-2507-GGUF` | 2.5 | tool-calling | +| `Qwen3-8B-GGUF` | 5.25 | reasoning | +| `Qwen3-Coder-30B-A3B-Instruct-GGUF` | 18.6 | coding, tool-calling, hot | +| `Qwen3-Coder-Next-GGUF` | 48.0 | coding, tool-calling, hot | +| `Qwen3-Embedding-0.6B-GGUF` | 0.64 | embeddings | +| `Qwen3-Embedding-4B-GGUF` | 4.28 | embeddings | +| `Qwen3-Embedding-8B-GGUF` | 8.05 | embeddings | +| `Qwen3-Next-80B-A3B-Instruct-GGUF` | 46.1 | tool-calling | +| `Qwen3-VL-4B-Instruct-GGUF` | 3.33 | vision | +| `Qwen3-VL-8B-Instruct-GGUF` | 6.19 | vision | +| 
`Qwen3.5-0.8B-GGUF` | 0.764 | vision, tool-calling | +| `Qwen3.5-122B-A10B-GGUF` | 77.9 | vision, tool-calling | +| `Qwen3.5-122B-A10B-MTP-GGUF` | 79.6 | vision, tool-calling, mtp | +| `Qwen3.5-27B-GGUF` | 18.5 | vision, tool-calling | +| `Qwen3.5-2B-GGUF` | 2.01 | vision, tool-calling | +| `Qwen3.5-35B-A3B-GGUF` | 23.1 | vision, tool-calling | +| `Qwen3.5-4B-GGUF` | 3.58 | vision, tool-calling, hot | +| `Qwen3.5-4B-MTP-GGUF` | 3.66 | vision, tool-calling, mtp | +| `Qwen3.5-9B-GGUF` | 6.88 | vision, tool-calling | +| `Qwen3.6-27B-GGUF` | 18.5 | vision, tool-calling | +| `Qwen3.6-27B-MTP-GGUF` | 18.8 | vision, tool-calling, mtp, hot | +| `Qwen3.6-35B-A3B-GGUF` | 23.3 | vision, tool-calling, hot | +| `Qwen3.6-35B-A3B-MTP-GGUF` | 23.8 | vision, tool-calling, mtp | +| `SmolLM3-3B-GGUF` | 1.94 | — | +| `Tiny-Test-Model-GGUF` | 0.18 | — | +| `bge-reranker-v2-m3-GGUF` | 0.636 | reranking | +| `gpt-oss-120b-GGUF` | 62.8 | reasoning, tool-calling | +| `gpt-oss-120b-mxfp-GGUF` | 63.4 | hot, reasoning, tool-calling | +| `gpt-oss-20b-GGUF` | 11.6 | reasoning, tool-calling | +| `gpt-oss-20b-mxfp4-GGUF` | 12.1 | hot, reasoning, tool-calling | +| `granite-4.0-h-tiny-GGUF` | 4.25 | tool-calling | +| `jina-reranker-v1-tiny-en-GGUF` | 0.0367 | reranking | +| `nomic-embed-text-v1-GGUF` | 0.0781 | embeddings | +| `nomic-embed-text-v2-moe-GGUF` | 0.51 | embeddings | + +#### `moonshine` — Moonshine (3 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `Moonshine-Medium-Streaming` | 1.08 | transcription, realtime-transcription, hot | +| `Moonshine-Small-Streaming` | 0.431 | transcription, realtime-transcription | +| `Moonshine-Tiny-Streaming` | 0.202 | transcription, realtime-transcription | + +#### `ryzenai-llm` — Ryzen AI LLM (79 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `AMD-OLMo-1B-SFT-DPO-Hybrid` | 1.48 | — | +| `CodeLlama-7b-Instruct-hf-Hybrid` | 7.24 | coding | +| `CodeLlama-7b-Instruct-hf-NPU` | 7.54 | coding | +| 
`DeepSeek-R1-Distill-Llama-8B-CPU` | 6.2 | reasoning | +| `DeepSeek-R1-Distill-Llama-8B-Hybrid` | 9.09 | reasoning | +| `DeepSeek-R1-Distill-Llama-8B-NPU` | 9.3 | reasoning | +| `DeepSeek-R1-Distill-Qwen-1.5B-Hybrid` | 2.19 | reasoning | +| `DeepSeek-R1-Distill-Qwen-1.5B-NPU` | 2.3 | reasoning | +| `DeepSeek-R1-Distill-Qwen-7B-CPU` | 6.2 | reasoning | +| `DeepSeek-R1-Distill-Qwen-7B-Hybrid` | 8.67 | reasoning | +| `DeepSeek-R1-Distill-Qwen-7B-NPU` | 8.87 | reasoning | +| `Gemma-3-4b-it-mm-NPU` | 6.68 | vision | +| `Llama-2-7b-chat-hf-Hybrid` | 7.31 | — | +| `Llama-2-7b-chat-hf-NPU` | 7.47 | — | +| `Llama-2-7b-hf-Hybrid` | 7.31 | — | +| `Llama-2-7b-hf-NPU` | 7.47 | — | +| `Llama-3.1-8B-Hybrid` | 9.09 | — | +| `Llama-3.1-8B-NPU` | 9.3 | — | +| `Llama-3.2-1B-Hybrid` | 1.89 | — | +| `Llama-3.2-1B-Instruct-CPU` | 1.76 | — | +| `Llama-3.2-1B-Instruct-Hybrid` | 1.89 | — | +| `Llama-3.2-1B-Instruct-NPU` | 1.96 | — | +| `Llama-3.2-1B-NPU` | 1.96 | — | +| `Llama-3.2-3B-Hybrid` | 4.28 | — | +| `Llama-3.2-3B-Instruct-CPU` | 3.38 | — | +| `Llama-3.2-3B-Instruct-Hybrid` | 4.28 | — | +| `Meta-Llama-3-8B-Hybrid` | 9.06 | — | +| `Meta-Llama-3-8B-NPU` | 9.23 | — | +| `Meta-Llama-3.1-8B-Instruct-Hybrid` | 9.09 | — | +| `Meta-Llama-3.1-8B-Instruct-NPU` | 9.3 | — | +| `Mistral-7B-Instruct-v0.1-Hybrid` | 7.84 | — | +| `Mistral-7B-Instruct-v0.1-NPU` | 8.01 | — | +| `Mistral-7B-Instruct-v0.2-Hybrid` | 7.84 | — | +| `Mistral-7B-Instruct-v0.2-NPU` | 8.01 | — | +| `Mistral-7B-Instruct-v0.3-Hybrid` | 7.85 | — | +| `Mistral-7B-Instruct-v0.3-NPU` | 8.09 | — | +| `Mistral-7B-v0.3-Hybrid` | 7.85 | — | +| `Mistral-7B-v0.3-NPU` | 8.09 | — | +| `Phi-3-Mini-Instruct-CPU` | 2.39 | — | +| `Phi-3-mini-128k-instruct-Hybrid` | 4.21 | — | +| `Phi-3-mini-128k-instruct-NPU` | 4.35 | — | +| `Phi-3-mini-4k-instruct-Hybrid` | 4.19 | — | +| `Phi-3-mini-4k-instruct-NPU` | 4.3 | — | +| `Phi-3.5-mini-instruct-Hybrid` | 4.21 | — | +| `Phi-3.5-mini-instruct-NPU` | 4.35 | — | +| `Phi-4-mini-instruct-Hybrid` | 5.47 | — 
| +| `Phi-4-mini-instruct-NPU` | 5.59 | — | +| `Phi-4-mini-reasoning-Hybrid` | 5.47 | reasoning | +| `Qwen-1.5-7B-Chat-CPU` | 6.32 | — | +| `Qwen-2.5-1.5B-Instruct-Hybrid` | 2.17 | — | +| `Qwen-2.5-1.5B-Instruct-NPU` | 2.25 | — | +| `Qwen1.5-7B-Chat-Hybrid` | 8.83 | — | +| `Qwen1.5-7B-Chat-NPU` | 9.02 | — | +| `Qwen2-1.5B-Hybrid` | 2.19 | — | +| `Qwen2-1.5B-NPU` | 2.3 | — | +| `Qwen2-7B-Hybrid` | 8.68 | — | +| `Qwen2-7B-NPU` | 8.88 | — | +| `Qwen2.5-0.5B-Instruct-CPU` | 0.834 | — | +| `Qwen2.5-0.5B-Instruct-Hybrid` | 0.828 | — | +| `Qwen2.5-14B-instruct-Hybrid` | 16.5 | — | +| `Qwen2.5-3B-Instruct-Hybrid` | 3.97 | — | +| `Qwen2.5-3B-Instruct-NPU` | 4.1 | — | +| `Qwen2.5-7B-Instruct-Hybrid` | 8.65 | — | +| `Qwen2.5-7B-Instruct-NPU` | 8.83 | — | +| `Qwen2.5-Coder-0.5B-Instruct-Hybrid` | 0.828 | coding | +| `Qwen2.5-Coder-1.5B-Instruct-Hybrid` | 2.17 | coding | +| `Qwen2.5-Coder-1.5B-Instruct-NPU` | 2.25 | coding | +| `Qwen2.5-Coder-7B-Instruct-Hybrid` | 8.65 | coding | +| `Qwen2.5-Coder-7B-Instruct-NPU` | 8.83 | coding | +| `Qwen3-1.7B-Hybrid` | 2.55 | reasoning | +| `Qwen3-14B-Hybrid` | 16.5 | reasoning | +| `Qwen3-4B-Hybrid` | 5.17 | reasoning | +| `Qwen3-8B-Hybrid` | 9.42 | reasoning | +| `SmolLM-135M-Instruct-Hybrid` | 0.232 | — | +| `SmolLM2-135M-Instruct-Hybrid` | 0.233 | — | +| `chatglm3-6b-Hybrid` | 6.9 | — | +| `chatglm3-6b-NPU` | 7.04 | — | +| `gemma-2-2b-Hybrid` | 4.04 | — | +| `gpt-oss-20b-NPU` | 13.4 | — | + +#### `sd-cpp` — StableDiffusion.cpp (12 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `Flux-2-Klein-4B` | 16.1 | image, edit | +| `Flux-2-Klein-9B-GGUF` | 19.0 | image, edit | +| `Qwen-Image-2512-GGUF` | 19.4 | image | +| `Qwen-Image-GGUF` | 18.2 | image | +| `RealESRGAN-x4plus` | 0.064 | upscaling, image | +| `RealESRGAN-x4plus-anime` | 0.017 | upscaling, image | +| `SD-1.5` | 7.7 | image | +| `SD-Turbo` | 5.21 | image | +| `SD-Turbo-GGUF` | 2.02 | image | +| `SDXL-Base-1.0` | 6.94 | image | +| `SDXL-Turbo` | 6.94 | 
image | +| `Z-Image-Turbo` | 20.7 | image | + +#### `vllm` — vLLM ROCm (experimental) (4 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `Qwen3.5-0.8B-FP16-vLLM` | 1.77 | reasoning | +| `Qwen3.5-2B-FP16-vLLM` | 4.57 | reasoning, tool-calling | +| `Qwen3.5-4B-FP16-vLLM` | 9.34 | reasoning, hot, tool-calling | +| `Qwen3.5-9B-FP16-vLLM` | 19.3 | reasoning, tool-calling | + +#### `whispercpp` — Whisper.cpp (6 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `Whisper-Base` | 0.148 | transcription, realtime-transcription | +| `Whisper-Large-v3` | 3.1 | transcription, realtime-transcription | +| `Whisper-Large-v3-Turbo` | 1.62 | transcription, realtime-transcription, hot | +| `Whisper-Medium` | 1.53 | transcription, realtime-transcription | +| `Whisper-Small` | 0.488 | transcription, realtime-transcription | +| `Whisper-Tiny` | 0.075 | transcription, realtime-transcription | + diff --git a/docs/dev/contribute.md b/docs/dev/contribute.md index 766744ea8..5835b1fae 100644 --- a/docs/dev/contribute.md +++ b/docs/dev/contribute.md @@ -23,6 +23,10 @@ Lemonade's roadmap is defined by a set of [working groups](./working-groups/READ Not sure what to work on? Come to the feature-requests and troubleshooting channels on the Discord and see what people need! +### Adding a Backend + +Inference backends are self-describing: a backend is a descriptor (plain data) plus a server class, and everything else (router, CLI, `/system-info`, docs) is derived from it. See [Adding a backend](./adding-a-backend.md) for the full contract and a minimal example. + ### Issues Issues are a great way to document a bug or feature request. However, Lemonade is a community-driven project and you still need to find someone to implement your issue. It is highly recommended that you bring your issue to the [Lemonade discord community](https://discord.gg/5xXzkMu8Zk) and connect with a contributor who wants to implement it. 
diff --git a/docs/dev/getting-started.md b/docs/dev/getting-started.md index b8e487c4c..ef1769059 100644 --- a/docs/dev/getting-started.md +++ b/docs/dev/getting-started.md @@ -625,6 +625,7 @@ Internal endpoints accept connections from any address, so first-party clients o | `POST` | `/internal/shutdown` | Unloads all models and shuts down the server | | `POST` | `/internal/set` | Unified config setter (see below) | | `GET` | `/internal/config` | Returns the full runtime config snapshot | +| `GET` | `/internal/config/defaults` | Returns the canonical default config (factory defaults) | | `POST` | `/internal/cleanup-cache` | Cleans up orphaned files in the Hugging Face cache | | `POST` | `/internal/pin` | Pin or unpin a loaded model | @@ -676,6 +677,15 @@ Returns the full runtime configuration as a flat JSON object containing all serv curl http://localhost:13305/internal/config ``` +#### `GET /internal/config/defaults` + +Returns the canonical default configuration — the values a brand-new `config.json` is seeded with, independent of this instance's current config or deployment overrides. The per-recipe sections come from the backend descriptors (each descriptor's `config_defaults()`), making this the authoritative source of the factory defaults. `docs/tools/gen_backend_boilerplate.py` reads this endpoint to regenerate the committed `src/cpp/resources/defaults.json`, and a CI `--check` fails if that file drifts from the descriptors. 
+ +**Example:** +```bash +curl http://localhost:13305/internal/config/defaults +``` + ### Dependencies All dependencies are automatically fetched by CMake via FetchContent: diff --git a/docs/embeddable/runtime.md b/docs/embeddable/runtime.md index a50b8c4af..983038e95 100644 --- a/docs/embeddable/runtime.md +++ b/docs/embeddable/runtime.md @@ -114,6 +114,7 @@ Your app can manage its `lemond` instance at runtime by using `/internal` endpoi |--------|------|-------------| | `POST` | `/internal/set` | Unified config setter (see below) | | `GET` | `/internal/config` | Returns the full runtime config snapshot | +| `GET` | `/internal/config/defaults` | Returns the canonical default config (factory defaults) | | `POST` | `/internal/pin` | Pin or unpin a loaded model (prevents auto-eviction) | The settings defined in `config.json` can all be changed at runtime without restarting `lemond` with the `/internal/set` endpoint. See the [Configuration Guide](../guide/configuration/README.md) for details on all settings. @@ -137,6 +138,23 @@ Returns the full runtime configuration as a flat JSON object containing all serv curl http://localhost:8000/internal/config ``` +#### `GET /internal/config/defaults` + +Returns the canonical default configuration — the values a brand-new `config.json` is seeded with, independent of this instance's current config or any deployment override. The per-recipe sections are derived from the backend descriptors, so this is the authoritative source for "what are the factory defaults." It is what `docs/tools/gen_backend_boilerplate.py` reads to regenerate `src/cpp/resources/defaults.json`. + +**Example:** +=== "Windows (cmd.exe)" + + ```cmd + curl http://localhost:8000/internal/config/defaults + ``` + +=== "Linux (bash)" + + ```bash + curl http://localhost:8000/internal/config/defaults + ``` + #### `POST /internal/set` Accepts a JSON object with one or more keys to update atomically. 
Returns `{"status":"success","updated":{...}}` on success, or `400` with an error message on validation failure. diff --git a/docs/guide/cli.md b/docs/guide/cli.md index 50d388bbb..53ed5f1f5 100644 --- a/docs/guide/cli.md +++ b/docs/guide/cli.md @@ -325,44 +325,56 @@ The following options apply to all model loads: The following options are available depending on the recipe being used: -#### Llama.cpp (`llamacpp` recipe) +<!-- BEGIN GENERATED: cli-recipe-options --> +#### Llama.cpp GPU (`llamacpp` recipe) | Option | Description | Default | |--------|-------------|---------| -| `--ctx-size SIZE` | Context size for the model | `4096` | +| `--ctx-size SIZE` | Context size for the model | auto | | `--llamacpp BACKEND` | LlamaCpp backend to use | Auto-detected | -| `--llamacpp-device DEVICE` | Comma-separated list of accelerator devices to use (e.g. Vulkan0) | (empty) | -| `--llamacpp-args ARGS` | Custom arguments to pass to llama-server (must not conflict with managed args) | `""` | +| `--llamacpp-device DEVICES` | Comma-separated list of accelerator devices to use (e.g.
Vulkan0) | `""` | +| `--llamacpp-args ARGS` | Custom arguments to pass to llama-server | `""` | -#### FLM (`flm` recipe) +#### Whisper.cpp (`whispercpp` recipe) | Option | Description | Default | |--------|-------------|---------| -| `--ctx-size SIZE` | Context size for the model | `4096` | +| `--whispercpp BACKEND` | WhisperCpp backend to use | Auto-detected | +| `--whispercpp-args ARGS` | Custom arguments to pass to whisper-server | `""` | -#### RyzenAI LLM (`ryzenai-llm` recipe) +#### Moonshine (`moonshine` recipe) | Option | Description | Default | |--------|-------------|---------| -| `--ctx-size SIZE` | Context size for the model | `4096` | +| `--moonshine-args ARGS` | Custom arguments to pass to moonshine-server | `""` | -#### SD.cpp (`sd-cpp` recipe) +#### StableDiffusion.cpp (`sd-cpp` recipe) | Option | Description | Default | |--------|-------------|---------| -| `--sdcpp BACKEND` | SD.cpp backend to use (`cpu` for CPU, `rocm` for AMD GPU) | Auto-detected | +| `--sdcpp BACKEND` | SD.cpp backend to use | Auto-detected | | `--sdcpp-args ARGS` | Custom arguments to pass to sd-server (must not conflict with managed args) | `""` | -| `--steps N` | Number of inference steps for image generation | `20` | -| `--cfg-scale SCALE` | Classifier-free guidance scale for image generation | `7.0` | -| `--width PX` | Image width in pixels | `512` | -| `--height PX` | Image height in pixels | `512` | -#### Whisper.cpp (`whispercpp` recipe) +#### FastFlowLM NPU (`flm` recipe) | Option | Description | Default | |--------|-------------|---------| -| `--whispercpp BACKEND` | WhisperCpp backend to use | Auto-detected | +| `--ctx-size SIZE` | Context size for the model | auto | + +#### Ryzen AI LLM (`ryzenai-llm` recipe) +| Option | Description | Default | +|--------|-------------|---------| +| `--ctx-size SIZE` | Context size for the model | auto | + +#### vLLM ROCm (experimental) (`vllm` recipe) + +| Option | Description | Default | +|--------|-------------|---------| +|
`--ctx-size SIZE` | Context size for the model | auto | +| `--vllm BACKEND` | vLLM backend to use | Auto-detected | +| `--vllm-args ARGS` | Custom arguments to pass to vllm-server | `""` | +<!-- END GENERATED: cli-recipe-options --> **Notes:** - Unspecified options will use the backend's default values - Backend options (`--llamacpp`, `--sdcpp`, `--whispercpp`) are auto-detected based on system capabilities diff --git a/docs/guide/configuration/README.md b/docs/guide/configuration/README.md index 93977148c..2a388dc8f 100644 --- a/docs/guide/configuration/README.md +++ b/docs/guide/configuration/README.md @@ -31,68 +31,81 @@ Values set in the user's `config.json` always take precedence over these seeded ### Example config.json +<!-- BEGIN GENERATED: config-example --> ```json { - "config_version": 1, - "port": 13305, - "host": "localhost", - "log_level": "info", - "global_timeout": 600, - "max_loaded_models": 1, - "no_broadcast": false, - "extra_models_dir": "", - "models_dir": "auto", + "cloud_providers": [], + "config_version": 2, "ctx_size": -1, - "offline": false, - "no_fetch_executables": false, "disable_model_filtering": false, "enable_dgpu_gtt": false, - "rocm_channel": "stable", + "extra_models_dir": "", + "flm": { + "args": "" + }, + "global_timeout": 600, + "host": "localhost", + "kokoro": { + "cpu_bin": "builtin" + }, "llamacpp": { - "backend": "auto", "args": "", - "vulkan_args": "", - "rocm_args": "", + "backend": "auto", "cpu_args": "", - "device": "", - "prefer_system": false, + "cpu_bin": "builtin", + "cuda_bin": "builtin", + "prefer_system": true, + "rocm_args": "", "rocm_bin": "builtin", - "vulkan_bin": "builtin", - "cpu_bin": "builtin" + "vulkan_args": "", + "vulkan_bin": "builtin" }, - "whispercpp": { - "backend": "auto", + "log_level": "info", + "max_loaded_models": 1, + "models_dir": "auto", + "moonshine": { "args": "", "cpu_args": "", - "npu_args": "", - "cpu_bin": "builtin", - "npu_bin": "builtin" + "cpu_bin": "builtin" + }, + "no_broadcast": false, + 
"no_fetch_executables": false, + "offline": false, + "port": 13305, +
"rocm_channel": "stable", + "ryzenai": { + "server_bin": "builtin" }, "sdcpp": { - "backend": "auto", "args": "", - "cpu_args": "", - "rocm_args": "", - "vulkan_args": "", - "steps": 20, + "backend": "auto", "cfg_scale": 7.0, - "width": 512, - "height": 512, + "cpu_args": "", "cpu_bin": "builtin", + "height": 512, + "rocm_args": "", "rocm_bin": "builtin", - "vulkan_bin": "builtin" + "steps": 20, + "vulkan_args": "", + "vulkan_bin": "builtin", + "width": 512 }, - "flm": { + "vllm": { "args": "", + "backend": "auto" }, - "ryzenai": { - "server_bin": "builtin" - }, - "kokoro": { - "cpu_bin": "builtin" + "websocket_port": "auto", + "whispercpp": { + "args": "", + "backend": "auto", + "cpu_args": "", + "cpu_bin": "builtin", + "npu_args": "", + "npu_bin": "builtin" } } ``` +<!-- END GENERATED: config-example --> ### Settings Reference diff --git a/docs/guide/configuration/custom-models.md b/docs/guide/configuration/custom-models.md index c3e770442..5a7dbd878 100644 --- a/docs/guide/configuration/custom-models.md +++ b/docs/guide/configuration/custom-models.md @@ -71,7 +71,7 @@ Supported registration flags: | Flag | Description | |------|-------------| | `--checkpoint TYPE CHECKPOINT` | Add a checkpoint entry. Repeat for multi-file models such as `main` + `mmproj` or `main` + `vae`. | -| `--recipe RECIPE` | Recipe to associate with the new `user.*` model. Common values: `llamacpp`, `flm`, `ryzenai-llm`, `vllm`, `whispercpp`, `moonshine`, `sd-cpp`, `kokoro`, `collection.omni`. | +| `--recipe RECIPE` | Recipe to associate with the new `user.*` model. Common values: <!-- BEGIN GENERATED: recipe-values -->`llamacpp`, `whispercpp`, `moonshine`, `kokoro`, `sd-cpp`, `flm`, `ryzenai-llm`, `vllm`, `collection.omni`<!-- END GENERATED: recipe-values -->. | | `--label LABEL` | Add a label to the new model. Repeatable. Valid labels include `coding`, `embeddings`, `hot`, `mtp`, `reasoning`, `reranking`, `tool-calling`, `vision`. 
| | `--components MODEL [MODEL ...]` | Components for an omni collection (see below). Use with `--recipe collection.omni`.
| diff --git a/docs/guide/configuration/multi-model.md b/docs/guide/configuration/multi-model.md index 30ed840d5..db9944ff9 100644 --- a/docs/guide/configuration/multi-model.md +++ b/docs/guide/configuration/multi-model.md @@ -22,7 +22,9 @@ Each type has its own independent LRU cache, all sharing the same slot limit set ## Device Constraints -- **NPU Exclusivity:** `flm`, `ryzenai-llm`, and `whispercpp` are mutually exclusive on the NPU. +<!-- BEGIN GENERATED: npu-exclusivity --> +- **NPU Exclusivity:** `whispercpp`, `flm`, and `ryzenai-llm` are mutually exclusive on the NPU. +<!-- END GENERATED: npu-exclusivity --> - Loading a model from one of these backends will automatically evict all NPU models from the other backends. - `flm` supports loading 1 ASR model, 1 LLM, and 1 embedding model on the NPU at the same time. - `ryzenai-llm` supports loading exactly 1 LLM, which uses the entire NPU. diff --git a/docs/tools/gen_backend_boilerplate.py b/docs/tools/gen_backend_boilerplate.py new file mode 100644 index 000000000..b4e8ac8d9 --- /dev/null +++ b/docs/tools/gen_backend_boilerplate.py @@ -0,0 +1,621 @@ +#!/usr/bin/env python3 +"""Generate backend boilerplate (docs + config defaults) from the descriptors. + +The C++ backend descriptors (src/cpp/include/lemon/backends/<stem>/<stem>.h) are +the single source of truth for what each backend is. This script boots a `lemond` +server and regenerates the committed artifacts that would otherwise be +hand-maintained: + + * Marker-delimited regions of the backend reference docs, from + ``/system-info`` ``recipes`` + ``server_models.json``. + * The whole of ``src/cpp/resources/defaults.json``, mirrored verbatim from + ``/internal/config/defaults`` (its per-recipe blocks come from each + descriptor's ``config_defaults()``). + +A CI step runs it with ``--check`` and fails if any committed artifact drifts. 
+ +Usage: + python docs/tools/gen_backend_boilerplate.py [--lemond PATH] [--check] + +``--check`` regenerates in memory and exits non-zero if any on-disk artifact +differs, without modifying it.
For the docs, only the regions between:: + + + + +are rewritten; surrounding prose is left untouched. +""" + +import argparse +import json +import re +import socket +import subprocess +import sys +import tempfile +import time +import urllib.request +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[2] +SERVER_MODELS = REPO_ROOT / "src" / "cpp" / "resources" / "server_models.json" +TARGET_DOC = REPO_ROOT / "docs" / "dev" / "backends-reference.md" + + +def free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + + +def find_lemond(explicit: str | None) -> Path: + if explicit: + p = Path(explicit) + if not p.exists(): + sys.exit(f"lemond not found at {p}") + return p + for candidate in [ + REPO_ROOT / "build" / "lemond", + REPO_ROOT / "build" / "lemond.exe", + ]: + if candidate.exists(): + return candidate + sys.exit("Could not find a built lemond (looked in build/). Pass --lemond PATH.") + + +class Lemond: + """Boots a throwaway lemond on a free port with an isolated cache dir.""" + + def __init__(self, binary: Path): + self.binary = binary + self.port = free_port() + self._cache = tempfile.TemporaryDirectory(prefix="lemond-docs-") + self._proc: subprocess.Popen | None = None + + def __enter__(self): + self._proc = subprocess.Popen( + [str(self.binary), self._cache.name, "--port", str(self.port)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + deadline = time.time() + 60 + while time.time() < deadline: + try: + self._get("/api/v1/health") + return self + except Exception: + if self._proc.poll() is not None: + sys.exit("lemond exited before becoming ready") + time.sleep(0.5) + self.__exit__(None, None, None) + sys.exit("lemond did not become ready within 60s") + + def __exit__(self, *exc): + if self._proc and self._proc.poll() is None: + try: + self._get("/internal/shutdown", timeout=2) + except Exception: + pass + try: + 
self._proc.wait(timeout=10) + except Exception: + self._proc.kill() + self._cache.cleanup() + + def _get(self, path: str, timeout: float = 5): + url = f"http://127.0.0.1:{self.port}{path}" + with urllib.request.urlopen(url, timeout=timeout) as r: + return r.read() + + def system_info(self) -> dict: + return json.loads(self._get("/api/v1/system-info", timeout=30)) + + def config(self) -> dict: + return json.loads(self._get("/internal/config", timeout=10)) + + def config_defaults_text(self) -> str: + # Verbatim text of the canonical default config (the server's own + # serialization) so the committed resources/defaults.json is byte-stable. + text = self._get("/internal/config/defaults", timeout=10).decode("utf-8") + return text if text.endswith("\n") else text + "\n" + + +def md_escape(text: str) -> str: + return str(text).replace("|", "\\|") + + +MODALITY_ORDER = [ + "Text generation", + "Speech-to-text", + "Text-to-speech", + "Image generation", +] +OS_LABEL = {"windows": "Windows", "linux": "Linux", "macos": "macOS"} +OS_ORDER = ["windows", "linux", "macos"] + + +def _fmt_os(os_set) -> str: + return ", ".join(OS_LABEL.get(o, o) for o in OS_ORDER if o in os_set) + + +def _code_devices(summary: str) -> str: + # Light formatting: render bare arch tokens as , matching the README style. + summary = re.sub(r"\bx86_64\b", "x86_64", summary) + summary = re.sub(r"\barm64\b", "arm64", summary) + return summary + + +def _ordered(recipes: dict) -> list: + # Recipes in descriptor registry order (stable, deterministic doc rendering). + return sorted(recipes.items(), key=lambda kv: kv[1].get("order", 999)) + + +def render_readme_matrix(recipes: dict) -> str: + # Group descriptor-backed recipes by modality, in descriptor registry order. 
+ by_mod: dict[str, list] = {m: [] for m in MODALITY_ORDER} + for recipe, info in _ordered(recipes): + mod = info.get("modality") + if not mod or mod not in by_mod: + continue + # Merge support rows sharing a (backend, device summary); union their OS. + merged: list[dict] = [] + seen: dict[tuple, dict] = {} + for row in info.get("support", []): + key = (row["backend"], row.get("device_summary", "")) + if key in seen: + seen[key]["os"] |= set(row.get("os", [])) + else: + d = { + "backend": row["backend"], + "summary": row.get("device_summary", ""), + "os": set(row.get("os", [])), + } + seen[key] = d + merged.append(d) + if merged: + by_mod[mod].append((recipe, info, merged)) + + out = [ + "", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + ] + for mod in MODALITY_ORDER: + recipes_in = by_mod[mod] + if not recipes_in: + continue + mod_span = sum(len(m) for _, _, m in recipes_in) + first_mod = True + for recipe, info, merged in recipes_in: + engine = f"{recipe}" + ( + " (experimental)" if info.get("experimental") else "" + ) + first_recipe = True + for d in merged: + out.append(" ") + if first_mod: + out.append( + f' ' + ) + first_mod = False + if first_recipe: + out.append(f' ') + first_recipe = False + out.append(f' ') + out.append(f" ") + out.append(f" ") + out.append(" ") + out += [" ", "
ModalityEngineBackendDeviceOS
{mod}{engine}{d["backend"]}{_code_devices(d['summary'])}{_fmt_os(d['os'])}
"] + return "\n".join(out) + + +def _cli_default(opt: dict) -> str: + d = opt.get("default") + if opt.get("type_name") == "BACKEND" and d == "": + return "Auto-detected" + if isinstance(d, str): + return '`""`' if d == "" else f"`{d}`" + if isinstance(d, bool): + return f"`{str(d).lower()}`" + if d == -1: + return "auto" + return f"`{d}`" + + +def render_cli_recipe_options(recipes: dict) -> str: + # Per-recipe load options, exactly as the CLI registers them from descriptors. + # Recipes with no CLI options (kokoro, cloud) are omitted. + blocks: list[str] = [] + for recipe, info in _ordered(recipes): + cli_opts = [o for o in info.get("options", []) if o.get("cli_flag")] + if not info.get("uses_ctx_size") and not cli_opts: + continue + blocks.append(f"#### {info.get('display_name', recipe)} (`{recipe}` recipe)\n") + blocks.append("| Option | Description | Default |") + blocks.append("|--------|-------------|---------|") + if info.get("uses_ctx_size"): + blocks.append("| `--ctx-size SIZE` | Context size for the model | auto |") + for o in cli_opts: + blocks.append( + "| `{flag} {t}` | {h} | {d} |".format( + flag=o["cli_flag"], + t=o.get("type_name", ""), + h=md_escape(o.get("help", "")), + d=_cli_default(o), + ) + ) + blocks.append("") + return "\n".join(blocks).rstrip() + + +def _oxford(items: list) -> str: + items = [f"`{i}`" for i in items] + if len(items) <= 1: + return "".join(items) + if len(items) == 2: + return f"{items[0]} and {items[1]}" + return ", ".join(items[:-1]) + f", and {items[-1]}" + + +def _js_to_title(recipe: str) -> str: + # Mirror models.js toTitle(): the website's fallback for unlisted display names. + return re.sub( + r"\b\w", + lambda m: m.group(0).upper(), + recipe.replace("_", " ").replace("-", " "), + ) + + +def _js_key(recipe: str) -> str: + # Bare identifier if it's a valid JS key, else quoted (matches models.js style). 
+ return recipe if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", recipe) else f"'{recipe}'" + + +def render_models_js(recipes: dict) -> str: + # RECIPE_PRIORITY: recipes with web_priority > 0, in that order (legacy oga-* + # recipes have no descriptor and are intentionally dropped). + prioritized = sorted( + (r for r, i in recipes.items() if i.get("web_priority", 0) > 0), + key=lambda r: recipes[r]["web_priority"], + ) + pri_lines = ",\n".join(f" '{r}'" for r in prioritized) + + # RECIPE_DISPLAY_NAMES: only recipes whose name differs from the JS toTitle() + # fallback (matching the curated map, which omits redundant entries). + name_lines = [] + for r, info in _ordered(recipes): + name = info.get("web_display_name") or info.get("display_name", r) + if name and name != _js_to_title(r): + name_lines.append(f" {_js_key(r)}: '{name}'") + names = ",\n".join(name_lines) + + return ( + f"const RECIPE_PRIORITY = [\n{pri_lines}\n];\n\n" + f"const RECIPE_DISPLAY_NAMES = {{\n{names}\n}};" + ) + + +def render_config_example(config: dict) -> str: + # The canonical config.json, straight from a fresh lemond's /internal/config. + # `port` is the only environment-dependent field (it reflects the launch port); + # normalize it to the documented default. + cfg = dict(config) + cfg["port"] = 13305 + return "```json\n" + json.dumps(cfg, indent=2) + "\n```" + + +def render_recipe_values(recipes: dict) -> str: + # Inline list of recipe values for `--recipe`, plus the collection orchestrator. + rs = [r for r, _ in _ordered(recipes)] + ["collection.omni"] + return ", ".join(f"`{r}`" for r in rs) + + +def render_npu_exclusivity(recipes: dict) -> str: + npu = [ + r + for r, info in _ordered(recipes) + if any( + row.get("backend") == "npu" + or any(d.get("device") == "amd_npu" for d in row.get("devices", [])) + for row in info.get("support", []) + ) + ] + return f"- **NPU Exclusivity:** {_oxford(npu)} are mutually exclusive on the NPU." 
+ + +def render_overview(recipes: dict) -> str: + rows = [ + "| Recipe | Name | Selectable backend | Uses ctx_size | Backends |", + "|--------|------|--------------------|---------------|----------|", + ] + for recipe in sorted(recipes): + info = recipes[recipe] + if "display_name" not in info: + continue # not a descriptor-backed recipe on this run + backends = sorted({b["backend"] for b in info.get("support", [])}) or sorted( + info.get("backends", {}) + ) + rows.append( + "| `{r}` | {n} | {s} | {c} | {b} |".format( + r=recipe, + n=md_escape(info.get("display_name", "")), + s="yes" if info.get("selectable_backend") else "no", + c="yes" if info.get("uses_ctx_size") else "no", + b=", ".join(backends) if backends else "—", + ) + ) + return "\n".join(rows) + + +def render_support_matrix(recipes: dict) -> str: + rows = [ + "| Recipe | Backend | OS | Device families |", + "|--------|---------|----|-----------------|", + ] + for recipe in sorted(recipes): + info = recipes[recipe] + for row in info.get("support", []): + fams = [] + for d in row.get("devices", []): + f = d.get("families") or [] + fams.append(d["device"] + (f" ({', '.join(f)})" if f else "")) + rows.append( + "| `{r}` | {b} | {o} | {d} |".format( + r=recipe, + b=row.get("backend", ""), + o=", ".join(sorted(row.get("os", []))), + d=md_escape("; ".join(fams)) if fams else "—", + ) + ) + return "\n".join(rows) + + +def render_options(recipes: dict) -> str: + blocks = [] + for recipe in sorted(recipes): + info = recipes[recipe] + opts = info.get("options") + if not opts: + continue + blocks.append(f"#### `{recipe}` — {info.get('display_name', recipe)}\n") + blocks.append("| Option | CLI flag | Type | Default | Description |") + blocks.append("|--------|----------|------|---------|-------------|") + if info.get("uses_ctx_size"): + blocks.append( + "| `ctx_size` | `--ctx-size` | SIZE | -1 | Context size for the model |" + ) + for o in opts: + blocks.append( + "| `{n}` | {f} | {t} | {d} | {h} |".format( + 
n=o["name"], + f=f"`{o['cli_flag']}`" if o.get("cli_flag") else "—", + t=o.get("type_name", ""), + d=md_escape( + json.dumps(o.get("default")) + if not isinstance(o.get("default"), str) + else o.get("default") or '""' + ), + h=md_escape(o.get("help", "")), + ) + ) + blocks.append("") + return "\n".join(blocks).rstrip() + + +def render_models(recipes: dict) -> str: + models = json.loads(SERVER_MODELS.read_text()) + by_recipe: dict[str, list] = {} + for name, data in models.items(): + if not isinstance(data, dict): + continue + by_recipe.setdefault(data.get("recipe", "(unspecified)"), []).append( + (name, data) + ) + blocks = [] + for recipe in sorted(by_recipe): + entries = sorted(by_recipe[recipe]) + display = recipes.get(recipe, {}).get("display_name", recipe) + blocks.append(f"#### `{recipe}` — {display} ({len(entries)} models)\n") + blocks.append("| Model | Size (GB) | Labels |") + blocks.append("|-------|-----------|--------|") + for name, data in entries: + blocks.append( + "| `{n}` | {s} | {l} |".format( + n=md_escape(name), + s=data.get("size", ""), + l=md_escape(", ".join(data.get("labels", []))) or "—", + ) + ) + blocks.append("") + return "\n".join(blocks).rstrip() + + +DEFAULT_TEMPLATE = """# Backend reference + + + +## Backends + + + + +## Support matrix + + + + +## Recipe options + + + + +## Models + + + +""" + + +def apply_sections(text: str, sections: dict[str, str]) -> str: + for marker_id, body in sections.items(): + # Accept HTML (``) markers for Markdown and block (`/* ... */`) + # markers for code files like .js, so the same generator drives both. + mid = re.escape(marker_id) + begin = ( + r"(|/\* BEGIN GENERATED: " + + mid + + r" \*/)" + ) + end = ( + r"(|/\* END GENERATED: " + + mid + + r" \*/)" + ) + pattern = re.compile(begin + r".*?" + end, re.DOTALL) + m = pattern.search(text) + if not m: + sys.exit(f"Marker region '{marker_id}' not found in target doc") + + # Inline regions (markers mid-line, e.g. 
inside a table cell) get no + # surrounding newlines; block regions are wrapped on their own lines. + inline = m.start() > 0 and text[m.start() - 1] != "\n" + # Escape backslashes and group-ref markers in the body for re.sub. + safe_body = body.replace("\\", "\\\\") + sep = "" if inline else "\n" + replacement = r"\1" + sep + safe_body + sep + r"\2" + text = pattern.sub(replacement, text) + return text + + +def main() -> int: + ap = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + ap.add_argument("--lemond", help="Path to the built lemond binary") + ap.add_argument( + "--check", action="store_true", help="Fail if docs are stale; do not write" + ) + args = ap.parse_args() + + binary = find_lemond(args.lemond) + with Lemond(binary) as server: + info = server.system_info() + config = server.config() + defaults_text = server.config_defaults_text() + recipes = info.get("recipes", {}) + if not recipes: + sys.exit("/system-info returned no recipes") + if not config: + sys.exit("/internal/config returned nothing") + + # Each target doc maps marker IDs -> generated content. backends-reference.md + # is created from a template if missing; the others must already contain their + # markers (the regions were added to the curated docs by hand once). 
+ targets: dict = { + TARGET_DOC: { + "sections": { + "backends-overview": render_overview(recipes), + "backends-matrix": render_support_matrix(recipes), + "backend-options": render_options(recipes), + "backend-models": render_models(recipes), + }, + "template": DEFAULT_TEMPLATE, + }, + REPO_ROOT + / "README.md": { + "sections": {"backends-matrix": render_readme_matrix(recipes)}, + }, + REPO_ROOT + / "docs" + / "guide" + / "configuration" + / "multi-model.md": { + "sections": {"npu-exclusivity": render_npu_exclusivity(recipes)}, + }, + REPO_ROOT + / "docs" + / "guide" + / "cli.md": { + "sections": {"cli-recipe-options": render_cli_recipe_options(recipes)}, + }, + REPO_ROOT + / "docs" + / "guide" + / "configuration" + / "custom-models.md": { + "sections": {"recipe-values": render_recipe_values(recipes)}, + }, + REPO_ROOT + / "docs" + / "guide" + / "configuration" + / "README.md": { + "sections": {"config-example": render_config_example(config)}, + }, + REPO_ROOT + / "docs" + / "assets" + / "models.js": { + "sections": {"models-js-recipes": render_models_js(recipes)}, + }, + } + + # Whole-file generated artifacts (not marker-delimited): resources/defaults.json + # is the canonical default config, mirrored verbatim from GET + # /internal/config/defaults (per-recipe blocks come from the descriptors). 
+ raw_targets: dict = { + REPO_ROOT / "src" / "cpp" / "resources" / "defaults.json": defaults_text, + } + + stale = [] + for path, content in raw_targets.items(): + rel = path.relative_to(REPO_ROOT) + if args.check: + if not path.exists() or path.read_text() != content: + stale.append(str(rel)) + else: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content) + print(f"Wrote {rel}") + + for path, spec in targets.items(): + rel = path.relative_to(REPO_ROOT) + current = path.read_text() if path.exists() else spec.get("template", "") + if not current: + sys.exit(f"{rel} is missing and has no template") + updated = apply_sections(current, spec["sections"]) + if args.check: + if not path.exists() or path.read_text() != updated: + stale.append(str(rel)) + else: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(updated) + print(f"Wrote {rel}") + + if args.check: + if stale: + sys.exit( + "Stale generated files: " + + ", ".join(stale) + + "\nRun: python docs/tools/gen_backend_boilerplate.py" + ) + print("All generated files are up to date.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/mkdocs.yml b/mkdocs.yml index 18201bba3..73ecc9981 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -83,6 +83,8 @@ nav: - Contribute: dev/contribute.md - Documentation Guide: dev/documentation.md - C++: dev/getting-started.md + - Adding a Backend: dev/adding-a-backend.md + - Backends Reference: dev/backends-reference.md - Desktop App: dev/app.md - Web UI: dev/web-ui.md - Lemonade Omni Models: dev/lemonade-omni.md diff --git a/src/app/src/renderer/utils/recipeNames.ts b/src/app/src/renderer/utils/recipeNames.ts index d654c635a..8f1fdbb1f 100644 --- a/src/app/src/renderer/utils/recipeNames.ts +++ b/src/app/src/renderer/utils/recipeNames.ts @@ -4,15 +4,28 @@ export const isCollectionRecipe = (recipe?: string): boolean => { return recipe === COLLECTION_OMNI_MODEL_RECIPE; }; +// Recipe display names. 
Hardware-backend names (llamacpp, whispercpp, sd-cpp, …) +// are populated at runtime from /system-info's `recipes[].display_name`, which is +// generated from the C++ backend descriptors — the single source of truth. Only +// recipes NOT surfaced by /system-info's hardware support matrix are seeded here: +// the collection orchestrator (not a backend) and cloud offload (a backend with +// no local support rows). export const RECIPE_DISPLAY_NAMES: Record = { [COLLECTION_OMNI_MODEL_RECIPE]: 'Lemonade', - 'flm': 'FastFlowLM NPU', - 'llamacpp': 'Llama.cpp GPU', - 'ryzenai-llm': 'Ryzen AI LLM', - 'whispercpp': 'Whisper.cpp', - 'moonshine': 'Moonshine', - 'sd-cpp': 'StableDiffusion.cpp', - 'kokoro': 'Kokoro', 'cloud': 'Cloud', - 'vllm': 'vLLM ROCm (experimental)', +}; + +// Merge display names from a /system-info `recipes` object into RECIPE_DISPLAY_NAMES. +// Called whenever system info is (re)fetched so the map reflects the descriptors. +export const updateRecipeDisplayNames = ( + recipes?: Record +): void => { + if (!recipes) { + return; + } + for (const [recipe, info] of Object.entries(recipes)) { + if (info && typeof info.display_name === 'string' && info.display_name) { + RECIPE_DISPLAY_NAMES[recipe] = info.display_name; + } + } }; diff --git a/src/app/src/renderer/utils/systemData.ts b/src/app/src/renderer/utils/systemData.ts index 63f1d9427..fcd3b8f92 100644 --- a/src/app/src/renderer/utils/systemData.ts +++ b/src/app/src/renderer/utils/systemData.ts @@ -39,8 +39,23 @@ export interface Recipes { [recipeName: string]: Recipe; } +// Per-recipe option schema, generated from the C++ backend descriptor. +export interface RecipeOptionSchema { + name: string; + cli_flag: string; + default: unknown; + type_name: string; + help: string; + group: string; +} + export interface Recipe { default_backend?: string; + // Descriptor metadata (generated from the C++ backend descriptors). 
+ display_name?: string; + selectable_backend?: boolean; + uses_ctx_size?: boolean; + options?: RecipeOptionSchema[]; backends: { [backendName: string]: BackendInfo; }; @@ -75,6 +90,11 @@ const fetchSystemInfoFromAPI = async (): Promise => { const data = await response.json(); const systemInfo: SystemInfo = { ...data }; + // Seed recipe display names from the descriptor-generated /system-info data + // so the UI doesn't hardcode per-recipe names. + const { updateRecipeDisplayNames } = await import('./recipeNames'); + updateRecipeDisplayNames(systemInfo.recipes); + return { info: systemInfo }; } catch (error) { console.error('Failed to fetch supported inference data from API:', error); diff --git a/src/cpp/cli/CMakeLists.txt b/src/cpp/cli/CMakeLists.txt index bd58c60ba..b6a0f26d6 100644 --- a/src/cpp/cli/CMakeLists.txt +++ b/src/cpp/cli/CMakeLists.txt @@ -97,6 +97,10 @@ set(COMMON_SOURCES agent_config_file.cpp opencode_profile.cpp pi_profile.cpp + # Self-describing backend descriptors (plain data; CLI-safe). Lets the CLI + # read recipe options/flags from descriptors without linking server classes. + # The matching factories (create()) are server-only and NOT listed here. + ${LEMON_BACKEND_DESCRIPTOR_SOURCES} ) # Add platform-specific sources diff --git a/src/cpp/cli/bench.cpp b/src/cpp/cli/bench.cpp index 6cf1b1a5b..280b26d33 100644 --- a/src/cpp/cli/bench.cpp +++ b/src/cpp/cli/bench.cpp @@ -1,5 +1,6 @@ #include "lemon_cli/bench.h" #include "lemon_cli/lemonade_client.h" +#include "lemon/backends/backend_descriptor_registry.h" #include #include #include @@ -406,9 +407,10 @@ bool load_model_for_backend(lemonade::LemonadeClient& client, request_body["model_name"] = model; request_body["save_options"] = false; - // For llamacpp recipe, pass backend override - if (recipe == "llamacpp") { - request_body["llamacpp_backend"] = backend; + // For recipes that expose a selectable backend, pass the override. 
+ if (const auto* desc = lemon::backends::descriptor_for(recipe); + desc && desc->selectable_backend) { + request_body[desc->effective_config_section() + "_backend"] = backend; } if (ctx_size > 0) { diff --git a/src/cpp/cli/hf_pull.cpp b/src/cpp/cli/hf_pull.cpp index 8ed30ca0a..f5a84c051 100644 --- a/src/cpp/cli/hf_pull.cpp +++ b/src/cpp/cli/hf_pull.cpp @@ -255,11 +255,12 @@ int hf_pull_flow(lemonade::LemonadeClient& client, const auto& variants = variants_response["variants"]; std::string recipe = variants_response.value("recipe", std::string("llamacpp")); + std::string repo_kind = variants_response.value("repo_kind", std::string("gguf")); - // Non-llamacpp recipes (currently: ONNX RyzenAI) ship as a single - // installable unit — no per-variant menu, no `:variant` checkpoint - // suffix, no `-VARIANT` model name tail. - if (recipe != "llamacpp") { + // Non-GGUF repos (currently: ONNX RyzenAI) ship as a single installable + // unit — no per-variant menu, no `:variant` checkpoint suffix, no + // `-VARIANT` model name tail. (Collections already returned above.) + if (repo_kind != "gguf") { if (!variant.empty()) { std::cerr << "warning: variant '" << variant << "' ignored for " << recipe << " checkpoints" << std::endl; diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h new file mode 100644 index 000000000..03ca71e69 --- /dev/null +++ b/src/cpp/include/lemon/backends/backend_descriptor.h @@ -0,0 +1,128 @@ +#pragma once + +#include +#include +#include +#include "lemon/model_types.h" +#include "lemon/recipe_backend_def.h" + +namespace lemon { + +// A single declarative configuration knob a backend exposes. The same list +// drives config.json defaults, CLI flag registration, and load-time option +// resolution, so they can never drift apart. +struct BackendOption { + std::string name; // option key, e.g. "vllm_args" + std::string cli_flag; // CLI flag, e.g.
"--vllm-args" ("" = not a CLI flag) + nlohmann::json default_value; // default value when the option is unset + std::string type_name; // "ARGS" | "SIZE" | "BACKEND" | "DEVICES" | "BOOL" + std::string help; // CLI help text + std::string group; // CLI help group, e.g. "General Options" +}; + +// How a backend shares the accelerator. Replaces the router's recipe-string +// checks for NPU exclusivity and LRU slot accounting. +enum class SlotPolicy { + Standard, // counts toward the LRU slots, no device exclusivity (llamacpp, sd-cpp) + ExclusiveNpu, // evict ALL npu servers before loading (ryzenai-llm, whispercpp-npu) + CoexistByType, // one per model type, evicts exclusive-npu peers (flm) + Unmetered // never counts toward slots, never auto-evicted (cloud) +}; + +// How an installed backend version is compared against the expected pin. +enum class VersionPolicy { + Exact, // installed must match the expected version + AtLeast // installed >= expected is acceptable (system-managed packages, e.g. flm) +}; + +inline const char* slot_policy_to_string(SlotPolicy p) { + switch (p) { + case SlotPolicy::Standard: return "standard"; + case SlotPolicy::ExclusiveNpu: return "exclusive_npu"; + case SlotPolicy::CoexistByType: return "coexist_by_type"; + case SlotPolicy::Unmetered: return "unmetered"; + } + return "standard"; // fallback for values outside the enumerators handled above +} + +// Plain data declaring *what a backend is*. This is the single object the +// registry, the CLI, /system-info, and the docs all read. Behavior lives in the +// paired WrappedServer subclass (see backend_registry.h for how they bind). +struct BackendDescriptor { + std::string recipe; // "vllm" + std::string display_name; // "vLLM ROCm (experimental)" + std::string binary; // subprocess to launch/install ("" = none, e.g.
cloud) + std::string config_section; // config.json section; defaults to recipe (sd-cpp -> "sdcpp") + + DeviceType default_device = DEVICE_GPU; // default; override effective_device() if variant-dependent + SlotPolicy slot_policy = SlotPolicy::Standard; // default; override effective_slot_policy() if variant-dependent + bool selectable_backend = false; // auto-creates "_backend" option + "--" flag + bool uses_ctx_size = false; // opt in to the shared ctx_size option + bool dynamic_models = false; // true = ops supply models at runtime (cloud, flm), not server_models.json + + std::vector options; // backend-specific knobs (common ones are automatic) + std::vector support; // which OS / GPU families it runs on ({} = no local gating) + std::vector default_labels; // labels injected when a model omits them + std::vector required_checkpoints{"main"}; // unconditional files; conditional ones checked in load() + + // Editorial metadata for the generated docs (README support matrix, website). + std::string modality; // "Text generation" | "Speech-to-text" | "Text-to-speech" | "Image generation" | "" (none; cloud leaves it empty) + bool experimental = false; // true renders "(experimental)" next to the recipe in generated docs + std::string web_display_name; // name used on the docs website ("" = fall back to display_name) + int web_priority = 0; // model-grouping order on the docs website (lower value = listed higher; 0 = unlisted) + + // ROCm release channels this backend publishes (e.g. {"stable","nightly"}). + // Empty = the backend has no ROCm channels (its "rocm" build is a single + // artifact). Drives the rocm-stable/rocm-nightly bin-key collapse and the + // channel clamp (a requested channel not listed here falls back to the first). + std::vector rocm_channels; + + // True if the backend's subprocess exposes a Prometheus /metrics endpoint + // that lemond should scrape and re-export (llama-server does).
+ bool exposes_prometheus_metrics = false; + + // True if this backend's ROCm build requires the gfx1151 (Strix Halo) kernel + // CWSR fix. Gates the availability/remediation check for the "rocm" backend. + bool rocm_requires_cwsr_fix = false; + + // How the installed version is compared against the expected pin. Exact by + // default; system-managed packages (flm) accept any version >= expected. + VersionPolicy version_policy = VersionPolicy::Exact; + + // True if the backend pulls its own models on demand (flm self-pulls via its + // CLI) rather than being pre-downloaded from Hugging Face by the router. Such + // backends are skipped by the load-time auto-download path. + bool self_manages_downloads = false; + + // --- config.json per-recipe defaults schema --- + // The backend's section of config.json is derived from these fields, so a new + // backend's defaults live in its descriptor instead of a hand-maintained + // defaults.json block. (selectable_backend additionally emits `backend: "auto"`.) + bool takes_args = false; // emits `args: ""` + std::vector arg_variants; // each emits `_args: ""` + std::vector bin_variants; // each emits `_bin: "builtin"` + nlohmann::json config_extra = nlohmann::json::object(); // fixed extras (e.g. prefer_system, image defaults) + + // The config.json section name for this backend, falling back to the recipe. + std::string effective_config_section() const { + return config_section.empty() ? recipe : config_section; + } + + // Build this backend's config.json default section from the schema above. + // Returns an empty object when the backend has no configurable section. 
+ nlohmann::json config_defaults() const { + nlohmann::json block = nlohmann::json::object(); + if (selectable_backend) block["backend"] = "auto"; + if (takes_args) block["args"] = ""; + for (const auto& v : arg_variants) block[v + "_args"] = ""; + for (const auto& v : bin_variants) block[v + "_bin"] = "builtin"; + if (config_extra.is_object()) { + for (auto it = config_extra.begin(); it != config_extra.end(); ++it) { + block[it.key()] = it.value(); + } + } + return block; + } +}; + +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/backend_descriptor_registry.h b/src/cpp/include/lemon/backends/backend_descriptor_registry.h new file mode 100644 index 000000000..44ec7e15d --- /dev/null +++ b/src/cpp/include/lemon/backends/backend_descriptor_registry.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { + +// Read-only view over every backend descriptor (plain data). This API is +// CLI-safe: it pulls in no server classes, so it links into both the lemonade +// CLI and lemond. The factory side (create_server) lives in backend_registry.h +// and is server-only. + +// All registered descriptors, in LEMON_BACKENDS order. +const std::vector& all_descriptors(); + +// Descriptor for a recipe, or nullptr if the recipe has no registered backend. +const BackendDescriptor* descriptor_for(const std::string& recipe); + +// True if the recipe is backed by a registered descriptor. +bool has_backend(const std::string& recipe); + +// True if the recipe publishes ROCm release channels (stable/nightly) — i.e. its +// "rocm" backend resolves to a channel-specific artifact. False for recipes whose +// rocm build is a single artifact (or that have no rocm build at all). 
+bool recipe_has_rocm_channels(const std::string& recipe); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h new file mode 100644 index 000000000..047c6795d --- /dev/null +++ b/src/cpp/include/lemon/backends/backend_ops.h @@ -0,0 +1,177 @@ +#pragma once + +#include +#include +#include +#include "lemon/model_manager.h" // ModelInfo, DownloadProgressCallback (server-side only) + +namespace lemon { + +class CloudProviderRegistry; + +namespace backends { + +// Context handed to BackendOps methods — the bits of server state model +// management needs without a running subprocess. Grows as migrations require. +struct BackendOpsContext { + ModelManager* model_manager = nullptr; + CloudProviderRegistry* cloud_registry = nullptr; // for dynamic cloud discovery +}; + +// Inputs for resolving a checkpoint's on-disk path. The model manager computes +// the HF-cache locations generically; each backend's ops decide how to find its +// artifact within (a .gguf file, a genai_config.json directory, a .bin, …). +struct CheckpointResolveContext { + std::string hf_cache; // HF cache root dir + std::string model_cache_path; // hf_cache/ + std::string repo_id; // checkpoint's repo id + std::string main_repo_id; // the model's "main" checkpoint repo id (fallback) + std::string variant; // checkpoint variant after ':' ("" if none) + std::string type; // checkpoint type ("main", "mmproj", "npu_cache", …) + std::string checkpoint; // the raw checkpoint string +}; + +// Stateless per-backend behavior for model management that happens WITHOUT a +// running subprocess: checkpoint-path resolution, download, dynamic discovery, +// per-model metadata, version detection, availability. One singleton per +// backend, exposed via lemon::backends::::ops() and bound in the registry +// (see BackendRegistration::ops). 
+// +// The base class is the shared default behavior (the common HF-backed case); +// each backend folder overrides ONLY the policy points it needs, so shared +// logic is inherited rather than copied. Methods are added here incrementally as +// switchboards in model_manager / system_info are migrated; every method has a +// default so adding one never forces edits to backends that don't override it. +class BackendOps { +public: + virtual ~BackendOps() = default; + + // Populate model-specific metadata (context window, capability labels, …) + // for a downloaded model. Default: nothing. + virtual void populate_metadata(ModelInfo& info, const BackendOpsContext& ctx) const { + (void)info; + (void)ctx; + } + + // Resolve a checkpoint to its absolute on-disk path (file or directory). + // Default: the shared HF behavior — locate the variant/aux file in the active + // snapshot, else fall back to the model cache directory. Backends with a + // bespoke artifact layout (GGUF file, genai_config.json dir, .bin, …) override. + virtual std::string resolve_checkpoint_path(const ModelInfo& info, + const CheckpointResolveContext& ctx) const; + + // Find the primary checkpoint artifact inside a freshly-imported local + // directory (a local_import pull), e.g. the .gguf / .bin file or the + // genai_config.json directory. Returns the absolute path to register, or "" + // to register the directory itself. Default: "" (register the directory). + virtual std::string find_imported_checkpoint(const std::string& import_dir) const { + (void)import_dir; + return ""; + } + + // Validate a user-supplied checkpoint string when registering a new model. + // Return an error message if invalid, "" if acceptable. Default: accept. + // llamacpp requires a :variant on GGUF checkpoints. 
+ virtual std::string validate_registration_checkpoint(const std::string& checkpoint) const { + (void)checkpoint; + return ""; + } + + // Select the repo-relative files to download for the main checkpoint + // `main_variant`, for backends whose artifact layout isn't a GGUF file. + // Return nullopt to use the default GGUF selection. (Direct single-file + // variants — .safetensors/.pth/.ckpt — are handled generically upstream.) + // moonshine overrides: its variant names a directory of files to fetch. + virtual std::optional> select_checkpoint_files( + const std::string& main_variant, const std::vector& repo_files) const { + (void)main_variant; + (void)repo_files; + return std::nullopt; + } + + // Models supplied at runtime rather than from server_models.json (descriptor + // dynamic_models = true). Default: none. cloud/flm override. + virtual std::vector discover_models(const BackendOpsContext& ctx) const { + (void)ctx; + return {}; + } + + // Whether a model's local artifacts are present. Default: the shared HF + // checkpoint-completeness check (ModelManager::checkpoints_complete). cloud + // (always true) and flm (installed-set membership) override. + virtual bool is_downloaded(const ModelInfo& info, const BackendOpsContext& ctx) const; + + // Validate a resolved checkpoint file for the cache. Returns "" if valid, or + // a reason it should be treated as not-downloaded. Default: always valid; + // llamacpp checks GGUF magic. + virtual std::string validate_checkpoint_file(const std::string& resolved_path) const { + (void)resolved_path; + return ""; + } + + // Download a model's artifacts. Default: the shared Hugging Face download. + // cloud (no-op) and flm (flm pull) override. + virtual void download_model(const ModelInfo& info, bool do_not_upgrade, + DownloadProgressCallback progress, + const BackendOpsContext& ctx) const; + + // Whether the model cache must be rebuilt after this backend downloads a + // model (e.g. flm, whose model list changes). 
Default: false. + virtual bool invalidates_cache_after_download() const { return false; } + + // Resolve a backend's installed version for a given backend variant. The + // caller passes the version read from the on-disk version.txt (or "" if + // absent); the default returns it unchanged. Backends that detect their + // version another way override: llamacpp's "system" build runs + // `llama-server --version`; flm queries `flm version` when no file is present. + virtual std::string resolve_version(const std::string& backend, + const std::string& file_version) const { + (void)backend; + return file_version; + } + + // Result of a backend-specific install check: whether the backend variant is + // usable, plus an optional error explaining why not. + struct InstallCheck { + bool installed = false; + std::string error; + }; + + // Decide whether a backend variant is installed, given whether its managed + // binary was found on disk. Default: installed iff the binary was found. + // llamacpp's "system" build also requires the ggml HIP plugin when an AMD GPU + // is present; flm can be a system PATH package even without a managed binary. + virtual InstallCheck check_install(const std::string& backend, bool binary_found) const { + (void)backend; + return {binary_found, ""}; + } + + // The /system-info state for a backend variant that is supported but not + // currently available (install probe failed). + struct UnavailableState { + std::string state; // "installable" | "update_required" | "action_required" + std::string message; // shown to the user + std::string action; // remediation (a URL or an install command) + bool attach_installed_version = false; // surface the installed version too + }; + + // Classify a "supported but not available" backend variant for /system-info, + // given the install probe's error text and the generic install command the + // caller would otherwise use. Return nullopt to use the generic + // installable/no-fetch default. 
flm overrides: it is a system .deb + drivers + // needing manual setup, so its states and remediation links differ. + virtual std::optional classify_unavailable( + const std::string& backend, const std::string& install_error, + const std::string& default_install_command) const { + (void)backend; + (void)install_error; + (void)default_install_command; + return std::nullopt; + } +}; + +// Shared default ops instance for backends that override nothing. +const BackendOps* default_backend_ops(); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h new file mode 100644 index 000000000..240ddf728 --- /dev/null +++ b/src/cpp/include/lemon/backends/backend_registry.h @@ -0,0 +1,76 @@ +#pragma once + +#include +#include +#include "lemon/backends/backend_descriptor.h" +#include "lemon/backends/backend_descriptor_registry.h" +#include "lemon/backends/backend_ops.h" + +namespace lemon { + +class WrappedServer; +class ModelManager; +class BackendManager; +class CloudProviderRegistry; +struct ModelInfo; + +namespace backends { + +struct BackendSpec; // install/download spec, defined in backend_utils.h + +// Everything a backend's create() needs to build an instance. Mirrors the +// arguments the old router factory passed to each backend constructor. +struct BackendContext { + std::string log_level; + ModelManager* model_manager = nullptr; + BackendManager* backend_manager = nullptr; + CloudProviderRegistry* cloud_registry = nullptr; + const ModelInfo* model_info = nullptr; // for per-create setup (cloud provider, ryzenai model path) +}; + +using BackendCreateFn = std::unique_ptr (*)(const BackendContext&); + +// Convenience for the common create(): construct a server class from the +// standard (log_level, model_manager, backend_manager) context fields. Backends +// needing extra constructor arguments (cloud, ryzenai) build theirs by hand. 
+template +std::unique_ptr make_server(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +// Construct-on-first-use singleton for a stateless ops class, giving the +// registry a stable pointer. Backends with no custom behavior return +// default_backend_ops() from their ops() instead. +template +const BackendOps* single_ops() { + static const T kOps; + return &kOps; +} + +// Binds a descriptor (what the backend is) to its server class's create() (how +// it runs). The generated factory registry supplies one per backend. This API is +// server-only: it references server classes via create(), so it is compiled into +// lemond but not the CLI. The CLI reads descriptors through backend_descriptor_registry.h. +struct BackendRegistration { + const BackendDescriptor* descriptor; + BackendCreateFn create; + const BackendSpec* spec; // install/download spec, or nullptr (e.g. cloud has none) + const BackendOps* ops; // stateless model-management behavior (never null) +}; + +// All registered (descriptor, create, spec, ops) entries, in LEMON_BACKENDS order. +const std::vector& all_registrations(); + +// Install/download spec for a recipe, or nullptr if the recipe has none. +const BackendSpec* spec_for(const std::string& recipe); + +// Stateless model-management ops for a recipe. Falls back to the shared default +// ops (base behavior) for recipes with no registered backend. +const BackendOps* ops_for(const std::string& recipe); + +// Construct a backend instance for a recipe and associate its descriptor, or +// nullptr if the recipe has no registered backend. 
+std::unique_ptr create_server(const std::string& recipe, const BackendContext& ctx); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/backend_utils.h b/src/cpp/include/lemon/backends/backend_utils.h index bfc37734d..bdbfe0869 100644 --- a/src/cpp/include/lemon/backends/backend_utils.h +++ b/src/cpp/include/lemon/backends/backend_utils.h @@ -5,6 +5,7 @@ #include #include #include +#include "lemon/backends/backend_descriptor.h" namespace fs = std::filesystem; @@ -42,6 +43,17 @@ namespace lemon::backends { std::string log_name() const { return recipe + " Server"; }; }; + // Build a backend's install/download spec from its descriptor's recipe/binary + // and the server class T's get_install_params. The construct-on-first-use + // static gives the registry a stable pointer. Backends whose install key + // differs from the recipe (ryzenai) or that have no installable artifact + // (cloud) build their BackendSpec by hand instead of using this. + template + const BackendSpec* make_spec(const BackendDescriptor& d, bool split = false) { + static const BackendSpec kSpec(d.recipe, d.binary, T::get_install_params, split); // built once per T from the first call's d/split; later calls return the same object + return &kSpec; + } + // Return the backend spec for recipes that use the standard BackendSpec flow. // Returns nullptr for recipes that require custom handling (e.g., flm) or unknown recipes. const BackendSpec* try_get_spec_for_recipe(const std::string& recipe); diff --git a/src/cpp/include/lemon/backends/cloud/cloud.h b/src/cpp/include/lemon/backends/cloud/cloud.h new file mode 100644 index 000000000..976a84f70 --- /dev/null +++ b/src/cpp/include/lemon/backends/cloud/cloud.h @@ -0,0 +1,32 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { +namespace cloud { + +// The cloud backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = { + /*recipe*/ "cloud", + /*display_name*/ "Cloud", + /*binary*/ "", // no subprocess: runs on a remote provider + /*config_section*/ "", // defaults to recipe + /*default_device*/ DEVICE_NONE, + /*slot_policy*/ SlotPolicy::Unmetered, // never counts toward slots, never auto-evicted + /*selectable_backend*/ false, + /*uses_ctx_size*/ false, + /*dynamic_models*/ true, // models discovered at runtime from the provider + /*options*/ {}, + /*support*/ {}, // no local gating: install/support machinery skips cloud + /*default_labels*/ {}, + /*required_checkpoints*/ {}, // no downloaded files + /*modality*/ "", + /*experimental*/ false, + /*web_display_name*/ "", +}; + +} // namespace cloud +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/cloud_server.h b/src/cpp/include/lemon/backends/cloud/cloud_server.h similarity index 92% rename from src/cpp/include/lemon/backends/cloud_server.h rename to src/cpp/include/lemon/backends/cloud/cloud_server.h index 21bf20642..51b61d6f4 100644 --- a/src/cpp/include/lemon/backends/cloud_server.h +++ b/src/cpp/include/lemon/backends/cloud/cloud_server.h @@ -1,7 +1,9 @@ #pragma once -#include "../model_manager.h" -#include "../wrapped_server.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/model_manager.h" +#include "lemon/wrapped_server.h" #include #include @@ -109,5 +111,11 @@ class CloudServer : public WrappedServer { bool loaded_ = false; }; -} // namespace backends -} // namespace lemon +namespace cloud { +// Factory for the cloud backend (constructs the server class — lemond only). 
+std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); +const BackendOps* ops(); +} // namespace cloud +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h new file mode 100644 index 000000000..24049ab31 --- /dev/null +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h @@ -0,0 +1,48 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { +namespace fastflowlm { + +// The fastflowlm backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. +inline const BackendDescriptor descriptor = { + /*recipe*/ "flm", + /*display_name*/ "FastFlowLM NPU", +#ifdef _WIN32 + /*binary*/ "flm.exe", +#else + /*binary*/ "flm", +#endif + /*config_section*/ "", // defaults to recipe + /*default_device*/ DEVICE_NPU, + /*slot_policy*/ SlotPolicy::CoexistByType, + /*selectable_backend*/ false, + /*uses_ctx_size*/ true, + /*dynamic_models*/ true, // models come from flm's model_list.json, not server_models.json + /*options*/ {}, + /*support*/ { + {"npu", {"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"}, + }, + /*default_labels*/ {}, + /*required_checkpoints*/ {"main"}, + /*modality*/ "Text generation", + /*experimental*/ false, + /*web_display_name*/ "FastFlowLM NPU", + /*web_priority*/ 3, + /*rocm_channels*/ {}, + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ false, + /*version_policy*/ VersionPolicy::AtLeast, // system-managed package + /*self_manages_downloads*/ true, // flm pulls its own models via the flm CLI + /*takes_args*/ true, + /*arg_variants*/ {}, + /*bin_variants*/ {}, + /*config_extra*/ nlohmann::json::object(), +}; + +} // namespace fastflowlm +} // namespace backends +} // namespace lemon diff --git 
a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h new file mode 100644 index 000000000..87470300c --- /dev/null +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h @@ -0,0 +1,55 @@ +#pragma once + +#include +#include +#include +#include +#include "lemon/model_manager.h" // ModelInfo, DownloadProgressCallback + +namespace lemon { + +namespace backends { +namespace fastflowlm { + +// Locate the FLM executable (install dir on Windows, system PATH on Linux). +std::string find_flm_binary(); + +// Installed FLM model checkpoints (from `flm list --filter installed`). +std::vector flm_installed_checkpoints(); + +// Discover all available FLM models (from `flm list --json`), each with its +// downloaded status set. Returns empty if FLM is not ready. +std::vector flm_discover_models(); + +// FLM-specific model-file helpers. FLM stores models under FLM_MODEL_PATH / +// platform-default roots and describes them with a config.json; this knowledge +// lives in the fastflowlm backend folder rather than in the shared model manager. + +// Derive the on-disk repo directory name from an FLM model URL. +std::string repo_dir_from_url(const std::string& url); + +// Locate config.json for an FLM repo dir across the candidate model roots. +std::filesystem::path find_flm_config_path_from_repo_dir(const std::string& repo_dir); + +// Read the model's max context window from its FLM config.json (0 if unknown). +int64_t read_flm_max_context_window(const ModelInfo& info); + +// Locate the flm executable on PATH / install dirs ("" if not found). NOTE(review): description overlaps find_flm_binary() above — confirm both lookups are needed. +std::string find_flm_executable(); + +// Run `flm validate` and report readiness; sets error_message on failure. +bool run_flm_validate(const std::string& flm_path, std::string& error_message); + +// Detect the installed FLM version via `flm version` ("unknown" if unavailable).
+std::string flm_version(); + +// Download (pull) an FLM model by checkpoint via the `flm` CLI. +void flm_download(const std::string& checkpoint, bool do_not_upgrade, + DownloadProgressCallback progress_callback); + +// Remove an installed FLM model by checkpoint via `flm remove`; throws on failure. +void flm_remove(const std::string& checkpoint); + +} // namespace fastflowlm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/fastflowlm_server.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h similarity index 82% rename from src/cpp/include/lemon/backends/fastflowlm_server.h rename to src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h index bd9c554ac..bdcb1d88a 100644 --- a/src/cpp/include/lemon/backends/fastflowlm_server.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h @@ -1,7 +1,9 @@ #pragma once -#include "../wrapped_server.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include "lemon/backends/backend_utils.h" #include namespace lemon { @@ -11,17 +13,6 @@ class FastFlowLMServer : public WrappedServer, public IEmbeddingsServer, public public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - // recipe - "flm", - // executable - #ifdef _WIN32 - "flm.exe" - #else - "flm" - #endif - , get_install_params - ); FastFlowLMServer(const std::string& log_level, ModelManager* model_manager = nullptr, BackendManager* backend_manager = nullptr); @@ -70,5 +61,11 @@ class FastFlowLMServer : public WrappedServer, public IEmbeddingsServer, public bool is_loaded_ = false; }; -} // namespace backends -} // namespace lemon +namespace fastflowlm { +// Factory for the fastflowlm backend (constructs the server class — lemond only). 
+std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); +const BackendOps* ops(); +} // namespace fastflowlm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/hf_cache_util.h b/src/cpp/include/lemon/backends/hf_cache_util.h new file mode 100644 index 000000000..91c64278e --- /dev/null +++ b/src/cpp/include/lemon/backends/hf_cache_util.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include + +namespace lemon { +namespace backends { +namespace hf_cache { + +// Shared Hugging Face cache mechanics used by backend ops to locate model +// artifacts on disk (the same logic model_manager uses for its own cache work). + +// Exists check that tolerates the symlinks HF uses for dedup (Win32 on Windows, +// where MSVC's std::filesystem refuses untrusted reparse points). +bool exists(const std::filesystem::path& p); + +// Directory-iteration options that skip inaccessible/symlinked entries instead +// of throwing. +std::filesystem::directory_options dir_options(); + +// The active HF snapshot directory (snapshots/) for a model cache +// dir, or an empty path if there is no recorded ref / it doesn't exist. +std::filesystem::path active_snapshot_path(const std::filesystem::path& model_cache_path); + +// HF cache directory name for a repo id ("org/repo" -> "models--org--repo"). +std::string repo_id_to_cache_dir_name(const std::string& repo_id); + +} // namespace hf_cache +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h new file mode 100644 index 000000000..5f3fbf97c --- /dev/null +++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h @@ -0,0 +1,49 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { +namespace kokoro { + +// The kokoro backend descriptor (plain data). 
Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. +inline const BackendDescriptor descriptor = { + /*recipe*/ "kokoro", + /*display_name*/ "Kokoro", +#ifdef _WIN32 + /*binary*/ "koko.exe", +#else + /*binary*/ "koko", +#endif + /*config_section*/ "", // defaults to recipe + /*default_device*/ DEVICE_CPU, + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ false, + /*uses_ctx_size*/ false, + /*dynamic_models*/ false, + /*options*/ {}, + /*support*/ { + {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"}, + {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, + }, + /*default_labels*/ {}, // kokoro models carry "tts" explicitly in server_models.json + /*required_checkpoints*/ {"main"}, + /*modality*/ "Text-to-speech", + /*experimental*/ false, + /*web_display_name*/ "", + /*web_priority*/ 6, + /*rocm_channels*/ {}, + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ false, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + /*takes_args*/ false, + /*arg_variants*/ {}, + /*bin_variants*/ {"cpu"}, + /*config_extra*/ nlohmann::json::object(), +}; + +} // namespace kokoro +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/kokoro_server.h b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h similarity index 69% rename from src/cpp/include/lemon/backends/kokoro_server.h rename to src/cpp/include/lemon/backends/kokoro/kokoro_server.h index 0b99bcb96..6a9738252 100644 --- a/src/cpp/include/lemon/backends/kokoro_server.h +++ b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h @@ -1,8 +1,10 @@ #pragma once -#include "../wrapped_server.h" -#include "../server_capabilities.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include "lemon/server_capabilities.h" +#include "lemon/backends/backend_utils.h" #include #include @@ 
-13,15 +15,6 @@ class KokoroServer : public WrappedServer, public ITextToSpeechServer { public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - "kokoro", - #ifdef _WIN32 - "koko.exe" - #else - "koko" - #endif - , get_install_params - ); explicit KokoroServer(const std::string& log_level, ModelManager* model_manager, @@ -45,5 +38,11 @@ class KokoroServer : public WrappedServer, public ITextToSpeechServer { void audio_speech(const json& request, httplib::DataSink& sink) override; }; -} // namespace backends -} // namespace lemon +namespace kokoro { +// Factory for the kokoro backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); +const BackendOps* ops(); +} // namespace kokoro +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h new file mode 100644 index 000000000..7c58a73f3 --- /dev/null +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h @@ -0,0 +1,62 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { +namespace llamacpp { + +// The llamacpp backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. 
+inline const BackendDescriptor descriptor = { + /*recipe*/ "llamacpp", + /*display_name*/ "Llama.cpp GPU", +#ifdef _WIN32 + /*binary*/ "llama-server.exe", +#else + /*binary*/ "llama-server", +#endif + /*config_section*/ "", // defaults to recipe + /*default_device*/ DEVICE_GPU, // cpu/system variants resolve to CPU via effective_device() + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ true, + /*uses_ctx_size*/ true, + /*dynamic_models*/ false, + /*options*/ { + {"llamacpp_backend", "--llamacpp", "", "BACKEND", + "LlamaCpp backend to use", "Llama.cpp Backend Options"}, + {"llamacpp_device", "--llamacpp-device", "", "DEVICES", + "Comma-separated list of accelerator devices to use (e.g. Vulkan0)", "Llama.cpp Backend Options"}, + {"llamacpp_args", "--llamacpp-args", "", "ARGS", + "Custom arguments to pass to llama-server", "Llama.cpp Backend Options"}, + }, + /*support*/ { + {"system", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/ARM64 CPU, GPU"}, + {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, + {"cuda", {"windows", "linux"}, + {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}, "NVIDIA GPUs (Turing or newer)**"}, + {"vulkan", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}, {"amd_gpu", {}}}, "x86_64 CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)"}, + {"rocm", {"windows", "linux"}, + {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"}, + {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64 CPU; ARM64 CPU (Linux)"}, + }, + /*default_labels*/ {}, + /*required_checkpoints*/ {"main"}, + /*modality*/ "Text generation", + /*experimental*/ false, + /*web_display_name*/ "llama.cpp GPU", + /*web_priority*/ 1, + /*rocm_channels*/ {"stable", "nightly"}, + /*exposes_prometheus_metrics*/ true, + /*rocm_requires_cwsr_fix*/ true, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + 
/*takes_args*/ true, + /*arg_variants*/ {"rocm", "vulkan", "cpu"}, + /*bin_variants*/ {"rocm", "vulkan", "cuda", "cpu"}, + /*config_extra*/ {{"prefer_system", true}}, +}; + +} // namespace llamacpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h new file mode 100644 index 000000000..6bf170584 --- /dev/null +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h @@ -0,0 +1,17 @@ +#pragma once + +#include + +namespace lemon { +namespace backends { +namespace llamacpp { + +// Resolve the on-disk path of the GGUF file for a model cache directory and +// variant (handles sharding, folder variants, and quant-token fallback). Returns +// the cache directory if no GGUF is present, or "" if the requested variant +// can't be resolved. +std::string resolve_gguf_path(const std::string& model_cache_path, const std::string& variant); + +} // namespace llamacpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/llamacpp_server.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h similarity index 77% rename from src/cpp/include/lemon/backends/llamacpp_server.h rename to src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h index c9356f6b8..8a7a8405f 100644 --- a/src/cpp/include/lemon/backends/llamacpp_server.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h @@ -1,7 +1,9 @@ #pragma once -#include "../wrapped_server.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include "lemon/backends/backend_utils.h" #include namespace lemon { @@ -11,15 +13,6 @@ class LlamaCppServer : public WrappedServer, public IEmbeddingsServer, public IR public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - "llamacpp", - #ifdef _WIN32 - 
"llama-server.exe" - #else - "llama-server" - #endif - , get_install_params - ); LlamaCppServer(const std::string& log_level, ModelManager* model_manager, @@ -56,5 +49,11 @@ class LlamaCppServer : public WrappedServer, public IEmbeddingsServer, public IR json tokenize(const json& request) override; }; -} // namespace backends -} // namespace lemon +namespace llamacpp { +// Factory for the llamacpp backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); +const BackendOps* ops(); +} // namespace llamacpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine.h b/src/cpp/include/lemon/backends/moonshine/moonshine.h new file mode 100644 index 000000000..ae7313714 --- /dev/null +++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h @@ -0,0 +1,49 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { +namespace moonshine { + +// The moonshine backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. 
+inline const BackendDescriptor descriptor = { + /*recipe*/ "moonshine", + /*display_name*/ "Moonshine", + /*binary*/ "moonshine-server", + /*config_section*/ "", // defaults to recipe + /*default_device*/ DEVICE_CPU, + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ false, + /*uses_ctx_size*/ false, + /*dynamic_models*/ false, + /*options*/ { + {"moonshine_args", "--moonshine-args", "", "ARGS", + "Custom arguments to pass to moonshine-server", ""}, + }, + /*support*/ { + {"cpu", {"windows"}, {{"cpu", {"x86_64"}}}, "x86_64/arm64 CPU"}, + {"cpu", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/arm64 CPU"}, + {"cpu", {"macos"}, {{"cpu", {"arm64"}}}, "x86_64/arm64 CPU"}, + }, + /*default_labels*/ {"transcription", "realtime-transcription"}, + /*required_checkpoints*/ {"main"}, + /*modality*/ "Speech-to-text", + /*experimental*/ false, + /*web_display_name*/ "", + /*web_priority*/ 0, + /*rocm_channels*/ {}, + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ false, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + /*takes_args*/ true, + /*arg_variants*/ {"cpu"}, + /*bin_variants*/ {"cpu"}, + /*config_extra*/ nlohmann::json::object(), +}; + +} // namespace moonshine +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/moonshine_server.h b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h similarity index 76% rename from src/cpp/include/lemon/backends/moonshine_server.h rename to src/cpp/include/lemon/backends/moonshine/moonshine_server.h index 6f13f216b..e6535a34b 100644 --- a/src/cpp/include/lemon/backends/moonshine_server.h +++ b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h @@ -1,8 +1,10 @@ #pragma once -#include "../wrapped_server.h" -#include "../server_capabilities.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include "lemon/server_capabilities.h" +#include 
"lemon/backends/backend_utils.h" #include namespace lemon { @@ -12,11 +14,6 @@ class MoonshineServer : public WrappedServer, public ITranscriptionServer, publi public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - "moonshine", - "moonshine-server", - get_install_params - ); explicit MoonshineServer(const std::string& log_level, ModelManager* model_manager, @@ -51,5 +48,11 @@ class MoonshineServer : public WrappedServer, public ITranscriptionServer, publi int tcp_port_ = 0; // Port for line-delimited JSON streaming }; -} // namespace backends -} // namespace lemon +namespace moonshine { +// Factory for the moonshine backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); +const BackendOps* ops(); +} // namespace moonshine +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h new file mode 100644 index 000000000..dbc15d7f3 --- /dev/null +++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h @@ -0,0 +1,48 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { +namespace ryzenai { + +// The ryzenai backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. 
+inline const BackendDescriptor descriptor = { + /*recipe*/ "ryzenai-llm", + /*display_name*/ "Ryzen AI LLM", +#ifdef _WIN32 + /*binary*/ "ryzenai-server.exe", +#else + /*binary*/ "ryzenai-server", +#endif + /*config_section*/ "ryzenai", // differs from recipe "ryzenai-llm" + /*default_device*/ DEVICE_NPU, + /*slot_policy*/ SlotPolicy::ExclusiveNpu, + /*selectable_backend*/ false, + /*uses_ctx_size*/ true, + /*dynamic_models*/ false, + /*options*/ {}, + /*support*/ { + {"npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"}, + }, + /*default_labels*/ {}, + /*required_checkpoints*/ {"main"}, + /*modality*/ "Text generation", + /*experimental*/ false, + /*web_display_name*/ "Ryzen AI SW NPU", + /*web_priority*/ 2, + /*rocm_channels*/ {}, + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ false, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + /*takes_args*/ false, + /*arg_variants*/ {}, + /*bin_variants*/ {"server"}, + /*config_extra*/ nlohmann::json::object(), +}; + +} // namespace ryzenai +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/ryzenaiserver.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h similarity index 78% rename from src/cpp/include/lemon/backends/ryzenaiserver.h rename to src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h index 36e1ba98d..f3a6806e7 100644 --- a/src/cpp/include/lemon/backends/ryzenaiserver.h +++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h @@ -1,5 +1,7 @@ #pragma once +#include "lemon/backends/backend_registry.h" + #include "lemon/wrapped_server.h" #include "lemon/server_capabilities.h" #include "lemon/backends/backend_utils.h" @@ -15,15 +17,6 @@ class RyzenAIServer : public WrappedServer { public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - "ryzenai-server", -#ifdef _WIN32 - "ryzenai-server.exe" -#else - 
"ryzenai-server" -#endif - , get_install_params - ); RyzenAIServer(const std::string& model_name, bool debug, ModelManager* model_manager, BackendManager* backend_manager); @@ -54,3 +47,14 @@ class RyzenAIServer : public WrappedServer { }; } // namespace lemon + +namespace lemon { +namespace backends { +namespace ryzenai { +// Factory for the ryzenai backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); +const BackendOps* ops(); +} // namespace ryzenai +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h new file mode 100644 index 000000000..986d26fbe --- /dev/null +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h @@ -0,0 +1,66 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { +namespace sdcpp { + +// The sdcpp backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. +inline const BackendDescriptor descriptor = { + /*recipe*/ "sd-cpp", + /*display_name*/ "StableDiffusion.cpp", +#ifdef _WIN32 + /*binary*/ "sd-server.exe", +#else + /*binary*/ "sd-server", +#endif + /*config_section*/ "sdcpp", + /*default_device*/ DEVICE_CPU, + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ true, + /*uses_ctx_size*/ false, + /*dynamic_models*/ false, + /*options*/ { + {"sd-cpp_backend", "--sdcpp", "", "BACKEND", + "SD.cpp backend to use", "Stable Diffusion Options"}, + {"sdcpp_args", "--sdcpp-args", "", "ARGS", + "Custom arguments to pass to sd-server (must not conflict with managed args)", "Stable Diffusion Options"}, + // Image generation defaults (recipe-level only, not CLI flags). 
+ {"steps", "", 20, "SIZE", "Number of diffusion steps", "Stable Diffusion Options"}, + {"cfg_scale", "", 7.0, "SIZE", "Classifier-free guidance scale", "Stable Diffusion Options"}, + {"width", "", 512, "SIZE", "Output image width", "Stable Diffusion Options"}, + {"height", "", 512, "SIZE", "Output image height", "Stable Diffusion Options"}, + {"sampling_method", "", "", "ARGS", "Sampling method", "Stable Diffusion Options"}, + {"flow_shift", "", 0.0, "SIZE", "Flow shift", "Stable Diffusion Options"}, + }, + /*support*/ { + {"rocm", {"windows", "linux"}, + {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"}, + {"cuda", {"linux"}, + {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}, "NVIDIA GPUs (Turing or newer)**"}, + {"vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}, {"nvidia_gpu", {}}}, "Vulkan-capable GPUs"}, + {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"}, + {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, + }, + /*default_labels*/ {"image"}, + /*required_checkpoints*/ {"main"}, // flux text_encoder+vae validated together in load() + /*modality*/ "Image generation", + /*experimental*/ false, + /*web_display_name*/ "stable-diffusion.cpp", + /*web_priority*/ 5, + /*rocm_channels*/ {"stable"}, + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ true, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + /*takes_args*/ true, + /*arg_variants*/ {"cpu", "rocm", "vulkan"}, + /*bin_variants*/ {"cpu", "rocm", "vulkan"}, + /*config_extra*/ {{"steps", 20}, {"cfg_scale", 7.0}, {"width", 512}, {"height", 512}}, +}; + +} // namespace sdcpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/sd_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h similarity index 85% rename from 
src/cpp/include/lemon/backends/sd_server.h rename to src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h index 857374951..185108afc 100644 --- a/src/cpp/include/lemon/backends/sd_server.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h @@ -1,11 +1,13 @@ #pragma once -#include "../wrapped_server.h" -#include "../server_capabilities.h" -#include "../model_manager.h" -#include "../recipe_options.h" -#include "../utils/process_manager.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include "lemon/server_capabilities.h" +#include "lemon/model_manager.h" +#include "lemon/recipe_options.h" +#include "lemon/utils/process_manager.h" +#include "lemon/backends/backend_utils.h" #include #include @@ -16,15 +18,6 @@ class SDServer : public WrappedServer, public IImageServer { public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - "sd-cpp", - #ifdef _WIN32 - "sd-server.exe" - #else - "sd-server" - #endif - , get_install_params - ); explicit SDServer(const std::string& log_level, ModelManager* model_manager, @@ -93,5 +86,11 @@ class SDServer : public WrappedServer, public IImageServer { std::string resolve_size(const nlohmann::json& request) const; }; -} // namespace backends -} // namespace lemon +namespace sdcpp { +// Factory for the sdcpp backend (constructs the server class — lemond only). 
+std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); +const BackendOps* ops(); +} // namespace sdcpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h b/src/cpp/include/lemon/backends/vllm/vllm.h new file mode 100644 index 000000000..8984e15b3 --- /dev/null +++ b/src/cpp/include/lemon/backends/vllm/vllm.h @@ -0,0 +1,49 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { +namespace vllm { + +// The vllm backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. +inline const BackendDescriptor descriptor = { + /*recipe*/ "vllm", + /*display_name*/ "vLLM ROCm (experimental)", + /*binary*/ "vllm-server", + /*config_section*/ "", // defaults to recipe + /*default_device*/ DEVICE_GPU, + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ true, + /*uses_ctx_size*/ true, + /*dynamic_models*/ false, + /*options*/ { + {"vllm_backend", "--vllm", "", "BACKEND", + "vLLM backend to use", "vLLM Options"}, + {"vllm_args", "--vllm-args", "", "ARGS", + "Custom arguments to pass to vllm-server", "vLLM Options"}, + }, + /*support*/ { + {"rocm", {"linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Strix Halo iGPU (gfx1151)"}, + }, + /*default_labels*/ {}, + /*required_checkpoints*/ {"main"}, + /*modality*/ "Text generation", + /*experimental*/ true, + /*web_display_name*/ "", + /*web_priority*/ 0, + /*rocm_channels*/ {}, // single rocm artifact, no stable/nightly channels + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ true, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + /*takes_args*/ true, + /*arg_variants*/ {}, + /*bin_variants*/ {}, + /*config_extra*/ nlohmann::json::object(), +}; + +} // namespace vllm +} // namespace backends +} // namespace lemon diff --git 
a/src/cpp/include/lemon/backends/vllm_server.h b/src/cpp/include/lemon/backends/vllm/vllm_server.h similarity index 74% rename from src/cpp/include/lemon/backends/vllm_server.h rename to src/cpp/include/lemon/backends/vllm/vllm_server.h index 62ec94af2..1ac9438ed 100644 --- a/src/cpp/include/lemon/backends/vllm_server.h +++ b/src/cpp/include/lemon/backends/vllm/vllm_server.h @@ -1,7 +1,9 @@ #pragma once -#include "../wrapped_server.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include "lemon/backends/backend_utils.h" #include namespace lemon { @@ -11,12 +13,6 @@ class VLLMServer : public WrappedServer { public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - "vllm", - "vllm-server" - , get_install_params - , /*supports_split_archive=*/true - ); VLLMServer(const std::string& log_level, ModelManager* model_manager, @@ -45,5 +41,11 @@ class VLLMServer : public WrappedServer { }; -} // namespace backends -} // namespace lemon +namespace vllm { +// Factory for the vllm backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); +const BackendOps* ops(); +} // namespace vllm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h new file mode 100644 index 000000000..9c38b66d5 --- /dev/null +++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h @@ -0,0 +1,58 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { +namespace whispercpp { + +// The whispercpp backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. 
+inline const BackendDescriptor descriptor = { + /*recipe*/ "whispercpp", + /*display_name*/ "Whisper.cpp", +#ifdef _WIN32 + /*binary*/ "whisper-server.exe", +#else + /*binary*/ "whisper-server", +#endif + /*config_section*/ "", // defaults to recipe + /*default_device*/ DEVICE_CPU, // npu variant resolves to NPU + ExclusiveNpu via effective_*() + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ true, + /*uses_ctx_size*/ false, + /*dynamic_models*/ false, + /*options*/ { + {"whispercpp_backend", "--whispercpp", "", "BACKEND", + "WhisperCpp backend to use", "Whisper.cpp Options"}, + {"whispercpp_args", "--whispercpp-args", "", "ARGS", + "Custom arguments to pass to whisper-server", "Whisper.cpp Options"}, + }, + /*support*/ { + {"npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"}, + {"rocm", {"windows", "linux"}, + {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"}, + {"vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}}, "x86_64 CPU"}, + {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"}, + {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, + }, + /*default_labels*/ {"transcription", "realtime-transcription"}, + /*required_checkpoints*/ {"main"}, // npu_cache validated in load() (npu variant only) + /*modality*/ "Speech-to-text", + /*experimental*/ false, + /*web_display_name*/ "whisper.cpp", + /*web_priority*/ 4, + /*rocm_channels*/ {}, + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ false, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + /*takes_args*/ true, + /*arg_variants*/ {"cpu", "npu"}, + /*bin_variants*/ {"cpu", "npu"}, + /*config_extra*/ nlohmann::json::object(), +}; + +} // namespace whispercpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/whisper_server.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h similarity index 83% 
rename from src/cpp/include/lemon/backends/whisper_server.h rename to src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h index 55a1734d9..dc97cbd9f 100644 --- a/src/cpp/include/lemon/backends/whisper_server.h +++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h @@ -1,8 +1,10 @@ #pragma once -#include "../wrapped_server.h" -#include "../server_capabilities.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include "lemon/server_capabilities.h" +#include "lemon/backends/backend_utils.h" #include #include @@ -13,15 +15,6 @@ class WhisperServer : public WrappedServer, public ITranscriptionServer { public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - "whispercpp", -#ifdef _WIN32 - "whisper-server.exe" -#else - "whisper-server" -#endif - , get_install_params - ); explicit WhisperServer(const std::string& log_level, ModelManager* model_manager, @@ -74,5 +67,11 @@ class WhisperServer : public WrappedServer, public ITranscriptionServer { std::filesystem::path temp_dir_; // Directory for temporary audio files }; -} // namespace backends -} // namespace lemon +namespace whispercpp { +// Factory for the whispercpp backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); +const BackendOps* ops(); +} // namespace whispercpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/config_file.h b/src/cpp/include/lemon/config_file.h index ec56c17fb..8c46e125f 100644 --- a/src/cpp/include/lemon/config_file.h +++ b/src/cpp/include/lemon/config_file.h @@ -84,8 +84,15 @@ static inline bool config_migrate(json& config, /// Manages reading and writing config.json in the lemonade cache dir. class ConfigFile { public: - /// Returns the full default config loaded from installed resource JSON. 
- /// On Linux, an optional distro override at /usr/share/lemonade/defaults.json + /// The canonical default config: resources/defaults.json (global keys) with + /// each backend's per-recipe section seeded from its descriptor. Host- and + /// deployment-independent, so it is reproducible — this is what + /// GET /internal/config/defaults emits and gen_backend_boilerplate.py writes + /// back into resources/defaults.json. + static json base_defaults(); + + /// base_defaults() plus deployment overrides. On Linux, an optional distro + /// override at /usr/share/lemonade/defaults.json (and LEMONADE_DEFAULTS_PATH) /// is merged on top when present. static json get_defaults(); diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h index 77a10066e..c346e3ab9 100644 --- a/src/cpp/include/lemon/model_manager.h +++ b/src/cpp/include/lemon/model_manager.h @@ -77,11 +77,6 @@ struct ModelInfo { bool suggested = false; std::string source; // "local_upload" for locally uploaded models bool downloaded = false; // Whether model is downloaded and available - // When true, LlamaCppServer launches llama-server with `-hf ` - // instead of `-m [--mmproj ]`. Required for models like - // Qwen2.5-Omni where llama-server's manual-load path rejects audio content - // parts — the -hf path drives the dual-clip (vision+audio) context correctly. - bool hf_load = false; double size = 0.0; // Model size in GB int64_t max_context_window = 0; // Static model-supported text context, when known @@ -105,8 +100,18 @@ struct ModelInfo { double cost_input_per_million = -1.0; double cost_output_per_million = -1.0; - // Moonshine-specific model architecture (e.g., 2 = TINY_STREAMING, 4 = SMALL_STREAMING, 5 = MEDIUM_STREAMING) - int moonshine_arch = -1; + // Generic per-model fields a backend declares for itself. 
Any server_models.json + // key not consumed by a typed field above lands here, so a new backend can read + // custom per-model config in load() without editing this shared struct. + std::map extras; + + // Look up an extra field, returning a default when absent. + template + T extra(const std::string& key, const T& fallback) const { + auto it = extras.find(key); + if (it == extras.end() || it->second.is_null()) return fallback; + try { return it->second.get(); } catch (...) { return fallback; } + } // Utility std::string checkpoint(const std::string& type = "main") const { return checkpoints.count(type) ? checkpoints.at(type) : ""; } @@ -209,11 +214,19 @@ class ModelManager { // Check if model is downloaded bool is_model_downloaded(const std::string& model_name); - // Get list of installed FLM models (for caching) - std::vector get_flm_installed_models(); + // True if the model's backend pulls its own models on demand (e.g. flm) and + // so should be skipped by the router's load-time auto-download path. + bool backend_self_manages_downloads(const std::string& recipe) const; + + // Shared Hugging Face completeness check: true if all required checkpoints + // are present and complete (per-backend file validation runs via ops). The + // default BackendOps::is_downloaded delegates here for HF-backed backends. + bool checkpoints_complete(const ModelInfo& info) const; - // Get list of all available FLM models from 'flm list --json' - std::vector get_flm_available_models(); + // Shared Hugging Face download engine. The default BackendOps::download_model + // delegates here; flm/cloud override with their own download. 
+ void download_from_huggingface_engine(const ModelInfo& info, + DownloadProgressCallback progress_callback = nullptr); // Get HuggingFace cache directory (respects HF_HUB_CACHE, HF_HOME, and platform defaults) std::string get_hf_cache_dir() const; @@ -295,11 +308,6 @@ class ModelManager { void download_from_huggingface(const ModelInfo& info, DownloadProgressCallback progress_callback = nullptr); - // Download from FLM - void download_from_flm(const std::string& checkpoint, - bool do_not_upgrade = true, - DownloadProgressCallback progress_callback = nullptr); - // Discover GGUF models from extra_models_dir std::map discover_extra_models() const; diff --git a/src/cpp/include/lemon/model_types.h b/src/cpp/include/lemon/model_types.h index eb5d4e0b4..c92bedb37 100644 --- a/src/cpp/include/lemon/model_types.h +++ b/src/cpp/include/lemon/model_types.h @@ -139,28 +139,14 @@ inline ModelType get_model_type_from_labels(const std::vector& labe return ModelType::LLM; } -// Determine device type from recipe -// Default device from recipe — individual backends override based on their config +// Fallback device type for recipes with no registered backend descriptor +// (collections and unknown recipes). The authoritative per-backend default lives +// in BackendDescriptor::default_device; ModelManager::device_type_for_recipe +// consults the descriptor registry first and only falls back here. Kept in this +// low-level header (which must not depend on the backend registry) for that +// fallback alone — it intentionally carries no per-backend knowledge. 
inline DeviceType get_device_type_from_recipe(const std::string& recipe) { - if (recipe == "llamacpp") { - return DEVICE_GPU; - } else if (recipe == "ryzenai-llm") { - return DEVICE_NPU; - } else if (recipe == "flm") { - return DEVICE_NPU; - } else if (recipe == "whispercpp") { - return DEVICE_CPU; - } else if (recipe == "moonshine") { - return DEVICE_CPU; - } else if (recipe == "sd-cpp") { - return DEVICE_CPU; - } else if (recipe == "kokoro") { - return DEVICE_CPU; - } else if (is_collection_recipe(recipe)) { - return DEVICE_NONE; - } else if (recipe == "cloud") { - return DEVICE_NONE; // Cloud-offloaded models execute on a remote provider - } + (void)recipe; return DEVICE_NONE; } diff --git a/src/cpp/include/lemon/recipe_backend_def.h b/src/cpp/include/lemon/recipe_backend_def.h new file mode 100644 index 000000000..ec0af9a9d --- /dev/null +++ b/src/cpp/include/lemon/recipe_backend_def.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include + +namespace lemon { + +// Device constraints: device_type -> set of allowed families (empty = all families) +using DeviceConstraints = std::map>; + +// A single recipe/backend support row: which OS and device families a given +// (recipe, backend) pair runs on. The canonical support matrix is assembled by +// collecting these rows from every backend descriptor (see BackendDescriptor::support). +// +// IMPORTANT: For recipes with multiple backends (e.g. llamacpp), the order in +// which these rows appear defines the preference order — first listed = most +// preferred. Empty family set {} means "all families of that device type". +struct RecipeBackendDef { + std::string recipe; + std::string backend; + std::set supported_os; + DeviceConstraints devices; + // Human-friendly device description for the generated support matrix (README). + // May contain footnote markers (e.g. "*") whose text lives as prose in the doc. 
+ std::string device_summary = ""; +}; + +// A backend descriptor's support row, without the recipe (it's always the +// owning descriptor's recipe — assembling a RecipeBackendDef fills it in). Keeps +// the descriptor literals from repeating their own recipe on every row. +struct BackendSupport { + std::string backend; + std::set supported_os; + DeviceConstraints devices; + std::string device_summary = ""; +}; + +} // namespace lemon diff --git a/src/cpp/include/lemon/router.h b/src/cpp/include/lemon/router.h index e98a8b11d..a4f2d9629 100644 --- a/src/cpp/include/lemon/router.h +++ b/src/cpp/include/lemon/router.h @@ -167,7 +167,7 @@ class Router { bool has_npu_server() const; WrappedServer* find_npu_server() const; WrappedServer* find_npu_server_by_recipe(const std::string& recipe) const; - WrappedServer* find_flm_server_by_type(ModelType type) const; + WrappedServer* find_coexisting_server_by_type(ModelType type) const; void evict_all_npu_servers(); void evict_server(WrappedServer* server, int timeout_seconds = -1); void evict_all_servers(); diff --git a/src/cpp/include/lemon/server.h b/src/cpp/include/lemon/server.h index d481f5b80..3e39357d3 100644 --- a/src/cpp/include/lemon/server.h +++ b/src/cpp/include/lemon/server.h @@ -73,6 +73,7 @@ class Server { // Unified config endpoints void handle_config_set(const httplib::Request& req, httplib::Response& res); void handle_config_get(const httplib::Request& req, httplib::Response& res); + void handle_config_defaults_get(const httplib::Request& req, httplib::Response& res); // Side-effect callback for RuntimeConfig::set(). Receives a nested JSON // mirroring the input shape, containing only entries that actually changed. 
diff --git a/src/cpp/include/lemon/system_info.h b/src/cpp/include/lemon/system_info.h index 9b143ae47..a67c744b6 100644 --- a/src/cpp/include/lemon/system_info.h +++ b/src/cpp/include/lemon/system_info.h @@ -104,9 +104,6 @@ class SystemInfo { }; static std::vector get_all_recipe_statuses(); - static std::string get_flm_version(); - static std::string get_system_llamacpp_version(); - // Device support detection static std::string get_rocm_arch(); static std::string get_cuda_arch(); diff --git a/src/cpp/include/lemon/utils/path_utils.h b/src/cpp/include/lemon/utils/path_utils.h index 96561186c..63f142ee6 100644 --- a/src/cpp/include/lemon/utils/path_utils.h +++ b/src/cpp/include/lemon/utils/path_utils.h @@ -35,22 +35,6 @@ bool is_safe_executable_path(const std::string& path); */ bool looks_like_path(const std::string& v); -/** - * Find the FLM executable (flm.exe on Windows, flm on Unix). - * Uses SearchPathA on Windows (same API as CreateProcessA) to search PATH, - * then falls back to the default installation directory. - * @return Full path to flm executable, or empty string if not found. - */ -std::string find_flm_executable(); - -/** - * Run 'flm validate' command and check if it succeeds. - * @param flm_path Optional path to flm executable. If empty, will search for it. - * @param error_message Output parameter for error message if validation fails. - * @return true if validation succeeds, false otherwise. - */ -bool run_flm_validate(const std::string& flm_path, std::string& error_message); - /** * Get an environment variable as UTF-8 text. */ @@ -73,13 +57,6 @@ std::string path_to_utf8(const std::filesystem::path& path); */ std::string find_executable_in_path(const std::string& executable_name); -/** - * Check if the HIP plugin for GGML backends is available on the system. - * This function checks common installation paths for libggml-hip.so. - * @return true if the HIP plugin is found, false otherwise. 
- */ -bool is_ggml_hip_plugin_available(); - /** * Set the lemonade cache directory. Must be called once at startup before * get_cache_dir(). After this call, get_cache_dir() returns this path. diff --git a/src/cpp/include/lemon/wrapped_server.h b/src/cpp/include/lemon/wrapped_server.h index f3ec74da4..41e91595b 100644 --- a/src/cpp/include/lemon/wrapped_server.h +++ b/src/cpp/include/lemon/wrapped_server.h @@ -17,6 +17,7 @@ #include "model_manager.h" #include "backend_manager.h" #include "recipe_options.h" +#include "backends/backend_descriptor.h" namespace lemon { @@ -307,10 +308,46 @@ class WrappedServer : public ICompletionServer { // No-op by default } - // ICompletionServer implementation - forward requests to the wrapped server - virtual json chat_completion(const json& request) override = 0; - virtual json completion(const json& request) override = 0; - virtual json responses(const json& request) = 0; + // ICompletionServer implementation - forward requests to the wrapped server. + // Default to an "unsupported" error so non-chat backends (TTS, image, + // transcription) inherit a sensible response instead of stubbing each one. + virtual json chat_completion(const json& request) override { + return unsupported_capability_error("chat completion"); + } + virtual json completion(const json& request) override { + return unsupported_capability_error("text completion"); + } + virtual json responses(const json& request) { + return unsupported_capability_error("responses"); + } + + // Descriptor association (set by the backend registry at create() time). The + // effective_* hooks below default to the descriptor's declared values; a + // backend whose device or eviction rule depends on the chosen backend + // variant overrides them (e.g. whisper on npu vs cpu, llamacpp on cpu vs gpu). 
+ void set_descriptor(const BackendDescriptor* descriptor) { descriptor_ = descriptor; } + const BackendDescriptor* get_descriptor() const { return descriptor_; } + + // Effective accelerator device for this load. The router calls this after it + // resolves the "_backend" option but before eviction. Defaults to the + // descriptor's default_device; variant-dependent backends override. + virtual DeviceType effective_device(const RecipeOptions& options) const { + (void)options; + return descriptor_ ? descriptor_->default_device : device_type_; + } + + // Effective slot/eviction policy for this load. The router switches on this + // value to enforce NPU exclusivity and LRU slot accounting. Defaults to the + // descriptor's slot_policy; variant-dependent backends override. + virtual SlotPolicy effective_slot_policy(const RecipeOptions& options) const { + (void)options; + return descriptor_ ? descriptor_->slot_policy : SlotPolicy::Standard; + } + + // Dynamic availability check. Returns "" if the backend can run on this + // system, or a user-facing reason why it cannot. Defaults to "available"; + // backends with runtime-dependent availability (cloud) override. + virtual std::string availability() const { return ""; } // Forward streaming requests to the wrapped server (public for Router access) // Virtual so backends can transform request (e.g., FLM needs checkpoint in model field) @@ -373,6 +410,17 @@ class WrappedServer : public ICompletionServer { BackendRequestKind kind_; }; + // Standard "this backend does not serve " error payload, matching the + // shape backends return from unsupported capability methods. + json unsupported_capability_error(const std::string& what) const { + return json{{"error", { + {"message", server_name_ + " does not support " + what + + ". 
Use the appropriate endpoint for this model type instead."}, + {"type", "unsupported_operation"}, + {"code", "model_not_applicable"} + }}}; + } + static bool has_process_handle(const ProcessHandle& handle); ProcessHandle get_process_handle_snapshot() const; void set_process_handle(ProcessHandle handle); @@ -420,6 +468,7 @@ class WrappedServer : public ICompletionServer { std::string log_level_; ModelManager* model_manager_; // Non-owning pointer to ModelManager BackendManager* backend_manager_; // Non-owning pointer to BackendManager + const BackendDescriptor* descriptor_ = nullptr; // Non-owning; set by the backend registry at create() // Multi-model support fields std::string model_name_; diff --git a/src/cpp/resources/defaults.json b/src/cpp/resources/defaults.json index f79396266..ab86404dd 100644 --- a/src/cpp/resources/defaults.json +++ b/src/cpp/resources/defaults.json @@ -1,71 +1,71 @@ { + "cloud_providers": [], "config_version": 2, - "port": 13305, - "host": "localhost", - "websocket_port": "auto", - "log_level": "info", - "global_timeout": 600, - "max_loaded_models": 1, - "no_broadcast": false, - "extra_models_dir": "", - "models_dir": "auto", "ctx_size": -1, - "offline": false, - "no_fetch_executables": false, "disable_model_filtering": false, "enable_dgpu_gtt": false, - "rocm_channel": "stable", + "extra_models_dir": "", + "flm": { + "args": "" + }, + "global_timeout": 600, + "host": "localhost", + "kokoro": { + "cpu_bin": "builtin" + }, "llamacpp": { - "backend": "auto", "args": "", - "rocm_args": "", - "vulkan_args": "", + "backend": "auto", "cpu_args": "", + "cpu_bin": "builtin", + "cuda_bin": "builtin", "prefer_system": true, + "rocm_args": "", "rocm_bin": "builtin", - "vulkan_bin": "builtin", - "cuda_bin": "builtin", - "cpu_bin": "builtin" + "vulkan_args": "", + "vulkan_bin": "builtin" }, - "whispercpp": { - "backend": "auto", + "log_level": "info", + "max_loaded_models": 1, + "models_dir": "auto", + "moonshine": { "args": "", "cpu_args": "", - 
"npu_args": "", - "cpu_bin": "builtin", - "npu_bin": "builtin" + "cpu_bin": "builtin" + }, + "no_broadcast": false, + "no_fetch_executables": false, + "offline": false, + "port": 13305, + "rocm_channel": "stable", + "ryzenai": { + "server_bin": "builtin" }, "sdcpp": { - "backend": "auto", "args": "", - "cpu_args": "", - "rocm_args": "", - "vulkan_args": "", - "steps": 20, + "backend": "auto", "cfg_scale": 7.0, - "width": 512, - "height": 512, + "cpu_args": "", "cpu_bin": "builtin", + "height": 512, + "rocm_args": "", "rocm_bin": "builtin", - "vulkan_bin": "builtin" - }, - "flm": { - "args": "" + "steps": 20, + "vulkan_args": "", + "vulkan_bin": "builtin", + "width": 512 }, "vllm": { - "backend": "auto", - "args": "" - }, - "ryzenai": { - "server_bin": "builtin" - }, - "kokoro": { - "cpu_bin": "builtin" + "args": "", + "backend": "auto" }, - "moonshine": { + "websocket_port": "auto", + "whispercpp": { "args": "", + "backend": "auto", "cpu_args": "", - "cpu_bin": "builtin" - }, - "cloud_providers": [] + "cpu_bin": "builtin", + "npu_args": "", + "npu_bin": "builtin" + } } diff --git a/src/cpp/server/backend_manager.cpp b/src/cpp/server/backend_manager.cpp index 120b61428..7f5452845 100644 --- a/src/cpp/server/backend_manager.cpp +++ b/src/cpp/server/backend_manager.cpp @@ -1,5 +1,6 @@ #include "lemon/backend_manager.h" #include "lemon/backend_version_policy.h" +#include "lemon/backends/backend_descriptor_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/runtime_config.h" #include "lemon/system_info.h" @@ -36,7 +37,7 @@ std::string get_current_os() { } std::string normalize_backend_name(const std::string& recipe, const std::string& backend) { - if ((recipe == "llamacpp" || recipe == "sd-cpp") && backend == "rocm") { + if (backends::recipe_has_rocm_channels(recipe) && backend == "rocm") { // Map "rocm" to the appropriate channel based on config std::string channel = "stable"; // default to stable for now if (auto* cfg = RuntimeConfig::global()) { @@ 
-64,15 +65,6 @@ std::string get_backend_runtime_version(const json& backend_versions, return backend_versions[recipe][runtime_key].get(); } - // Only fall back to llamacpp runtime version if the recipe is llamacpp - if (recipe == "llamacpp" && - backend_versions.contains("llamacpp") && - backend_versions["llamacpp"].is_object() && - backend_versions["llamacpp"].contains(runtime_key) && - backend_versions["llamacpp"][runtime_key].is_string()) { - return backend_versions["llamacpp"][runtime_key].get(); - } - throw std::runtime_error("backend_versions.json is missing runtime version for: " + recipe + ":" + runtime_key); } @@ -484,7 +476,7 @@ void BackendManager::install_backend(const std::string& recipe, const std::strin // Do that here before inflating the install to a multi-file UX flow. const std::string os = get_current_os(); const bool is_rocm_stable_backend = - (recipe == "llamacpp" || recipe == "sd-cpp") && + backends::recipe_has_rocm_channels(recipe) && resolved_backend == "rocm-stable"; const bool therock_applicable = is_rocm_stable_backend && will_install_therock(os, backend_versions_); diff --git a/src/cpp/server/backends/backend_descriptor_registry.cpp b/src/cpp/server/backends/backend_descriptor_registry.cpp new file mode 100644 index 000000000..6d1741d87 --- /dev/null +++ b/src/cpp/server/backends/backend_descriptor_registry.cpp @@ -0,0 +1,34 @@ +#include "lemon/backends/backend_descriptor_registry.h" + +// Generated from LEMON_BACKENDS at configure time. Defines +// lemon::backends::all_generated_descriptors() (descriptor data only). 
+#include "backend_descriptors_generated.h" + +namespace lemon { +namespace backends { + +const std::vector& all_descriptors() { + static const std::vector kDescriptors = all_generated_descriptors(); + return kDescriptors; +} + +const BackendDescriptor* descriptor_for(const std::string& recipe) { + for (const BackendDescriptor* d : all_descriptors()) { + if (d->recipe == recipe) { + return d; + } + } + return nullptr; +} + +bool has_backend(const std::string& recipe) { + return descriptor_for(recipe) != nullptr; +} + +bool recipe_has_rocm_channels(const std::string& recipe) { + const BackendDescriptor* d = descriptor_for(recipe); + return d != nullptr && !d->rocm_channels.empty(); +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/backend_descriptors_generated.h.in b/src/cpp/server/backends/backend_descriptors_generated.h.in new file mode 100644 index 000000000..3f6d7ec2a --- /dev/null +++ b/src/cpp/server/backends/backend_descriptors_generated.h.in @@ -0,0 +1,19 @@ +#pragma once +// +// AUTO-GENERATED at CMake configure time from LEMON_BACKENDS in CMakeLists.txt. +// Do not edit by hand. Descriptor DATA only (CLI-safe; no server classes). +// +#include +#include "lemon/backends/backend_descriptor.h" +@LEMON_DESCRIPTOR_INCLUDES@ +namespace lemon { +namespace backends { + +inline std::vector all_generated_descriptors() { + return { +@LEMON_DESCRIPTOR_ENTRIES@ + }; +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/backend_factories_generated.h.in b/src/cpp/server/backends/backend_factories_generated.h.in new file mode 100644 index 000000000..d488ce014 --- /dev/null +++ b/src/cpp/server/backends/backend_factories_generated.h.in @@ -0,0 +1,21 @@ +#pragma once +// +// AUTO-GENERATED at CMake configure time from LEMON_BACKENDS in CMakeLists.txt. +// Do not edit by hand. 
Binds each descriptor to its server class's create() +// (server-only: pulls in server classes, compiled into lemond not the CLI). +// +#include +#include "lemon/backends/backend_registry.h" +@LEMON_DESCRIPTOR_INCLUDES@ +@LEMON_FACTORY_INCLUDES@ +namespace lemon { +namespace backends { + +inline std::vector generated_registrations() { + return { +@LEMON_FACTORY_ENTRIES@ + }; +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/backend_ops.cpp b/src/cpp/server/backends/backend_ops.cpp new file mode 100644 index 000000000..2f4cdf48c --- /dev/null +++ b/src/cpp/server/backends/backend_ops.cpp @@ -0,0 +1,125 @@ +#include "lemon/backends/backend_ops.h" + +#include +#include +#include "lemon/backends/hf_cache_util.h" +#include "lemon/utils/path_utils.h" + +namespace fs = std::filesystem; + +namespace lemon { +namespace backends { + +using lemon::utils::path_from_utf8; +using lemon::utils::path_to_utf8; + +// Default checkpoint resolution: the shared Hugging Face behavior. Locate the +// requested variant (or auxiliary file like mmproj) within the active snapshot, +// falling back to the main repo and finally the model cache directory. Backends +// with bespoke layouts override resolve_checkpoint_path(). +std::string BackendOps::resolve_checkpoint_path(const ModelInfo& info, + const CheckpointResolveContext& ctx) const { + (void)info; + + // NPU side-cache checkpoints have no resolvable local file here (the backend + // that uses them resolves them itself at load time). + if (ctx.type == "npu_cache") { + return ""; + } + + fs::path model_cache_path_fs = path_from_utf8(ctx.model_cache_path); + + if (!ctx.variant.empty()) { + // Prefer refs/main for auxiliary checkpoints too (e.g. mmproj) so + // companion files stay on the active snapshot as the main model. 
+ fs::path active_snapshot = hf_cache::active_snapshot_path(model_cache_path_fs); + if (!active_snapshot.empty()) { + fs::path direct_variant_path = active_snapshot / path_from_utf8(ctx.variant); + if (hf_cache::exists(direct_variant_path)) { + return path_to_utf8(direct_variant_path); + } + std::error_code ec; + for (const auto& entry : + fs::recursive_directory_iterator(active_snapshot, hf_cache::dir_options(), ec)) { + if (ec) break; + if (entry.is_regular_file(ec)) { + if (entry.path().filename().string() == ctx.variant) { + return path_to_utf8(entry.path()); + } + } else if (entry.is_directory(ec)) { + fs::path variant_path = entry.path() / path_from_utf8(ctx.variant); + if (hf_cache::exists(variant_path)) { + return path_to_utf8(variant_path); + } + } + ec.clear(); + } + } + + // Try to find the exact variant in the cache directory's subtree. + if (hf_cache::exists(model_cache_path_fs)) { + for (const auto& entry : + fs::recursive_directory_iterator(model_cache_path_fs, hf_cache::dir_options())) { + if (entry.is_regular_file()) { + if (entry.path().filename().string() == ctx.variant) { + return path_to_utf8(entry.path()); + } + } else if (entry.is_directory()) { + fs::path variant_path = entry.path() / path_from_utf8(ctx.variant); + if (hf_cache::exists(variant_path)) { + return path_to_utf8(variant_path); + } + } + } + } + + // Backward-compat: older downloads placed all files in the main repo dir. 
+ if (ctx.repo_id != ctx.main_repo_id) { + std::string main_cache_path = + ctx.hf_cache + "/" + hf_cache::repo_id_to_cache_dir_name(ctx.main_repo_id); + fs::path main_cache_path_fs = path_from_utf8(main_cache_path); + if (fs::exists(main_cache_path_fs)) { + for (const auto& entry : fs::recursive_directory_iterator(main_cache_path_fs)) { + if (entry.is_regular_file()) { + if (entry.path().filename().string() == ctx.variant) { + return path_to_utf8(entry.path()); + } + } else if (entry.is_directory()) { + fs::path variant_path = entry.path() / path_from_utf8(ctx.variant); + if (fs::exists(variant_path)) { + return path_to_utf8(variant_path); + } + } + } + } + } + + // Variant not found — signal not downloaded. + return ""; + } + + // No variant: return the cache directory. + return ctx.model_cache_path; +} + +bool BackendOps::is_downloaded(const ModelInfo& info, const BackendOpsContext& ctx) const { + // Default: the shared HF checkpoint-completeness check. + return ctx.model_manager != nullptr && ctx.model_manager->checkpoints_complete(info); +} + +void BackendOps::download_model(const ModelInfo& info, bool do_not_upgrade, + DownloadProgressCallback progress, const BackendOpsContext& ctx) const { + // Default: the shared Hugging Face download engine. + (void)do_not_upgrade; + if (ctx.model_manager != nullptr) { + ctx.model_manager->download_from_huggingface_engine(info, progress); + } +} + +const BackendOps* default_backend_ops() { + static const BackendOps kDefault; + return &kDefault; +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/backend_registry.cpp b/src/cpp/server/backends/backend_registry.cpp new file mode 100644 index 000000000..abbeaf998 --- /dev/null +++ b/src/cpp/server/backends/backend_registry.cpp @@ -0,0 +1,49 @@ +#include "lemon/backends/backend_registry.h" +#include "lemon/wrapped_server.h" + +// Generated from LEMON_BACKENDS at configure time. 
Defines +// lemon::backends::generated_registrations(), pairing each descriptor with its +// server class's create(). +#include "backend_factories_generated.h" + +namespace lemon { +namespace backends { + +const std::vector& all_registrations() { + static const std::vector kRegistrations = generated_registrations(); + return kRegistrations; +} + +const BackendSpec* spec_for(const std::string& recipe) { + for (const auto& reg : all_registrations()) { + if (reg.descriptor->recipe == recipe) { + return reg.spec; + } + } + return nullptr; +} + +const BackendOps* ops_for(const std::string& recipe) { + for (const auto& reg : all_registrations()) { + if (reg.descriptor->recipe == recipe) { + return reg.ops; + } + } + return default_backend_ops(); +} + +std::unique_ptr create_server(const std::string& recipe, const BackendContext& ctx) { + for (const auto& reg : all_registrations()) { + if (reg.descriptor->recipe == recipe) { + std::unique_ptr server = reg.create(ctx); + if (server) { + server->set_descriptor(reg.descriptor); + } + return server; + } + } + return nullptr; +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/backend_utils.cpp b/src/cpp/server/backends/backend_utils.cpp index bbed27684..42c0d1709 100644 --- a/src/cpp/server/backends/backend_utils.cpp +++ b/src/cpp/server/backends/backend_utils.cpp @@ -2,14 +2,7 @@ #include "lemon/backends/install_staging.h" #include "lemon/runtime_config.h" #include "lemon/system_info.h" -#include "lemon/backends/llamacpp_server.h" -#include "lemon/backends/whisper_server.h" -#include "lemon/backends/sd_server.h" -#include "lemon/backends/kokoro_server.h" -#include "lemon/backends/ryzenaiserver.h" -#include "lemon/backends/vllm_server.h" -#include "lemon/backends/fastflowlm_server.h" -#include "lemon/backends/moonshine_server.h" +#include "lemon/backends/backend_registry.h" // spec_for() — descriptor->install spec, no server includes #include "lemon/model_manager.h" // For 
DownloadProgress, DownloadProgressCallback #include "lemon/utils/path_utils.h" @@ -41,15 +34,9 @@ using json = nlohmann::json; namespace lemon::backends { const BackendSpec* try_get_spec_for_recipe(const std::string& recipe) { - if (recipe == "llamacpp") return &LlamaCppServer::SPEC; - if (recipe == "whispercpp") return &WhisperServer::SPEC; - if (recipe == "sd-cpp") return &SDServer::SPEC; - if (recipe == "kokoro") return &KokoroServer::SPEC; - if (recipe == "ryzenai-llm") return &::lemon::RyzenAIServer::SPEC; - if (recipe == "vllm") return &VLLMServer::SPEC; - if (recipe == "flm") return &FastFlowLMServer::SPEC; - if (recipe == "moonshine") return &MoonshineServer::SPEC; - return nullptr; + // Each backend exposes its install/download spec through the registry + // (see ::spec()); no per-recipe branches or server includes here. + return spec_for(recipe); } static std::string hash_string_from_json(const json& node) { @@ -315,8 +302,8 @@ namespace lemon::backends { std::string& out_section, std::string& out_bin_key) { std::string config_backend = backend; - if ((recipe == "llamacpp" || recipe == "sd-cpp") && - (backend == "rocm-stable" || backend == "rocm-nightly")) { + if ((recipe_has_rocm_channels(recipe) && + (backend == "rocm-stable" || backend == "rocm-nightly"))) { config_backend = "rocm"; } out_section = RuntimeConfig::recipe_to_config_section(recipe); @@ -369,7 +356,7 @@ namespace lemon::backends { // Resolve "rocm" to actual channel for backends that support ROCm channels std::string resolved_backend = backend; - if ((spec.recipe == "llamacpp" || spec.recipe == "sd-cpp") && backend == "rocm") { + if (recipe_has_rocm_channels(spec.recipe) && backend == "rocm") { std::string channel = "stable"; // default to stable if (auto* cfg = RuntimeConfig::global()) { channel = cfg->rocm_channel_for_recipe(spec.recipe); @@ -409,7 +396,7 @@ namespace lemon::backends { // directory or ROCm backends remain stuck in update_required after a // successful install. 
std::string resolved_backend = backend; - if ((spec.recipe == "llamacpp" || spec.recipe == "sd-cpp") && backend == "rocm") { + if (recipe_has_rocm_channels(spec.recipe) && backend == "rocm") { std::string channel = "stable"; if (auto* cfg = RuntimeConfig::global()) { channel = cfg->rocm_channel_for_recipe(spec.recipe); @@ -423,7 +410,7 @@ namespace lemon::backends { std::string BackendUtils::get_backend_version(const std::string& recipe, const std::string& backend) { std::string resolved_backend = backend; - if ((recipe == "llamacpp" || recipe == "sd-cpp") && backend == "rocm") { + if (recipe_has_rocm_channels(recipe) && backend == "rocm") { // Map "rocm" to the appropriate channel based on config std::string channel = "stable"; // default to stable for now if (auto* cfg = RuntimeConfig::global()) { diff --git a/src/cpp/server/backends/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp similarity index 90% rename from src/cpp/server/backends/cloud_server.cpp rename to src/cpp/server/backends/cloud/cloud_server.cpp index b29fee4cc..3c61c213b 100644 --- a/src/cpp/server/backends/cloud_server.cpp +++ b/src/cpp/server/backends/cloud/cloud_server.cpp @@ -1,4 +1,6 @@ -#include "lemon/backends/cloud_server.h" +#include "lemon/backends/cloud/cloud_server.h" +#include "lemon/backends/backend_registry.h" +#include "lemon/model_manager.h" #include "lemon/cloud_provider_registry.h" #include "lemon/error_types.h" #include "lemon/runtime_config.h" @@ -830,3 +832,81 @@ std::vector CloudServer::discover_models(const std::string& provider, } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace cloud { + +std::unique_ptr create(const BackendContext& ctx) { + return std::make_unique( + ctx.model_info->cloud_provider, ctx.log_level, + ctx.model_manager, ctx.backend_manager, ctx.cloud_registry); +} + + +namespace { +class CloudOps : public BackendOps { +public: + std::string resolve_checkpoint_path(const ModelInfo&, + const 
CheckpointResolveContext&) const override { + // Cloud-offloaded models have no local artifacts; the checkpoint is the + // upstream provider's model id, used directly when forwarding requests. + return ""; + } + + // Cloud models have no local artifacts — always "downloaded". + bool is_downloaded(const ModelInfo&, const BackendOpsContext&) const override { + return true; + } + + // "Downloading" a cloud model is a no-op. + void download_model(const ModelInfo&, bool, DownloadProgressCallback, + const BackendOpsContext&) const override {} + + // Discover models from each installed cloud provider with a resolvable + // credential. Per AGENTS.md invariant #11 the registry persists only + // {provider, base_url}; keys come from env vars / process memory. Failures + // are logged, never propagated, so one offline provider can't block discovery. + std::vector discover_models(const BackendOpsContext& ctx) const override { + std::vector out; + if (ctx.cloud_registry == nullptr) { + return out; + } + for (const auto& rec : ctx.cloud_registry->list_installed()) { + const std::string api_key = ctx.cloud_registry->resolve_key(rec.name); + if (api_key.empty() || rec.base_url.empty()) { + LOG(INFO, "CloudOps") << "Skipping cloud discovery for '" << rec.name + << "': no API key resolvable (set " + << CloudProviderRegistry::env_var_name(rec.name) + << " or POST /v1/cloud/auth)" << std::endl; + continue; + } + // Don't send the API key to a plaintext http:// endpoint unless the + // provider explicitly opted in (AGENTS.md invariant #11). 
+ if (CloudProviderRegistry::is_http_base_url(rec.base_url) && !rec.allow_insecure_http) { + LOG(WARNING, "CloudOps") << "Skipping cloud discovery for '" << rec.name + << "': http:// with API key requires allow_insecure_http=true" + << std::endl; + continue; + } + try { + for (auto& m : CloudServer::discover_models(rec.name, api_key, rec.base_url)) { + if (m.recipe == "cloud" && !m.model_name.empty()) { + out.push_back(std::move(m)); + } + } + } catch (const std::exception& e) { + LOG(WARNING, "CloudOps") << "Cloud discovery threw for '" << rec.name + << "': " << e.what() << std::endl; + } + } + return out; + } +}; +} // namespace + +const BackendSpec* spec() { return nullptr; } +const BackendOps* ops() { return single_ops(); } +} // namespace cloud +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp new file mode 100644 index 000000000..83d2080bc --- /dev/null +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp @@ -0,0 +1,753 @@ +#include "lemon/backends/fastflowlm/fastflowlm_models.h" + +#include +#include +#include +#include "lemon/model_manager.h" +#include "lemon/utils/aixlog.hpp" +#include "lemon/utils/json_utils.h" +#include "lemon/utils/path_utils.h" +#include +#include +#include +#include "lemon/backends/backend_descriptor_registry.h" +#include "lemon/backends/backend_registry.h" +#include "lemon/backends/backend_utils.h" +#include "lemon/system_info.h" +#include "lemon/utils/process_manager.h" + +namespace fs = std::filesystem; +using json = nlohmann::json; + +namespace lemon { +namespace backends { +namespace fastflowlm { +namespace { + +using lemon::utils::path_from_utf8; +using lemon::utils::path_to_utf8; + +bool safe_exists(const fs::path& p) { + std::error_code ec; + return fs::exists(p, ec); +} + +// Candidate roots that FLM may use to store models. 
FLM resolves its model +// directory from the FLM_MODEL_PATH env var (set by the installer) and falls +// back to platform-default locations. +std::vector get_flm_models_dir_candidates() { + std::vector roots; + + const char* flm_model_path = std::getenv("FLM_MODEL_PATH"); + if (flm_model_path && *flm_model_path) { + roots.push_back(path_from_utf8(flm_model_path) / "models"); + } + +#ifdef _WIN32 + const char* userprofile = std::getenv("USERPROFILE"); + if (userprofile && *userprofile) { + fs::path home = path_from_utf8(userprofile); + roots.push_back(home / ".flm" / "models"); // current installer default + roots.push_back(home / "Documents" / "flm" / "models"); // legacy installer default + roots.push_back(home / "flm" / "models"); + } +#else + const char* xdg_config_home = std::getenv("XDG_CONFIG_HOME"); + if (xdg_config_home && *xdg_config_home) { + roots.push_back(path_from_utf8(xdg_config_home) / "flm" / "models"); + } + const char* home = std::getenv("HOME"); + if (home && *home) { + fs::path home_path = path_from_utf8(home); + roots.push_back(home_path / ".flm" / "models"); + roots.push_back(home_path / ".config" / "flm" / "models"); + } +#endif + + return roots; +} + +} // namespace + +fs::path find_flm_config_path_from_repo_dir(const std::string& repo_dir) { + if (repo_dir.empty()) return fs::path(); + + for (const auto& root : get_flm_models_dir_candidates()) { + fs::path candidate = root / repo_dir / "config.json"; + if (safe_exists(candidate)) return candidate; + } + return fs::path(); +} + +std::string repo_dir_from_url(const std::string& url) { + std::string clean = url; + while (!clean.empty() && clean.back() == '/') clean.pop_back(); + size_t query_pos = clean.find_first_of("?#"); + if (query_pos != std::string::npos) clean = clean.substr(0, query_pos); + + for (const std::string marker : {"/tree/", "/resolve/"}) { + size_t marker_pos = clean.find(marker); + if (marker_pos != std::string::npos) { + clean = clean.substr(0, marker_pos); + break; + } 
+ } + + size_t slash = clean.find_last_of('/'); + return slash == std::string::npos ? clean : clean.substr(slash + 1); +} + +int64_t read_flm_max_context_window(const ModelInfo& info) { + if (info.type != ModelType::LLM) return 0; + + std::string config_path = info.resolved_path("config"); + if (config_path.empty()) return 0; + + try { + json config = lemon::utils::JsonUtils::load_from_file(config_path); + if (config.contains("max_position_embeddings") && config["max_position_embeddings"].is_number_integer()) { + int64_t value = config["max_position_embeddings"].get(); + return value > 0 ? value : 0; + } + if (config.contains("text_config") && config["text_config"].is_object()) { + const auto& text_config = config["text_config"]; + if (text_config.contains("max_position_embeddings") && text_config["max_position_embeddings"].is_number_integer()) { + int64_t value = text_config["max_position_embeddings"].get(); + return value > 0 ? value : 0; + } + } + } catch (const std::exception& e) { + LOG(DEBUG, "FastFlowLM") << "Could not read FLM config metadata for " + << info.model_name << ": " << e.what() << std::endl; + } + return 0; +} + +std::string find_flm_binary() { + try { + const backends::BackendSpec* spec = try_get_spec_for_recipe("flm"); + if (!spec) { + return ""; + } + return BackendUtils::get_backend_binary_path(*spec, "npu"); + } catch (...) 
{ +#ifndef _WIN32 + return find_flm_executable(); +#else + return ""; +#endif + } +} + +std::vector flm_installed_checkpoints() { + std::vector installed_models; + + std::string flm_path = find_flm_binary(); + if (flm_path.empty()) return installed_models; + + // Run 'flm list --filter installed --quiet --json' to get only installed models + std::string output; +#ifdef _WIN32 + std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>NUL"; + int rc = lemon::utils::ProcessManager::run_command(command, output); +#else + std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>/dev/null"; + FILE* pipe = popen(command.c_str(), "r"); + if (!pipe) { + return installed_models; + } + + char buffer[256]; + while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { + output += buffer; + } + + pclose(pipe); +#endif + + // Parse output: { "models": [ { "name": "modelname:tag", ... }, ... ] } + try { + json j = lemon::utils::JsonUtils::parse(output); + if (j.contains("models") && j["models"].is_array()) { + for (const auto& model : j["models"]) { + if (model.contains("name") && model["name"].is_string()) { + installed_models.push_back(model["name"].get()); + } + } + return installed_models; + } + } catch (...) 
{ + // Fallback to legacy parsing if JSON parsing fails + } + + // Legacy parsing - cleaner format without emojis + // Expected format: + // Models: + // - modelname:tag + // - another:model + std::istringstream stream(output); + std::string line; + while (std::getline(stream, line)) { + // Trim whitespace + line.erase(0, line.find_first_not_of(" \t\r\n")); + line.erase(line.find_last_not_of(" \t\r\n") + 1); + + // Skip the "Models:" header line or empty lines + if (line == "Models:" || line.empty()) { + continue; + } + + // Parse model checkpoint (format: " - modelname:tag") + if (line.find("- ") == 0) { + std::string checkpoint = line.substr(2); + // Trim any remaining whitespace + checkpoint.erase(0, checkpoint.find_first_not_of(" \t")); + checkpoint.erase(checkpoint.find_last_not_of(" \t") + 1); + if (!checkpoint.empty()) { + installed_models.push_back(checkpoint); + } + } + } + + return installed_models; +} + +std::vector flm_discover_models() { + std::vector flm_models; + if (!SystemInfoCache::get_flm_status().is_ready()) { + return flm_models; + } + + std::string flm_path = find_flm_binary(); + if (flm_path.empty()) return flm_models; + + LOG(INFO, "ModelManager") << "FLM binary found at: " << flm_path << std::endl; + + // Run 'flm list --json' to get all available models + std::string output; +#ifdef _WIN32 + std::string command = "\"" + flm_path + "\" list --json"; + int rc = lemon::utils::ProcessManager::run_command(command, output); + LOG(INFO, "ModelManager") << "flm list --json exit code: " << rc + << ", output length: " << output.size() << std::endl; + if (rc != 0 || output.empty()) { + LOG(WARNING, "ModelManager") << "flm list --json failed or returned empty. 
" + << "Output: " << output.substr(0, 200) << std::endl; + } +#else + std::string command = "\"" + flm_path + "\" list --json 2>/dev/null"; + FILE* pipe = popen(command.c_str(), "r"); + if (!pipe) { + return flm_models; + } + + char buffer[256]; + while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { + output += buffer; + } + + pclose(pipe); +#endif + + // Parse output: { "models": [ { "name": "modelname:tag", "footprint": 1.23, ... }, ... ] } + try { + json j = lemon::utils::JsonUtils::parse(output); + if (j.contains("models") && j["models"].is_array()) { + for (const auto& m : j["models"]) { + if (m.contains("name") && m["name"].is_string()) { + std::string checkpoint = m["name"].get(); + + // Format display name: replace : with -, append -FLM + // e.g., "llama3.2:1b" -> "llama3.2-1b-FLM" + std::string display_name = checkpoint; + // Replace : with - + std::replace(display_name.begin(), display_name.end(), ':', '-'); + + std::string model_name = display_name + "-FLM"; + + ModelInfo info; + info.model_name = model_name; + info.checkpoints["main"] = checkpoint; + info.recipe = "flm"; + info.suggested = true; // All official FLM models are suggested + info.downloaded = lemon::utils::JsonUtils::get_or_default(m, "installed", false); + + if (lemon::utils::JsonUtils::get_or_default(m, "installed", false) && m.contains("url") && m["url"].is_string()) { + fs::path config_path = backends::fastflowlm::find_flm_config_path_from_repo_dir( + backends::fastflowlm::repo_dir_from_url(m["url"].get())); + if (!config_path.empty()) { + info.resolved_paths["config"] = path_to_utf8(config_path); + } + } + + // Size in GB (footprint field contains disk size in GB) + if (m.contains("footprint") && m["footprint"].is_number()) { + info.size = m["footprint"].get(); + } + + // Labels from FLM metadata + if (m.contains("label") && m["label"].is_array()) { + for (const auto& l : m["label"]) { + if (l.is_string()) { + info.labels.push_back(l.get()); + } + } + } + + // Populate type and 
device fields (multi-model support) + info.type = get_model_type_from_labels(info.labels); + const BackendDescriptor* flm_desc = descriptor_for("flm"); + info.device = flm_desc ? flm_desc->default_device : DEVICE_NPU; + + flm_models.push_back(info); + } + } + } + } catch (const std::exception& e) { + LOG(WARNING, "ModelManager") << "FLM model discovery failed: " << e.what() << std::endl; + } catch (...) { + LOG(WARNING, "ModelManager") << "FLM model discovery failed with unknown error" << std::endl; + } + + return flm_models; +} + + +void flm_download(const std::string& checkpoint, bool do_not_upgrade, + DownloadProgressCallback progress_callback) { + LOG(INFO, "ModelManager") << "Pulling FLM model: " << checkpoint << std::endl; + + // Ensure FLM is ready (single source of truth) + auto status = SystemInfoCache::get_flm_status(); + if (!status.is_ready()) { + throw std::runtime_error(status.error_string()); + } + + std::string flm_path = find_flm_binary(); + if (flm_path.empty()) { + throw std::runtime_error("FLM executable not found"); + } + + // Prepare arguments + std::vector args = {"pull", checkpoint}; + if (!do_not_upgrade) { + args.push_back("--force"); + } + + LOG(INFO, "ProcessManager") << "Starting process: \"" << flm_path << "\""; + for (const auto& arg : args) { + LOG(INFO, "ProcessManager") << " \"" << arg << "\""; + } + LOG(INFO, "ProcessManager") << std::endl; + + // State for parsing FLM output + int total_files = 0; + int current_file_index = 0; + std::string current_filename; + bool cancelled = false; + + // Run flm pull command and parse output + int exit_code = lemon::utils::ProcessManager::run_process_with_output( + flm_path, args, + [&](const std::string& line) -> bool { + // Always print the line to console + LOG(INFO, "FLM") << line << std::endl; + + // Parse FLM output to extract progress information + // Pattern: "[FLM] Downloading X/Y: filename" + if (line.find("[FLM] Downloading ") != std::string::npos && + line.find("/") != 
std::string::npos && + line.find(":") != std::string::npos) { + + // Extract "X/Y: filename" from "[FLM] Downloading X/Y: filename" + size_t start = line.find("Downloading ") + 12; + size_t slash = line.find("/", start); + size_t colon = line.find(":", slash); + + if (slash != std::string::npos && colon != std::string::npos) { + try { + current_file_index = std::stoi(line.substr(start, slash - start)); + total_files = std::stoi(line.substr(slash + 1, colon - slash - 1)); + current_filename = line.substr(colon + 2); // Skip ": " + + // Send progress update + if (progress_callback) { + DownloadProgress progress; + progress.file = current_filename; + progress.file_index = current_file_index; + progress.total_files = total_files; + progress.bytes_downloaded = 0; + progress.bytes_total = 0; + progress.percent = (total_files > 0) ? + ((current_file_index - 1) * 100 / total_files) : 0; + + if (!progress_callback(progress)) { + cancelled = true; + return false; // Kill the process + } + } + } catch (...) 
{ + // Ignore parse errors + } + } + } + // Pattern: "[FLM] Downloading: XX.X% (XXX.XMB / XXX.XMB)" + else if (line.find("[FLM] Downloading: ") != std::string::npos && + line.find("%") != std::string::npos) { + + // Extract percentage and bytes + size_t start = line.find("Downloading: ") + 13; + size_t pct_end = line.find("%", start); + + if (pct_end != std::string::npos) { + try { + std::string pct_str = line.substr(start, pct_end - start); + double file_percent = std::stod(pct_str); + + // Try to extract bytes (XXX.XMB / XXX.XMB) + size_t open_paren = line.find("(", pct_end); + size_t slash = line.find("/", open_paren); + size_t close_paren = line.find(")", slash); + + size_t bytes_downloaded = 0; + size_t bytes_total = 0; + + if (open_paren != std::string::npos && slash != std::string::npos) { + std::string downloaded_str = line.substr(open_paren + 1, slash - open_paren - 1); + std::string total_str = line.substr(slash + 1, close_paren - slash - 1); + + // Parse "XXX.XMB" format + auto parse_size = [](const std::string& s) -> size_t { + double val = 0; + size_t mb_pos = s.find("MB"); + size_t gb_pos = s.find("GB"); + size_t kb_pos = s.find("KB"); + + if (mb_pos != std::string::npos) { + val = std::stod(s.substr(0, mb_pos)); + return static_cast(val * 1024 * 1024); + } else if (gb_pos != std::string::npos) { + val = std::stod(s.substr(0, gb_pos)); + return static_cast(val * 1024 * 1024 * 1024); + } else if (kb_pos != std::string::npos) { + val = std::stod(s.substr(0, kb_pos)); + return static_cast(val * 1024); + } + return 0; + }; + + bytes_downloaded = parse_size(downloaded_str); + bytes_total = parse_size(total_str); + } + + // Send progress update with byte-level info + if (progress_callback) { + DownloadProgress progress; + progress.file = current_filename; + progress.file_index = current_file_index; + progress.total_files = total_files; + progress.bytes_downloaded = bytes_downloaded; + progress.bytes_total = bytes_total; + // Use intra-file percent when we 
have byte-level progress + progress.percent = static_cast(file_percent); + + if (!progress_callback(progress)) { + cancelled = true; + return false; // Kill the process + } + } + } catch (...) { + // Ignore parse errors + } + } + } + // Pattern: "[FLM] Overall progress: XX.X% (X/Y files)" + else if (line.find("[FLM] Overall progress: ") != std::string::npos) { + size_t start = line.find("progress: ") + 10; + size_t pct_end = line.find("%", start); + + if (pct_end != std::string::npos) { + try { + int overall_percent = static_cast(std::stod(line.substr(start, pct_end - start))); + + if (progress_callback) { + DownloadProgress progress; + progress.file = current_filename; + progress.file_index = current_file_index; + progress.total_files = total_files; + progress.bytes_downloaded = 0; // Not available for overall progress + progress.bytes_total = 0; + progress.percent = overall_percent; + + if (!progress_callback(progress)) { + cancelled = true; + return false; // Kill the process + } + } + } catch (...) { + // Ignore parse errors + } + } + } + // Pattern: "[FLM] Missing files (N):" + else if (line.find("[FLM] Missing files (") != std::string::npos) { + size_t start = line.find("(") + 1; + size_t end = line.find(")", start); + if (end != std::string::npos) { + try { + total_files = std::stoi(line.substr(start, end - start)); + } catch (...) 
{ + // Ignore parse errors + } + } + } + + return true; // Continue + }, + "", // Working directory + 3600 // 1 hour timeout for large model downloads + ); + + if (cancelled) { + LOG(INFO, "ModelManager") << "FLM download cancelled by client" << std::endl; + throw std::runtime_error("Download cancelled"); + } + + if (exit_code != 0) { + LOG(ERROR, "ModelManager") << "FLM pull failed with exit code: " << exit_code << std::endl; + throw std::runtime_error("FLM pull failed with exit code: " + std::to_string(exit_code)); + } + + // Send completion event + if (progress_callback) { + DownloadProgress progress; + progress.complete = true; + progress.file_index = total_files; + progress.total_files = total_files; + progress.percent = 100; + (void)progress_callback(progress); // Ignore return - download already complete + } + + LOG(INFO, "ModelManager") << "FLM model pull completed successfully" << std::endl; +} + + +std::string flm_version() { + // Cache real version strings to avoid spawning the subprocess twice per + // build_recipes_info() pass. "unknown" is NOT cached so that post-install + // verification in fastflowlm_server.cpp gets a fresh result after FLM is installed. 
+ static std::string cached_version; + if (!cached_version.empty()) { + return cached_version; + } + + // Find the flm executable using shared utility + std::string flm_path = find_flm_executable(); + if (flm_path.empty() || !lemon::utils::is_safe_executable_path(flm_path)) { + return "unknown"; + } + + std::string output; + #ifdef _WIN32 + std::string command = "\"" + flm_path + "\" version --json 2>NUL"; + int rc = lemon::utils::ProcessManager::run_command(command, output); + #else + std::string command = "\"" + flm_path + "\" version --json 2>/dev/null"; + FILE* pipe = popen(command.c_str(), "r"); + if (!pipe) { + return "unknown"; + } + + char buffer[256]; + while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { + output += buffer; + } + + pclose(pipe); + #endif + + // Parse JSON output: { "version": "0.9.34" } + try { + json j = lemon::utils::JsonUtils::parse(output); + if (j.contains("version") && j["version"].is_string()) { + std::string version = j["version"].get(); + // If the version doesn't start with 'v', prepend it + // for backend_versions.json compatibility (e.g. "v0.9.34"). + if (!version.empty() && version[0] != 'v') { + version = "v" + version; + } + cached_version = version; + return cached_version; + } + } catch (...) { + // Fallback to legacy parsing if JSON parsing fails + } + + // Legacy parsing from output like "FLM v0.9.4" + if (output.find("FLM v") != std::string::npos) { + size_t pos = output.find("FLM v"); + // Keep the 'v' prefix so it matches backend_versions.json (e.g. "v0.9.34"). + std::string version = output.substr(pos + 4); + // Trim whitespace and newlines + size_t end = version.find_first_of(" \t\n\r"); + if (end != std::string::npos) { + version = version.substr(0, end); + } + cached_version = version; + return cached_version; + } + + return "unknown"; +} + + +std::string find_flm_executable() { +#ifdef _WIN32 + // On Windows, only check the Lemonade install directory (auto-installed zip). 
+ // No system PATH fallback - FLM should be installed via install_backend(). + std::string install_dir = (fs::path(lemon::utils::get_downloaded_bin_dir()) / "flm" / "npu").make_preferred().string(); + if (fs::exists(install_dir)) { + for (const auto& entry : fs::recursive_directory_iterator(install_dir)) { + if (entry.is_regular_file() && entry.path().filename().string() == "flm.exe") { + std::string path = entry.path().string(); + if (lemon::utils::is_safe_executable_path(path)) { + return path; + } + } + } + } + return ""; +#else + // Walk PATH directly — minimal Fedora/openSUSE containers do not ship `which`. + if (!lemon::utils::find_executable_in_path("flm").empty()) { + return "flm"; + } + return ""; +#endif +} + +bool run_flm_validate(const std::string& flm_path, std::string& error_message) { + std::string flm_exe = flm_path.empty() ? find_flm_executable() : flm_path; + if (flm_exe.empty()) { + error_message = "FLM executable not found"; + return false; + } + if (!lemon::utils::is_safe_executable_path(flm_exe)) { + error_message = "FLM path contains invalid characters"; + return false; + } + + std::string command = "\"" + flm_exe + "\" validate --json"; + std::string output; + int exit_code; +#ifdef _WIN32 + exit_code = lemon::utils::ProcessManager::run_command(command, output); +#else + FILE* pipe = popen(command.c_str(), "r"); + if (!pipe) { + error_message = "Failed to execute " + flm_exe; + return false; + } + + char buffer[1024]; + while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { + output += buffer; + } + + exit_code = pclose(pipe); + if (exit_code != -1) { + exit_code = WEXITSTATUS(exit_code); + } +#endif + + try { + if (!output.empty()) { + json j = lemon::utils::JsonUtils::parse(output); + if (j.is_object()) { + // Check for overall status + bool validation_ok = false; + if (j.contains("ready")) { + validation_ok = j["ready"].get(); + } + + if (validation_ok) { + error_message.clear(); + return true; + } + + std::vector errors; + + if 
(j.contains("amd_device_found") && !j["amd_device_found"].get()) { + errors.push_back("No AMD NPU device found."); + } + + if (j.contains("all_fw_ok") && !j["all_fw_ok"].get()) { + errors.push_back("NPU firmware is incompatible."); + } + if (j.contains("kernel_ok") && !j["kernel_ok"].get()) { + errors.push_back("Kernel version is incompatible."); + } + + if (j.contains("memlock_ok") && !j["memlock_ok"].get()) { + errors.push_back("Memlock limits are too low."); + } + + if (j.contains("npu_driver_ok") && !j["npu_driver_ok"].get()) { + errors.push_back("NPU driver version is too old."); + } + + if (errors.empty()) { + error_message = "NPU validation failed."; + } else { + error_message = ""; + for (size_t i = 0; i < errors.size(); ++i) { + error_message += errors[i] + (i == errors.size() - 1 ? "" : " "); + } + } + return false; + } + } + } catch (...) { + // Fallback for non-JSON output or parsing error + } + + if (exit_code != 0) { + error_message = "flm validate failed with exit code " + std::to_string(exit_code); + return false; + } + + error_message.clear(); + return true; +} + + +void flm_remove(const std::string& checkpoint) { + if (checkpoint.empty()) { + throw std::runtime_error("FLM model has empty checkpoint field, cannot delete"); + } + std::string flm_path = find_flm_binary(); + if (flm_path.empty()) { + throw std::runtime_error("FLM executable not found"); + } + std::vector args = {"remove", checkpoint}; + auto handle = lemon::utils::ProcessManager::start_process(flm_path, args, "", false); + + int timeout_seconds = 60; + for (int i = 0; i < timeout_seconds * 10; ++i) { + if (!lemon::utils::ProcessManager::is_running(handle)) { + int exit_code = lemon::utils::ProcessManager::get_exit_code(handle); + if (exit_code != 0) { + throw std::runtime_error("FLM remove failed for " + checkpoint + + " (exit code " + std::to_string(exit_code) + ")"); + } + return; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + 
lemon::utils::ProcessManager::stop_process(handle); + throw std::runtime_error("FLM remove timed out for " + checkpoint); +} + +} // namespace fastflowlm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp similarity index 80% rename from src/cpp/server/backends/fastflowlm_server.cpp rename to src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index dc38928e3..050b5a961 100644 --- a/src/cpp/server/backends/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -1,5 +1,10 @@ -#include "lemon/backends/fastflowlm_server.h" +#include "lemon/backends/fastflowlm/fastflowlm_server.h" +#include "lemon/backends/fastflowlm/fastflowlm.h" +#include "lemon/backends/fastflowlm/fastflowlm_models.h" +#include "lemon/backends/backend_registry.h" +#include "lemon/backends/backend_ops.h" #include "lemon/backends/backend_utils.h" +#include "lemon/model_manager.h" #include "lemon/system_info.h" #include "lemon/error_types.h" #include "lemon/utils/process_manager.h" @@ -9,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -160,13 +166,13 @@ void FastFlowLMServer::load(const std::string& model_name, #ifdef _WIN32 // On Windows, auto-install FLM binary if needed (downloads zip and extracts) - backend_manager_->install_backend(SPEC.recipe, "npu"); + backend_manager_->install_backend(fastflowlm::spec()->recipe, "npu"); #endif // Validate NPU hardware/drivers std::string flm_path = get_flm_path(); std::string validate_error; - if (!utils::run_flm_validate(flm_path, validate_error)) { + if (!fastflowlm::run_flm_validate(flm_path, validate_error)) { throw std::runtime_error("FLM NPU validation failed: " + validate_error + "\nVisit " + DRIVER_INSTALL_URL + " for driver installation instructions."); } @@ -444,7 +450,7 @@ std::string FastFlowLMServer::get_flm_path() { #ifdef _WIN32 // On Windows, use the standard install 
directory (auto-installed zip) try { - std::string path = BackendUtils::get_backend_binary_path(SPEC, "npu"); + std::string path = BackendUtils::get_backend_binary_path(*fastflowlm::spec(), "npu"); LOG(INFO, "FastFlowLM") << "Found flm at: " << path << std::endl; return path; } catch (const std::exception& e) { @@ -453,7 +459,7 @@ std::string FastFlowLMServer::get_flm_path() { } #else // On Linux, FLM is installed as a system package (in PATH) - std::string flm_path = utils::find_flm_executable(); + std::string flm_path = fastflowlm::find_flm_executable(); if (!flm_path.empty()) { LOG(INFO, "FastFlowLM") << "Found flm at: " << flm_path << std::endl; } else { @@ -465,3 +471,101 @@ std::string FastFlowLMServer::get_flm_path() { } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace fastflowlm { + +std::unique_ptr create(const BackendContext& ctx) { + return make_server(ctx); +} + +namespace { +// FLM model-management behavior: max context window from the model's config.json. +class FlmOps : public BackendOps { +public: + void populate_metadata(ModelInfo& info, const BackendOpsContext&) const override { + info.max_context_window = read_flm_max_context_window(info); + } + + std::string resolve_checkpoint_path(const ModelInfo&, + const CheckpointResolveContext& ctx) const override { + // FLM uses the checkpoint string as-is (e.g. "gemma3:4b"); no local file. 
+ return ctx.checkpoint; + } + + std::vector discover_models(const BackendOpsContext&) const override { + return flm_discover_models(); + } + + bool is_downloaded(const ModelInfo& info, const BackendOpsContext&) const override { + const auto installed = flm_installed_checkpoints(); + return std::find(installed.begin(), installed.end(), info.checkpoint()) != installed.end(); + } + + void download_model(const ModelInfo& info, bool do_not_upgrade, DownloadProgressCallback progress, + const BackendOpsContext&) const override { + flm_download(info.checkpoint(), do_not_upgrade, progress); + } + + bool invalidates_cache_after_download() const override { return true; } + + std::string resolve_version(const std::string&, const std::string& file_version) const override { + // On Linux FLM is a system package with no version.txt; query the CLI. + if (file_version.empty() || file_version == "unknown") { + return flm_version(); + } + return file_version; + } + + InstallCheck check_install(const std::string&, bool binary_found) const override { + // On Linux FLM is a system package on PATH, not in the managed install dir. + if (!binary_found && !find_flm_executable().empty()) { + return {true, ""}; + } + return {binary_found, ""}; + } + + std::optional classify_unavailable( + const std::string&, const std::string& install_error, + const std::string& default_install_command) const override { + // FLM needs richer state to guide users through manual setup (installing + // the .deb, xrt drivers, etc.) rather than an automatic backend install. 
+ bool is_not_installed = install_error.empty() + || install_error.find("not installed") != std::string::npos + || install_error.find("not found") != std::string::npos; + bool is_version_mismatch = install_error.find("requires") != std::string::npos; + + UnavailableState s; + if (is_not_installed) { + s.state = "installable"; + } else if (is_version_mismatch) { + s.state = "update_required"; + } else { + s.state = "action_required"; + } + s.message = install_error; + s.attach_installed_version = !is_not_installed; + +#ifdef __linux__ + (void)default_install_command; + s.action = "Visit https://lemonade-server.ai/flm_npu_linux.html?mode=troubleshoot"; +#elif defined(_WIN32) + if (!is_not_installed && !is_version_mismatch) { + s.action = "Visit https://lemonade-server.ai/driver_install.html"; + } else { + s.action = default_install_command; + } +#else + s.action = default_install_command; +#endif + return s; + } +}; +} // namespace + +const BackendSpec* spec() { return make_spec(descriptor); } +const BackendOps* ops() { return single_ops(); } +} // namespace fastflowlm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/hf_cache_util.cpp b/src/cpp/server/backends/hf_cache_util.cpp new file mode 100644 index 000000000..028b25ee4 --- /dev/null +++ b/src/cpp/server/backends/hf_cache_util.cpp @@ -0,0 +1,72 @@ +#include "lemon/backends/hf_cache_util.h" + +#include + +#ifdef _WIN32 +#include +#endif + +namespace fs = std::filesystem; + +namespace lemon { +namespace backends { +namespace hf_cache { + +bool exists(const fs::path& p) { +#ifdef _WIN32 + // The HF cache uses symlinks for dedup; MSVC's std::filesystem refuses + // "untrusted" reparse points when the token lacks symlink privilege, so use + // the Win32 API which has no such restriction. 
/// Builds the cache directory name for a repository id: the prefix "models--"
/// followed by the id with every '/' replaced by "--"
/// (e.g. "org/repo" -> "models--org--repo").
std::string repo_id_to_cache_dir_name(const std::string& repo_id) {
    std::string dir_name("models--");
    for (const char ch : repo_id) {
        if (ch == '/') {
            dir_name.append("--");
        } else {
            dir_name.push_back(ch);
        }
    }
    return dir_name;
}
+namespace lemon { +namespace backends { +namespace kokoro { + +std::unique_ptr create(const BackendContext& ctx) { + return make_server(ctx); +} + + +namespace { +class KokoroOps : public BackendOps { +public: + std::string resolve_checkpoint_path(const ModelInfo&, + const CheckpointResolveContext& ctx) const override { + // Kokoro models are a directory; resolve to the index.json file inside. + std::filesystem::path dir = lemon::utils::path_from_utf8(ctx.model_cache_path); + if (hf_cache::exists(dir)) { + for (const auto& entry : + std::filesystem::recursive_directory_iterator(dir, hf_cache::dir_options())) { + if (entry.is_regular_file() && entry.path().filename() == "index.json") { + return lemon::utils::path_to_utf8(entry.path()); + } + } + } + return ctx.model_cache_path; // directory even if index not found + } +}; +} // namespace + +const BackendSpec* spec() { return make_spec(descriptor); } +const BackendOps* ops() { return single_ops(); } +} // namespace kokoro +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp new file mode 100644 index 000000000..81cc1c555 --- /dev/null +++ b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp @@ -0,0 +1,250 @@ +#include "lemon/backends/llamacpp/llamacpp_gguf.h" + +#include +#include +#include +#include +#include +#include "lemon/backends/hf_cache_util.h" +#include "lemon/hf_variants.h" +#include "lemon/utils/aixlog.hpp" +#include "lemon/utils/path_utils.h" + +namespace fs = std::filesystem; + +namespace lemon { +namespace backends { +namespace llamacpp { +namespace { + +using lemon::utils::path_from_utf8; +using lemon::utils::path_to_utf8; + +std::string to_lower(std::string s) { + std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return std::tolower(c); }); + return s; +} + +} // namespace + +std::string resolve_gguf_path(const std::string& model_cache_path, const std::string& variant) { + 
fs::path model_cache_path_fs = path_from_utf8(model_cache_path); + if (!hf_cache::exists(model_cache_path_fs)) { + return model_cache_path; // Return directory path even if not found + } + + // Collect the (sorted, mmproj-excluded) GGUF files under a search root. + auto collect_gguf_files = [](const fs::path& search_root) { + std::vector files; + if (search_root.empty() || !hf_cache::exists(search_root)) { + return files; + } + + std::error_code ec; + for (const auto& entry : fs::recursive_directory_iterator(search_root, hf_cache::dir_options(), ec)) { + if (ec) break; + if (!entry.is_regular_file(ec)) { + ec.clear(); + continue; + } + + std::string filename = entry.path().filename().string(); + std::string filename_lower = filename; + std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower); + + if (filename.find(".gguf") != std::string::npos && filename_lower.find("mmproj") == std::string::npos) { + files.push_back(path_to_utf8(entry.path())); + } + } + // Sort for consistent ordering (important for sharded models) and so the + // active/whole-cache sets compare equal when they hold the same files. + std::sort(files.begin(), files.end()); + return files; + }; + + const std::string variant_lower = to_lower(variant); + + // Resolve the requested GGUF variant within a candidate list of files. + // Returns the matched absolute path, or "" if this candidate set does not + // contain the variant. Factored into a lambda so the search can be retried + // against a broader set of snapshots (see #2300 below) without duplicating + // the matching logic. 
+ auto resolve_gguf_variant = [&](const std::vector& gguf_files) -> std::string { + if (gguf_files.empty()) { + return ""; + } + + // Case 0: Wildcard (*) - return first file (llama-server auto-loads shards) + if (variant == "*") { + return gguf_files[0]; + } + + // Case 1: Empty variant - return first file + if (variant.empty()) { + return gguf_files[0]; + } + + // Case 2: Exact filename match (variant ends with .gguf) + if (variant.find(".gguf") != std::string::npos) { + for (const auto& filepath : gguf_files) { + if (path_from_utf8(filepath).filename().string() == variant) { + return filepath; + } + } + return ""; // Exact variant not found in this candidate set + } + + // Case 3: Files ending with {variant}.gguf (case insensitive) + const std::string suffix = variant_lower + ".gguf"; + for (const auto& filepath : gguf_files) { + std::string filename_lower = to_lower(path_from_utf8(filepath).filename().string()); + if (filename_lower.size() >= suffix.size() && + filename_lower.substr(filename_lower.size() - suffix.size()) == suffix) { + return filepath; + } + } + + // Case 4: Folder-based sharding (files in variant/ folder) + const std::string folder_prefix_lower = variant_lower + "/"; + for (const auto& filepath : gguf_files) { + std::string relative_lower = to_lower(path_to_utf8( + path_from_utf8(filepath).lexically_relative(model_cache_path_fs))); + std::replace(relative_lower.begin(), relative_lower.end(), '\\', '/'); + if (relative_lower.find(folder_prefix_lower) != std::string::npos) { + return filepath; + } + } + + // Case 5: Local quant-token fallback. + // + // Keep the existing resolver cases above as the primary logic: exact + // filenames, suffix matches, and folder-based sharding are more + // specific and preserve the CHECKPOINT:VARIANT contract. 
+ // + // Some GGUF repositories name files with the quant token in the middle, + // for example: + // Qwen3.6-27B-MTP-IMAT-IQ4_XS-Q8nextn.gguf + // for variant: + // IQ4_XS + // That file does not end with IQ4_XS.gguf, so mirror the downloader's + // GGUF variant enumeration over the files that are already present in + // the local HF cache before declaring the model missing. + // + // HF cache paths have an extra snapshots// prefix that is not + // part of the repository-relative filename. Strip it before calling + // enumerate_gguf_variants(); otherwise the enumerator treats + // "snapshots" as a top-level sharded-folder variant and never extracts + // the quant token from the actual GGUF filename. + std::vector relative_gguf_files; + std::map absolute_by_relative; + auto repo_relative_from_cache_relative = [](std::string rel) { + std::replace(rel.begin(), rel.end(), '\\', '/'); + + static const std::string snapshots_prefix = "snapshots/"; + if (rel.rfind(snapshots_prefix, 0) == 0) { + size_t revision_end = rel.find('/', snapshots_prefix.size()); + if (revision_end != std::string::npos && revision_end + 1 < rel.size()) { + rel = rel.substr(revision_end + 1); + } + } + + return rel; + }; + + for (const auto& filepath : gguf_files) { + std::string relative_path = path_to_utf8( + path_from_utf8(filepath).lexically_relative(model_cache_path_fs)); + relative_path = repo_relative_from_cache_relative(relative_path); + + // Multiple HF snapshots can contain the same repo-relative file. + // Keep the first absolute path from the sorted file list so + // duplicates do not create false ambiguity. 
+ if (absolute_by_relative.emplace(relative_path, filepath).second) { + relative_gguf_files.push_back(relative_path); + } + } + + std::vector enumerated_matches; + auto local_variants = lemon::enumerate_gguf_variants(relative_gguf_files); + for (const auto& local_variant : local_variants.variants) { + if (to_lower(local_variant.name) != variant_lower) { + continue; + } + + auto it = absolute_by_relative.find(local_variant.primary_file); + if (it != absolute_by_relative.end()) { + enumerated_matches.push_back(it->second); + } + } + + if (enumerated_matches.size() == 1) { + LOG(INFO, "ModelManager") + << "Resolved local GGUF variant '" << variant + << "' via quant-token fallback: " << enumerated_matches[0] << std::endl; + return enumerated_matches[0]; + } + + if (enumerated_matches.size() > 1) { + LOG(WARNING, "ModelManager") + << "Multiple local GGUF files matched variant '" << variant + << "' via quant-token fallback; refusing to guess" << std::endl; + return ""; + } + + // No match in this candidate set. Do not fall back to another + // quantization in the same Hugging Face repo; otherwise a custom + // download with a different quant can make a built-in model appear + // downloaded and allow deleting the wrong file. + return ""; + }; + + // Prefer the active refs/main snapshot so that when upstream only changed + // README/metadata Lemonade keeps using the previous snapshot's artifacts. + std::vector active_gguf_files = + collect_gguf_files(hf_cache::active_snapshot_path(model_cache_path_fs)); + + // Whole-repo-cache candidates spanning every snapshot, populated on demand. 
+ std::vector all_cache_gguf_files; + bool all_cache_computed = false; + auto whole_cache_gguf_files = [&]() -> const std::vector& { + if (!all_cache_computed) { + all_cache_gguf_files = collect_gguf_files(model_cache_path_fs); + all_cache_computed = true; + } + return all_cache_gguf_files; + }; + + if (active_gguf_files.empty() && whole_cache_gguf_files().empty()) { + return model_cache_path; // Return directory if no GGUF found anywhere + } + + std::string resolved_path = resolve_gguf_variant(active_gguf_files); + + // #2300: a sibling variant that shares this HF repo can live in a snapshot + // other than the one refs/main points at. refs/main advances to the + // snapshot of whichever variant was pulled or updated last, leaving the + // other variants' symlinks behind in earlier snapshots; after a restart the + // refs/main-only search above then reports them as missing. If the active + // snapshot did not contain the requested variant, broaden the search to + // every snapshot in this repo's cache before declaring it missing. Blobs are + // content-addressed and shared, so reading an older snapshot is safe, and + // resolving against the active snapshot first preserves the CHECKPOINT:VARIANT + // contract (a different quant is never substituted while the exact one exists). + // + // The whole-cache set is a superset of the active set, so the two are equal + // only when refs/main's snapshot is the sole snapshot holding GGUFs — in + // which case the broader search is identical and skipped. 
+ if (resolved_path.empty()) { + const std::vector& all_files = whole_cache_gguf_files(); + if (all_files != active_gguf_files) { + resolved_path = resolve_gguf_variant(all_files); + } + } + + return resolved_path; +} + +} // namespace llamacpp +} // namespace backends +} // namespace lemon + diff --git a/src/cpp/server/backends/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp similarity index 75% rename from src/cpp/server/backends/llamacpp_server.cpp rename to src/cpp/server/backends/llamacpp/llamacpp_server.cpp index a8b731f63..eb766e798 100644 --- a/src/cpp/server/backends/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -1,5 +1,16 @@ -#include "lemon/backends/llamacpp_server.h" +#include "lemon/backends/llamacpp/llamacpp_server.h" +#include "lemon/backends/llamacpp/llamacpp.h" +#include "lemon/backends/llamacpp/llamacpp_gguf.h" +#include "lemon/backends/backend_registry.h" +#include "lemon/backends/backend_ops.h" #include "lemon/backends/backend_utils.h" +#include "lemon/gguf_capabilities.h" +#include "lemon/gguf_reader.h" +#include "lemon/model_manager.h" +#include +#include +#include +#include #include "lemon/auto_tune.h" #include "lemon/backend_manager.h" #include "lemon/runtime_config.h" @@ -281,12 +292,12 @@ void LlamaCppServer::load(const std::string& model_name, device_type_ = use_gpu ? DEVICE_GPU : DEVICE_CPU; // Install llama-server if needed (use per-model backend) - backend_manager_->install_backend(SPEC.recipe, llamacpp_backend); + backend_manager_->install_backend(llamacpp::spec()->recipe, llamacpp_backend); // Use pre-resolved GGUF path. Skipped for hf_load models because llama-server // sources the weights itself via -hf; those models may not have local files. 
std::string gguf_path = model_info.resolved_path(); - if (gguf_path.empty() && !model_info.hf_load) { + if (gguf_path.empty() && !model_info.extra("hf_load", false)) { throw std::runtime_error("GGUF file not found for checkpoint: " + model_info.checkpoint()); } @@ -302,7 +313,7 @@ void LlamaCppServer::load(const std::string& model_name, port_ = choose_port(); // Get executable path - std::string executable = BackendUtils::get_backend_binary_path(SPEC, llamacpp_backend); + std::string executable = BackendUtils::get_backend_binary_path(*llamacpp::spec(), llamacpp_backend); // Check for embeddings and reranking support based on model type bool supports_embeddings = (model_info.type == ModelType::EMBEDDING); @@ -323,7 +334,7 @@ void LlamaCppServer::load(const std::string& model_name, // is required for models like Qwen2.5-Omni where the manual -m + --mmproj // path rejects audio content parts in /v1/chat/completions — the -hf path // drives the dual-clip (vision+audio) context correctly. - if (model_info.hf_load) { + if (model_info.extra("hf_load", false)) { push_arg(args, reserved_flags, "-hf", model_info.checkpoint(), std::vector{"--hf-repo", "-mr", "--hf-file", "-mf"}); } else { @@ -345,7 +356,7 @@ void LlamaCppServer::load(const std::string& model_name, // Add mmproj file if present (for vision models). Skip when hf_load is set — // llama-server resolves the mmproj companion itself from the HF repo. 
- if (!mmproj_path.empty() && !model_info.hf_load) { + if (!mmproj_path.empty() && !model_info.extra("hf_load", false)) { push_arg(args, reserved_flags, "--mmproj", mmproj_path); if (!use_gpu) { LOG(DEBUG, "LlamaCpp") << "Skipping mmproj argument since GPU mode is not enabled" << std::endl; @@ -651,3 +662,207 @@ json LlamaCppServer::responses(const json& request) { } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace llamacpp { + +std::unique_ptr create(const BackendContext& ctx) { + return make_server(ctx); +} + +namespace { +std::string system_llamacpp_version() { + std::string output; + #ifdef _WIN32 + std::string command = "llama-server --version 2>NUL"; + int rc = lemon::utils::ProcessManager::run_command(command, output); + #else + FILE* pipe = popen("llama-server --version 2>/dev/null", "r"); + if (!pipe) { + return "unknown"; + } + + char buffer[256]; + if (fgets(buffer, sizeof(buffer), pipe) != nullptr) { + output = buffer; + } + + pclose(pipe); + #endif + + // Parse version from output like "version: 3432 (e2b2a632)" or "llama.cpp version b3432" + if (!output.empty()) { + // Try to find a version number + std::regex version_regex(R"(version:\s*(\d+)|version\s+b?(\d+))"); + std::smatch match; + if (std::regex_search(output, match, version_regex)) { + for (size_t i = 1; i < match.size(); ++i) { + if (match[i].matched) { + return "b" + match[i].str(); + } + } + } + return "detected"; + } + + return "unknown"; +} + + +bool is_ggml_hip_plugin_available() { +#ifdef __linux__ + // Allow distros/packagers that install outside the FHS paths below + // (e.g. NixOS, custom prefixes) to point directly at libggml-hip.so. + if (const char* env = std::getenv("LEMONADE_GGML_HIP_PATH"); env && *env) { + // Require the basename to look like the HIP plugin (libggml-hip*.so*, + // case-insensitive, versioned sonames allowed). 
This is a sanity check, + // not a security boundary: the path is not forwarded to ggml's loader, + // so we cannot verify it is actually loadable. It only guards against an + // accidental override pointing at an unrelated existing file. + std::string name = fs::path(env).filename().string(); + std::transform(name.begin(), name.end(), name.begin(), + [](unsigned char c) { return std::tolower(c); }); + const bool name_matches = name.rfind("libggml-hip", 0) == 0 && + name.find(".so") != std::string::npos; + // LEMONADE_GGML_HIP_PATH is user-controlled, so use the non-throwing + // filesystem overload: an odd or malformed path resolves to "not a + // regular file" (ec set) instead of raising a filesystem_error. + std::error_code hip_path_ec; + if (name_matches && fs::is_regular_file(env, hip_path_ec)) { + return true; + } + } + // On Linux x86_64, check common system library paths for the HIP plugin + std::vector possible_paths = { + // Debian/Ubuntu multiarch path (most common) + "/usr/lib/x86_64-linux-gnu/ggml/backends0/libggml-hip.so", + // Arch AUR path + "/usr/lib/libggml-hip.so", + // Standard Linux paths + "/usr/lib/ggml/backends0/libggml-hip.so", + "/usr/lib64/ggml/backends0/libggml-hip.so" + }; + + // Check all possible paths + for (const auto& path : possible_paths) { + if (fs::exists(path)) { + return true; + } + } +#endif + + return false; +} + + +// llamacpp model-management behavior: GGUF metadata + capability labels. 
+class LlamaCppOps : public BackendOps { +public: + void populate_metadata(ModelInfo& info, const BackendOpsContext&) const override { + const std::string gguf_path = info.resolved_path(); + if (gguf_path.size() < 5) { + return; + } + std::string ext = gguf_path.substr(gguf_path.size() - 5); + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + if (ext != ".gguf") { + return; + } + std::error_code ec; + if (!std::filesystem::exists(lemon::utils::path_from_utf8(gguf_path), ec)) { + return; + } + GgufMetadata meta; + if (!read_gguf_metadata(meta, gguf_path)) { + return; + } + info.max_context_window = meta.context_length; + info.gguf = std::move(meta); + // GGUF vision/tool metadata are LLM capabilities. Don't apply them to + // embedding/reranking models, or labels like tool-calling would + // reclassify the model away from its endpoint type. + if (info.type == ModelType::LLM) { + apply_gguf_capability_labels(info.labels, info.gguf.caps); + } + } + + std::string resolve_checkpoint_path(const ModelInfo& info, + const CheckpointResolveContext& ctx) const override { + // The main checkpoint is a GGUF file (with sharding/variant resolution); + // auxiliary checkpoints (mmproj, …) use the shared default. + if (ctx.type == "main") { + return resolve_gguf_path(ctx.model_cache_path, ctx.variant); + } + return BackendOps::resolve_checkpoint_path(info, ctx); + } + + std::string find_imported_checkpoint(const std::string& import_dir) const override { + // The primary artifact is the (non-mmproj) GGUF file. + return resolve_gguf_path(import_dir, ""); + } + + std::string validate_registration_checkpoint(const std::string& checkpoint) const override { + // A GGUF checkpoint must name its quant via CHECKPOINT:VARIANT. 
+ std::string lower = checkpoint; + std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower); + if (lower.find("gguf") != std::string::npos && + checkpoint.find(':') == std::string::npos) { + return "You are required to provide a 'variant' in the checkpoint field when " + "registering a GGUF model. The variant is provided as CHECKPOINT:VARIANT. " + "For example: Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or " + "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf"; + } + return ""; + } + + std::string validate_checkpoint_file(const std::string& resolved_path) const override { + // A .gguf file in the cache must start with the GGUF magic, else it's a + // truncated/corrupt download and the model is not really present. + std::error_code ec; + std::filesystem::path p = lemon::utils::path_from_utf8(resolved_path); + if (std::filesystem::is_directory(p, ec)) { + return ""; + } + std::string ext = resolved_path.size() >= 5 ? resolved_path.substr(resolved_path.size() - 5) : ""; + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + if (ext != ".gguf") { + return ""; + } + std::ifstream in(p, std::ios::binary); + char magic[4] = {}; + in.read(magic, sizeof(magic)); + bool ok = in.gcount() == static_cast(sizeof(magic)) && + magic[0] == 'G' && magic[1] == 'G' && magic[2] == 'U' && magic[3] == 'F'; + return ok ? "" : "Invalid GGUF cache file"; + } + + std::string resolve_version(const std::string& backend, + const std::string& file_version) const override { + // The PATH-installed "system" llama-server has no version.txt; query it. + if (backend == "system") { + return system_llamacpp_version(); + } + return file_version; + } + + InstallCheck check_install(const std::string& backend, bool binary_found) const override { + // The system llama-server also needs the ggml HIP plugin for ROCm GPU + // acceleration when an AMD GPU (KFD) is present. 
+ if (binary_found && backend == "system") { +#ifdef __linux__ + if (std::filesystem::exists("/sys/class/kfd") && !is_ggml_hip_plugin_available()) { + return {false, "HIP plugin libggml-hip.so not installed"}; + } +#endif + } + return {binary_found, ""}; + } +}; +} // namespace + +const BackendSpec* spec() { return make_spec(descriptor); } +const BackendOps* ops() { return single_ops(); } +} // namespace llamacpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp similarity index 86% rename from src/cpp/server/backends/moonshine_server.cpp rename to src/cpp/server/backends/moonshine/moonshine_server.cpp index d3729d435..bcf263d67 100644 --- a/src/cpp/server/backends/moonshine_server.cpp +++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp @@ -1,4 +1,6 @@ -#include "lemon/backends/moonshine_server.h" +#include "lemon/backends/moonshine/moonshine_server.h" +#include "lemon/backends/moonshine/moonshine.h" +#include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/backend_manager.h" #include "lemon/runtime_config.h" @@ -6,8 +8,10 @@ #include "lemon/utils/http_client.h" #include "lemon/utils/process_manager.h" #include "lemon/error_types.h" +#include #include #include +#include #include #include #include @@ -71,7 +75,7 @@ void MoonshineServer::load(const std::string& model_name, device_type_ = DEVICE_CPU; // Install moonshine-server if needed - backend_manager_->install_backend(SPEC.recipe, "cpu"); + backend_manager_->install_backend(moonshine::spec()->recipe, "cpu"); // Resolve model path from ModelManager (standard HF cache) std::string model_path = model_info.resolved_path(); @@ -83,7 +87,7 @@ void MoonshineServer::load(const std::string& model_name, // Resolve model architecture. Prefer the explicit registry field; fall back // to inferring from the checkpoint variant (onnx/tiny, onnx/small, etc.). 
- int model_arch = model_info.moonshine_arch; + int model_arch = model_info.extra("moonshine_arch", -1); if (model_arch < 0) { std::string variant = model_info.checkpoint(); std::transform(variant.begin(), variant.end(), variant.begin(), ::tolower); @@ -97,7 +101,7 @@ void MoonshineServer::load(const std::string& model_name, } // Get executable path - std::string executable = BackendUtils::get_backend_binary_path(SPEC, "cpu"); + std::string executable = BackendUtils::get_backend_binary_path(*moonshine::spec(), "cpu"); LOG(INFO, "MoonshineServer") << "Using executable: " << executable << std::endl; // moonshine-server binds three consecutive ports: HTTP, WS (+1), TCP (+2). @@ -358,3 +362,53 @@ json MoonshineServer::audio_transcriptions(const json& request) { } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace moonshine { + +std::unique_ptr create(const BackendContext& ctx) { + return make_server(ctx); +} + + +namespace { +class MoonshineOps : public BackendOps { +public: + std::optional> select_checkpoint_files( + const std::string& main_variant, const std::vector& repo_files) const override { + // A Moonshine variant names a directory (e.g. "medium-streaming-en/quantized"); + // download every file under it. 
+ std::string folder_prefix = main_variant; + if (!folder_prefix.empty() && folder_prefix.back() != '/') { + folder_prefix += "/"; + } + auto starts_with_ci = [](const std::string& s, const std::string& p) { + if (s.size() < p.size()) return false; + for (size_t i = 0; i < p.size(); ++i) { + if (std::tolower(static_cast(s[i])) != + std::tolower(static_cast(p[i]))) { + return false; + } + } + return true; + }; + std::vector files; + for (const auto& f : repo_files) { + if (starts_with_ci(f, folder_prefix)) { + files.push_back(f); + } + } + if (files.empty()) { + throw std::runtime_error("No Moonshine model files found in folder: " + main_variant); + } + return files; + } +}; +} // namespace + +const BackendSpec* spec() { return make_spec(descriptor); } +const BackendOps* ops() { return single_ops(); } +} // namespace moonshine +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/ryzenaiserver.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp similarity index 67% rename from src/cpp/server/backends/ryzenaiserver.cpp rename to src/cpp/server/backends/ryzenai/ryzenai_server.cpp index 6e250fa35..69e1eed16 100644 --- a/src/cpp/server/backends/ryzenaiserver.cpp +++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp @@ -1,4 +1,10 @@ -#include "lemon/backends/ryzenaiserver.h" +#include "lemon/backends/ryzenai/ryzenai_server.h" +#include "lemon/backends/ryzenai/ryzenai.h" +#include "lemon/backends/backend_registry.h" +#include "lemon/model_manager.h" +#include "lemon/backends/backend_ops.h" +#include "lemon/backends/hf_cache_util.h" +#include "lemon/utils/path_utils.h" #include "lemon/backends/backend_utils.h" #include "lemon/backend_manager.h" #include "lemon/utils/process_manager.h" @@ -38,7 +44,7 @@ RyzenAIServer::~RyzenAIServer() { bool RyzenAIServer::is_available() { try { - return !backends::BackendUtils::get_backend_binary_path(SPEC, "npu").empty(); + return 
!backends::BackendUtils::get_backend_binary_path(*backends::ryzenai::spec(), "npu").empty(); } catch (...) { return false; } @@ -55,7 +61,7 @@ void RyzenAIServer::load(const std::string& model_name, backend_manager_->install_backend("ryzenai-llm", "npu"); // Get the path to ryzenai-server - std::string ryzenai_server_path = backends::BackendUtils::get_backend_binary_path(SPEC, "npu"); + std::string ryzenai_server_path = backends::BackendUtils::get_backend_binary_path(*backends::ryzenai::spec(), "npu"); if (ryzenai_server_path.empty()) { throw std::runtime_error("RyzenAI-Server executable not found even after installation attempt"); } @@ -167,3 +173,54 @@ json RyzenAIServer::responses(const json& request) { } } // namespace lemon + +namespace lemon { +namespace backends { +namespace ryzenai { + +std::unique_ptr create(const BackendContext& ctx) { + // RyzenAI resolves its model path before load (set_model_path), matching the + // original router factory's special-casing. + auto server = std::make_unique<::lemon::RyzenAIServer>( + ctx.model_info->model_name, ctx.log_level == "debug", + ctx.model_manager, ctx.backend_manager); + server->set_model_path(ctx.model_info->resolved_path()); + return server; +} + + +namespace { +class RyzenAiOps : public BackendOps { +public: + std::string resolve_checkpoint_path(const ModelInfo&, + const CheckpointResolveContext& ctx) const override { + // RyzenAI models are a directory containing genai_config.json. + std::string found = find_imported_checkpoint(ctx.model_cache_path); + return found.empty() ? ctx.model_cache_path : found; // dir if not found + } + + std::string find_imported_checkpoint(const std::string& import_dir) const override { + // The primary artifact is the directory holding genai_config.json. 
+ std::filesystem::path dir = lemon::utils::path_from_utf8(import_dir); + if (hf_cache::exists(dir)) { + for (const auto& entry : + std::filesystem::recursive_directory_iterator(dir, hf_cache::dir_options())) { + if (entry.is_regular_file() && entry.path().filename() == "genai_config.json") { + return lemon::utils::path_to_utf8(entry.path().parent_path()); + } + } + } + return ""; // register the directory itself + } +}; +} // namespace + +const BackendSpec* spec() { + static const BackendSpec kSpec("ryzenai-server", descriptor.binary, + ::lemon::RyzenAIServer::get_install_params, /*split=*/false); + return &kSpec; +} +const BackendOps* ops() { return single_ops(); } +} // namespace ryzenai +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/sd_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp similarity index 97% rename from src/cpp/server/backends/sd_server.cpp rename to src/cpp/server/backends/sdcpp/sdcpp_server.cpp index 734454c36..a4b1787f9 100644 --- a/src/cpp/server/backends/sd_server.cpp +++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp @@ -1,4 +1,6 @@ -#include "lemon/backends/sd_server.h" +#include "lemon/backends/sdcpp/sdcpp_server.h" +#include "lemon/backends/sdcpp/sdcpp.h" +#include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/backend_manager.h" #include "lemon/runtime_config.h" @@ -202,7 +204,7 @@ void SDServer::load(const std::string& model_name, RuntimeConfig::validate_backend_choice("sdcpp", backend); // Update device type based on the actual backend selected. - // get_device_type_from_recipe() defaults sd-cpp to CPU, but rocm/vulkan/metal/cuda are GPU backends. + // The descriptor defaults sd-cpp to CPU; rocm/vulkan/metal/cuda variants are GPU backends. 
if (backend == "rocm" || backend == "vulkan" || backend == "metal" || backend == "cuda") { device_type_ = DEVICE_GPU; } else { @@ -210,7 +212,7 @@ void SDServer::load(const std::string& model_name, } // Install sd-server if needed - backend_manager_->install_backend(SPEC.recipe, backend); + backend_manager_->install_backend(sdcpp::spec()->recipe, backend); // Get model path std::string model_path = model_info.resolved_path("main"); @@ -232,7 +234,7 @@ void SDServer::load(const std::string& model_name, LOG(DEBUG, "SDServer") << "Using model: " << model_path << std::endl; // Get sd-server executable path - std::string exe_path = BackendUtils::get_backend_binary_path(SPEC, backend); + std::string exe_path = BackendUtils::get_backend_binary_path(*sdcpp::spec(), backend); // Choose a port port_ = choose_port(); @@ -746,3 +748,18 @@ std::string SDServer::upscale_via_cli( } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace sdcpp { + +std::unique_ptr create(const BackendContext& ctx) { + return make_server(ctx); +} + + +const BackendSpec* spec() { return make_spec(descriptor); } +const BackendOps* ops() { return default_backend_ops(); } +} // namespace sdcpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp similarity index 95% rename from src/cpp/server/backends/vllm_server.cpp rename to src/cpp/server/backends/vllm/vllm_server.cpp index 7584d56d9..60a79c95f 100644 --- a/src/cpp/server/backends/vllm_server.cpp +++ b/src/cpp/server/backends/vllm/vllm_server.cpp @@ -1,4 +1,6 @@ -#include "lemon/backends/vllm_server.h" +#include "lemon/backends/vllm/vllm_server.h" +#include "lemon/backends/vllm/vllm.h" +#include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/model_manager.h" #include "lemon/runtime_config.h" @@ -122,7 +124,7 @@ void VLLMServer::load(const std::string& model_name, 
RuntimeConfig::validate_backend_choice("vllm", vllm_backend); // Install vllm-server if needed - backend_manager_->install_backend(SPEC.recipe, vllm_backend); + backend_manager_->install_backend(vllm::spec()->recipe, vllm_backend); // vLLM uses HuggingFace model names, not local file paths. // The checkpoint field in server_models.json is the HF model ID. @@ -137,7 +139,7 @@ void VLLMServer::load(const std::string& model_name, port_ = choose_port(); // Get executable path - std::string executable = BackendUtils::get_backend_binary_path(SPEC, vllm_backend); + std::string executable = BackendUtils::get_backend_binary_path(*vllm::spec(), vllm_backend); // Build command line arguments std::vector args; @@ -311,3 +313,18 @@ void VLLMServer::forward_streaming_request(const std::string& endpoint, } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace vllm { + +std::unique_ptr create(const BackendContext& ctx) { + return make_server(ctx); +} + + +const BackendSpec* spec() { return make_spec(descriptor, /*split=*/true); } +const BackendOps* ops() { return default_backend_ops(); } +} // namespace vllm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/whisper_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp similarity index 89% rename from src/cpp/server/backends/whisper_server.cpp rename to src/cpp/server/backends/whispercpp/whispercpp_server.cpp index 9f50da020..d1222e551 100644 --- a/src/cpp/server/backends/whisper_server.cpp +++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp @@ -1,5 +1,10 @@ -#include "lemon/backends/whisper_server.h" +#include "lemon/backends/whispercpp/whispercpp_server.h" +#include "lemon/backends/whispercpp/whispercpp.h" +#include "lemon/backends/backend_registry.h" +#include "lemon/backends/backend_ops.h" #include "lemon/backends/backend_utils.h" +#include "lemon/backends/hf_cache_util.h" +#include "lemon/model_manager.h" #include 
"lemon/backend_manager.h" #include "lemon/runtime_config.h" #include "lemon/system_info.h" @@ -229,7 +234,7 @@ void WhisperServer::load(const std::string& model_name, RuntimeConfig::validate_backend_choice("whispercpp", whispercpp_backend); // Update device type based on the actual backend selected. - // get_device_type_from_recipe() defaults whispercpp to CPU, but npu/vulkan use different devices. + // The descriptor defaults whispercpp to CPU; npu/vulkan variants use different devices. if (whispercpp_backend == "npu") { device_type_ = DEVICE_NPU; } else if (whispercpp_backend == "vulkan" || whispercpp_backend == "metal") { @@ -238,7 +243,7 @@ void WhisperServer::load(const std::string& model_name, device_type_ = DEVICE_CPU; } - backend_manager_->install_backend(SPEC.recipe, whispercpp_backend); + backend_manager_->install_backend(whispercpp::spec()->recipe, whispercpp_backend); std::string model_path = model_info.resolved_path(); if (model_path.empty()) { @@ -254,7 +259,7 @@ void WhisperServer::load(const std::string& model_name, } // Get whisper-server executable path - std::string exe_path = BackendUtils::get_backend_binary_path(SPEC, whispercpp_backend); + std::string exe_path = BackendUtils::get_backend_binary_path(*whispercpp::spec(), whispercpp_backend); // Choose a port port_ = choose_port(); @@ -701,3 +706,65 @@ json WhisperServer::audio_transcriptions(const json& request) { } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace whispercpp { + +std::unique_ptr create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + + +namespace { +class WhisperOps : public BackendOps { +public: + std::string resolve_checkpoint_path(const ModelInfo& info, + const CheckpointResolveContext& ctx) const override { + // With no variant, find any .bin model file; otherwise use the shared + // default (variant/aux resolution). 
+ if (ctx.variant.empty()) { + std::filesystem::path dir = lemon::utils::path_from_utf8(ctx.model_cache_path); + if (!hf_cache::exists(dir)) { + return ctx.model_cache_path; + } + std::vector bin_files; + for (const auto& entry : + std::filesystem::recursive_directory_iterator(dir, hf_cache::dir_options())) { + if (entry.is_regular_file() && + entry.path().filename().string().find(".bin") != std::string::npos) { + bin_files.push_back(lemon::utils::path_to_utf8(entry.path())); + } + } + if (bin_files.empty()) { + return ctx.model_cache_path; + } + std::sort(bin_files.begin(), bin_files.end()); + return bin_files[0]; + } + return BackendOps::resolve_checkpoint_path(info, ctx); + } + + std::string find_imported_checkpoint(const std::string& import_dir) const override { + // The primary artifact is the .bin model file. + std::filesystem::path dir = lemon::utils::path_from_utf8(import_dir); + if (!hf_cache::exists(dir)) { + return ""; + } + for (const auto& entry : + std::filesystem::recursive_directory_iterator(dir, hf_cache::dir_options())) { + if (entry.is_regular_file() && + entry.path().filename().string().find(".bin") != std::string::npos) { + return lemon::utils::path_to_utf8(entry.path()); + } + } + return ""; + } +}; +} // namespace + +const BackendSpec* spec() { return make_spec(descriptor); } +const BackendOps* ops() { return single_ops(); } +} // namespace whispercpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/config_file.cpp b/src/cpp/server/config_file.cpp index d8f6955af..2787c0167 100644 --- a/src/cpp/server/config_file.cpp +++ b/src/cpp/server/config_file.cpp @@ -1,4 +1,5 @@ #include "lemon/config_file.h" +#include "lemon/backends/backend_descriptor_registry.h" #include "lemon/utils/json_utils.h" #include "lemon/utils/path_utils.h" @@ -27,10 +28,28 @@ static json load_json_file(const fs::path& path) { } } -json ConfigFile::get_defaults() { +json ConfigFile::base_defaults() { json defaults = 
load_json_file(utils::path_from_utf8( utils::get_resource_path("resources/defaults.json"))); + // Seed each backend's config.json section from its descriptor. The per-recipe + // defaults are authored in the backend's descriptor; resources/defaults.json + // is the generated, committed mirror (see GET /internal/config/defaults and + // docs/tools/gen_backend_boilerplate.py). Re-seeding here keeps the descriptor + // authoritative even if the committed file lags. Empty result = no section. + for (const auto* d : backends::all_descriptors()) { + json block = d->config_defaults(); + if (!block.empty()) { + defaults[d->effective_config_section()] = block; + } + } + + return defaults; +} + +json ConfigFile::get_defaults() { + json defaults = base_defaults(); + #ifndef _WIN32 fs::path distro_defaults = "/usr/share/lemonade/defaults.json"; if (fs::exists(distro_defaults)) { diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index 0d51efb2c..02679e803 100644 --- a/src/cpp/server/model_manager.cpp +++ b/src/cpp/server/model_manager.cpp @@ -1,17 +1,17 @@ #include #include #include -#include -#include #include #include #include #include #include +#include +#include #include -#include +#include +#include #include -#include #include #include #include @@ -132,116 +132,15 @@ static std::string cache_key_to_canonical_id(const std::string& cache_key) { // launched from a parent process that predates the FLM install and therefore // doesn't see FLM_MODEL_PATH, so we also probe every documented default. // Order is most-specific to most-historical. 
-static std::vector get_flm_models_dir_candidates() { - std::vector roots; - - const char* flm_model_path = std::getenv("FLM_MODEL_PATH"); - if (flm_model_path && *flm_model_path) { - roots.push_back(path_from_utf8(flm_model_path) / "models"); - } - -#ifdef _WIN32 - const char* userprofile = std::getenv("USERPROFILE"); - if (userprofile && *userprofile) { - fs::path home = path_from_utf8(userprofile); - roots.push_back(home / ".flm" / "models"); // current installer default - roots.push_back(home / "Documents" / "flm" / "models"); // legacy installer default - roots.push_back(home / "flm" / "models"); - } -#else - const char* xdg_config_home = std::getenv("XDG_CONFIG_HOME"); - if (xdg_config_home && *xdg_config_home) { - roots.push_back(path_from_utf8(xdg_config_home) / "flm" / "models"); - } - const char* home = std::getenv("HOME"); - if (home && *home) { - fs::path home_path = path_from_utf8(home); - roots.push_back(home_path / ".flm" / "models"); - roots.push_back(home_path / ".config" / "flm" / "models"); - } -#endif - - return roots; -} - -static fs::path find_flm_config_path_from_repo_dir(const std::string& repo_dir) { - if (repo_dir.empty()) return fs::path(); - - for (const auto& root : get_flm_models_dir_candidates()) { - fs::path candidate = root / repo_dir / "config.json"; - if (safe_exists(candidate)) return candidate; - } - return fs::path(); -} - -static std::string repo_dir_from_url(const std::string& url) { - std::string clean = url; - while (!clean.empty() && clean.back() == '/') clean.pop_back(); - size_t query_pos = clean.find_first_of("?#"); - if (query_pos != std::string::npos) clean = clean.substr(0, query_pos); - - for (const std::string marker : {"/tree/", "/resolve/"}) { - size_t marker_pos = clean.find(marker); - if (marker_pos != std::string::npos) { - clean = clean.substr(0, marker_pos); - break; - } - } - - size_t slash = clean.find_last_of('/'); - return slash == std::string::npos ? 
clean : clean.substr(slash + 1); -} - -static int64_t read_flm_max_context_window(const ModelInfo& info) { - if (info.type != ModelType::LLM) return 0; - - std::string config_path = info.resolved_path("config"); - if (config_path.empty()) return 0; - - try { - json config = JsonUtils::load_from_file(config_path); - if (config.contains("max_position_embeddings") && config["max_position_embeddings"].is_number_integer()) { - int64_t value = config["max_position_embeddings"].get(); - return value > 0 ? value : 0; - } - if (config.contains("text_config") && config["text_config"].is_object()) { - const auto& text_config = config["text_config"]; - if (text_config.contains("max_position_embeddings") && text_config["max_position_embeddings"].is_number_integer()) { - int64_t value = text_config["max_position_embeddings"].get(); - return value > 0 ? value : 0; - } - } - } catch (const std::exception& e) { - LOG(DEBUG, "ModelManager") << "Could not read FLM config metadata for " - << info.model_name << ": " << e.what() << std::endl; - } - return 0; -} static void populate_model_metadata(ModelInfo& info) { info.max_context_window = 0; if (!info.downloaded) return; - if (info.recipe == "llamacpp") { - std::string gguf_path = info.resolved_path(); - if (!gguf_path.empty() && gguf_reader_detail::ends_with_ignore_case(gguf_path, ".gguf") && safe_exists(path_from_utf8(gguf_path))) { - GgufMetadata meta; - if (read_gguf_metadata(meta, gguf_path)) { - info.max_context_window = meta.context_length; - info.gguf = std::move(meta); - - // GGUF vision/tool metadata are LLM capabilities. Do not apply - // them to embedding/reranking models, otherwise labels such as - // tool-calling would reclassify the model away from its endpoint - // type and break /embeddings or /rerank. 
- if (info.type == ModelType::LLM) { - apply_gguf_capability_labels(info.labels, info.gguf.caps); - } - } - } - } else if (info.recipe == "flm") { - info.max_context_window = read_flm_max_context_window(info); - } + // Per-backend metadata (GGUF arch/labels for llamacpp, config.json ctx for + // flm, …) is read by the backend's ops, not a recipe switchboard here. + backends::BackendOpsContext ctx; + backends::ops_for(info.recipe)->populate_metadata(info, ctx); } static bool is_user_model_name(const std::string& model_name) { @@ -359,6 +258,35 @@ static void parse_image_defaults(ModelInfo& info, const json& model_json) { } } +// Populate ModelInfo::extras with any model-JSON key not consumed by a typed +// ModelInfo field. This lets a new backend read custom per-model fields in load() +// without editing the shared ModelInfo struct. Keep this set in sync with the +// keys read by the parse blocks in build_cache(). +static void parse_extras(ModelInfo& info, const json& model_json) { + static const std::set kKnownKeys = { + "checkpoint", "checkpoints", "components", "mmproj", "recipe", "suggested", + "source", "size", "cloud_provider", + "labels", "image_defaults", "recipe_options" + }; + if (!model_json.is_object()) return; + for (auto& [key, value] : model_json.items()) { + if (kKnownKeys.count(key) == 0) { + info.extras[key] = value; + } + } +} + +// Default device for a recipe: the backend descriptor is authoritative for +// registered backends; collection/unknown recipes fall back to the recipe map. +// (A backend whose device depends on the chosen backend variant resolves the +// final device at load time via WrappedServer::effective_device.) +static DeviceType device_type_for_recipe(const std::string& recipe) { + if (const auto* desc = lemon::backends::descriptor_for(recipe)) { + return desc->default_device; + } + return get_device_type_from_recipe(recipe); +} + // Build merged recipe options: image_defaults -> JSON recipe_options -> user-saved overrides. 
// json_recipe_options: pre-extracted recipe_options for this model (from build_cache's // two-phase pattern). Pass a null json if the model JSON should be read directly instead. @@ -1017,7 +945,7 @@ std::map ModelManager::discover_extra_models() const { info.downloaded = true; info.source = EXTRA_MODEL_SOURCE; info.labels.push_back("custom"); - info.device = get_device_type_from_recipe(EXTRA_MODEL_RECIPE); + info.device = device_type_for_recipe(EXTRA_MODEL_RECIPE); return info; }; @@ -1135,426 +1063,38 @@ std::map ModelManager::discover_extra_models() const { } std::string ModelManager::resolve_model_path(const ModelInfo& info, const std::string& type, const std::string& checkpoint) const { - // Collections are virtual entries with no direct checkpoint to resolve + // Collections are virtual entries with no direct checkpoint to resolve. if (is_collection_recipe(info.recipe)) { return ""; } - // Cloud-offloaded models have no local artifacts; checkpoint is the - // upstream provider's model id, used directly when forwarding requests. - if (info.recipe == "cloud") { - return ""; - } - - // FLM models use checkpoint as-is (e.g., "gemma3:4b") - if (info.recipe == "flm") { - return checkpoint; - } - - // Local path models use checkpoint as-is (absolute path to file) + // Local-path models use the checkpoint as-is (absolute path to a file). if (info.source == "local_path") { return checkpoint; } std::string hf_cache = get_hf_cache_dir(); - // Local uploads: checkpoint is relative path from HF cache + // Local uploads: checkpoint is a relative path from the HF cache. 
if (info.source == "local_upload") { std::string normalized = checkpoint; std::replace(normalized.begin(), normalized.end(), '\\', '/'); return hf_cache + "/" + normalized; } - // For now, NPU cache is handled directly in whisper.cpp - if (type == "npu_cache") { - return ""; - } - - // HuggingFace models: need to find the GGUF file in cache - // Parse checkpoint to get repo_id and variant - // Use the checkpoint's own repo, falling back to main repo for backward compatibility - std::string checkpoint_repo_id = checkpoint_to_repo_id(checkpoint); - std::string main_repo_id = checkpoint_to_repo_id(info.checkpoint("main")); - std::string repo_id = checkpoint_repo_id; - std::string variant = checkpoint_to_variant(checkpoint); - - std::string model_cache_path = hf_cache + "/" + repo_id_to_cache_dir_name(repo_id); - fs::path model_cache_path_fs = path_from_utf8(model_cache_path); - - // For RyzenAI LLM models, look for genai_config.json directory - if (info.recipe == "ryzenai-llm") { - if (safe_exists(model_cache_path_fs)) { - for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) { - if (entry.is_regular_file() && entry.path().filename() == "genai_config.json") { - return path_to_utf8(entry.path().parent_path()); - } - } - } - return model_cache_path; // Return directory even if genai_config not found - } - - // For kokoro models, look for index.json directory - if (info.recipe == "kokoro") { - if (safe_exists(model_cache_path_fs)) { - for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) { - if (entry.is_regular_file() && entry.path().filename() == "index.json") { - return path_to_utf8(entry.path()); - } - } - } - - return model_cache_path; // Return directory even if index not found - } - - // For whispercpp, find the .bin model file - if (info.recipe == "whispercpp" && variant.empty()) { - // No variant specified - use fallback logic to find any .bin file - if 
(!safe_exists(model_cache_path_fs)) { - return model_cache_path; // Return directory path even if not found - } - - // Collect all .bin files - std::vector all_bin_files; - for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) { - if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - if (filename.find(".bin") != std::string::npos) { - all_bin_files.push_back(path_to_utf8(entry.path())); - } - } - } - - if (all_bin_files.empty()) { - return model_cache_path; // Return directory if no .bin found - } - - // Sort files for consistent ordering - std::sort(all_bin_files.begin(), all_bin_files.end()); - - // Return first .bin file as fallback (only when no variant specified) - return all_bin_files[0]; - } - - // For llamacpp, find the GGUF file with advanced sharded model support - if (info.recipe == "llamacpp" && type == "main") { - if (!safe_exists(model_cache_path_fs)) { - return model_cache_path; // Return directory path even if not found - } - - // Prefer the active HF snapshot recorded in refs/main. This lets - // Lemonade keep using the previous snapshot when upstream only changed - // README/metadata and the requested model artifacts are unchanged. 
- auto collect_gguf_files = [](const fs::path& search_root) { - std::vector files; - if (search_root.empty() || !safe_exists(search_root)) { - return files; - } - - std::error_code ec; - for (const auto& entry : fs::recursive_directory_iterator(search_root, safe_dir_options, ec)) { - if (ec) break; - if (!entry.is_regular_file(ec)) { - ec.clear(); - continue; - } - - std::string filename = entry.path().filename().string(); - std::string filename_lower = filename; - std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower); - - if (filename.find(".gguf") != std::string::npos && filename_lower.find("mmproj") == std::string::npos) { - files.push_back(path_to_utf8(entry.path())); - } - } - return files; - }; - - // Resolve the requested GGUF variant within a candidate list of files. - // Returns the matched absolute path, or "" if this candidate set does not - // contain the variant. Factored into a lambda so the search can be retried - // against a broader set of snapshots (see #2300 below) without duplicating - // the matching logic. 
- auto resolve_gguf_variant = - [&](const std::vector& gguf_files) -> std::string { - if (gguf_files.empty()) { - return ""; - } - - // Case 0: Wildcard (*) - return first file (llama-server will auto-load shards) - if (variant == "*") { - return gguf_files[0]; - } - - // Case 1: Empty variant - return first file - if (variant.empty()) { - return gguf_files[0]; - } - - // Case 2: Exact filename match (variant ends with .gguf) - if (variant.find(".gguf") != std::string::npos) { - for (const auto& filepath : gguf_files) { - std::string filename = path_from_utf8(filepath).filename().string(); - if (filename == variant) { - return filepath; - } - } - return ""; // Exact variant not found in this candidate set - } - - // Case 3: Files ending with {variant}.gguf (case insensitive) - std::string variant_lower = variant; - std::transform(variant_lower.begin(), variant_lower.end(), variant_lower.begin(), ::tolower); - std::string suffix = variant_lower + ".gguf"; - - std::vector matching_files; - for (const auto& filepath : gguf_files) { - std::string filename = path_from_utf8(filepath).filename().string(); - std::string filename_lower = filename; - std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower); - - if (filename_lower.size() >= suffix.size() && - filename_lower.substr(filename_lower.size() - suffix.size()) == suffix) { - matching_files.push_back(filepath); - } - } - - if (!matching_files.empty()) { - return matching_files[0]; - } - - // Case 4: Folder-based sharding (files in variant/ folder) - std::string folder_prefix_lower = variant_lower + "/"; - - for (const auto& filepath : gguf_files) { - // Get relative path from model cache path - std::string relative_path = path_to_utf8( - path_from_utf8(filepath).lexically_relative(model_cache_path_fs)); - std::string relative_lower = relative_path; - // Normalize path separators and case so folder-variant matching works cross-platform. 
- std::transform(relative_lower.begin(), relative_lower.end(), relative_lower.begin(), ::tolower); - std::replace(relative_lower.begin(), relative_lower.end(), '\\', '/'); - - if (relative_lower.find(folder_prefix_lower) != std::string::npos) { - return filepath; - } - } - - // Case 5: Local quant-token fallback. - // - // Keep the existing resolver cases above as the primary logic: exact - // filenames, suffix matches, and folder-based sharding are more - // specific and preserve the CHECKPOINT:VARIANT contract. - // - // Some GGUF repositories name files with the quant token in the middle, - // for example: - // Qwen3.6-27B-MTP-IMAT-IQ4_XS-Q8nextn.gguf - // for variant: - // IQ4_XS - // That file does not end with IQ4_XS.gguf, so mirror the downloader's - // GGUF variant enumeration over the files that are already present in - // the local HF cache before declaring the model missing. - // - // HF cache paths have an extra snapshots// prefix that is not - // part of the repository-relative filename. Strip it before calling - // enumerate_gguf_variants(); otherwise the enumerator treats - // "snapshots" as a top-level sharded-folder variant and never extracts - // the quant token from the actual GGUF filename. 
- std::vector relative_gguf_files; - std::map absolute_by_relative; - auto repo_relative_from_cache_relative = [](std::string rel) { - std::replace(rel.begin(), rel.end(), '\\', '/'); - - static const std::string snapshots_prefix = "snapshots/"; - if (rel.rfind(snapshots_prefix, 0) == 0) { - size_t revision_end = rel.find('/', snapshots_prefix.size()); - if (revision_end != std::string::npos && revision_end + 1 < rel.size()) { - rel = rel.substr(revision_end + 1); - } - } - - return rel; - }; - - for (const auto& filepath : gguf_files) { - std::string relative_path = path_to_utf8( - path_from_utf8(filepath).lexically_relative(model_cache_path_fs)); - relative_path = repo_relative_from_cache_relative(relative_path); - - // Multiple HF snapshots can contain the same repo-relative file. - // Keep the first absolute path from the sorted gguf_files list - // so duplicates do not create false ambiguity. - if (absolute_by_relative.emplace(relative_path, filepath).second) { - relative_gguf_files.push_back(relative_path); - } - } - - std::vector enumerated_matches; - auto local_variants = lemon::enumerate_gguf_variants(relative_gguf_files); - for (const auto& local_variant : local_variants.variants) { - if (gguf_reader_detail::to_lower(local_variant.name) != variant_lower) { - continue; - } - - auto it = absolute_by_relative.find(local_variant.primary_file); - if (it != absolute_by_relative.end()) { - enumerated_matches.push_back(it->second); - } - } - - if (enumerated_matches.size() == 1) { - LOG(INFO, "ModelManager") - << "Resolved local GGUF variant '" << variant - << "' via quant-token fallback: " << enumerated_matches[0] << std::endl; - return enumerated_matches[0]; - } - - if (enumerated_matches.size() > 1) { - LOG(WARNING, "ModelManager") - << "Multiple local GGUF files matched variant '" << variant - << "' via quant-token fallback; refusing to guess" << std::endl; - return ""; - } - - // No match in this candidate set. 
Do not fall back to another - // quantization in the same Hugging Face repo; otherwise a custom - // download with a different quant can make a built-in model appear - // downloaded and allow deleting the wrong file. - return ""; - }; - - // Prefer the active refs/main snapshot so that when upstream only changed - // README/metadata Lemonade keeps using the previous snapshot's artifacts. - // (Sorted for consistent ordering, important for sharded models.) - std::vector active_gguf_files = - collect_gguf_files(active_hf_snapshot_path(model_cache_path_fs)); - std::sort(active_gguf_files.begin(), active_gguf_files.end()); - - // Whole-repo-cache candidates spanning every snapshot, populated on demand. - std::vector all_cache_gguf_files; - bool all_cache_collected = false; - auto whole_cache_gguf_files = [&]() -> const std::vector& { - if (!all_cache_collected) { - all_cache_gguf_files = collect_gguf_files(model_cache_path_fs); - std::sort(all_cache_gguf_files.begin(), all_cache_gguf_files.end()); - all_cache_collected = true; - } - return all_cache_gguf_files; - }; - - if (active_gguf_files.empty() && whole_cache_gguf_files().empty()) { - return model_cache_path; // Return directory if no GGUF found anywhere - } - - std::string resolved_path = resolve_gguf_variant(active_gguf_files); - - // #2300: a sibling variant that shares this HF repo can live in a snapshot - // other than the one refs/main points at. refs/main advances to the - // snapshot of whichever variant was pulled or updated last, leaving the - // other variants' symlinks behind in earlier snapshots; after a restart the - // refs/main-only search above then reports them as missing. If the active - // snapshot did not contain the requested variant, broaden the search to - // every snapshot in this repo's cache before declaring it missing. 
Blobs are - // content-addressed and shared, so reading an older snapshot is safe, and - // resolving against the active snapshot first preserves the CHECKPOINT:VARIANT - // contract (a different quant is never substituted while the exact one exists). - // - // The whole-cache set is a superset of the active set (it recurses the repo - // cache, which contains the active snapshot dir), so the two are equal only - // when refs/main's snapshot is the sole snapshot holding GGUFs — in which case - // the broader search is identical and skipped. Comparing the (sorted) sets, - // rather than just their sizes, makes that intent explicit and stays correct - // even if that superset relationship ever changes. - // - // When more than one inactive snapshot holds the requested variant, the - // existing first-by-sorted-path dedup (see Case 5) picks one deterministically; - // every such copy is a valid GGUF of that quant, so this is safe for the - // resolve/downloaded-status purpose. Preferring the newest snapshot per variant - // would need per-variant snapshot state the HF cache does not record today and - // is left as a follow-up (out of scope for this fix). - if (resolved_path.empty()) { - const std::vector& all_files = whole_cache_gguf_files(); - if (all_files != active_gguf_files) { - resolved_path = resolve_gguf_variant(all_files); - } - } - - return resolved_path; - } - - // Everything else - if (!variant.empty()) { - // Prefer refs/main for auxiliary checkpoints too (for example mmproj), - // so companion files stay on the same active snapshot as the main model - // when unchanged artifacts are reused across README-only commits. 
- fs::path active_snapshot = active_hf_snapshot_path(model_cache_path_fs); - if (!active_snapshot.empty()) { - fs::path direct_variant_path = active_snapshot / path_from_utf8(variant); - if (safe_exists(direct_variant_path)) { - return path_to_utf8(direct_variant_path); - } - - std::error_code ec; - for (const auto& entry : fs::recursive_directory_iterator(active_snapshot, safe_dir_options, ec)) { - if (ec) break; - if (entry.is_regular_file(ec)) { - std::string filename = entry.path().filename().string(); - if (filename == variant) { - return path_to_utf8(entry.path()); - } - } else if (entry.is_directory(ec)) { - fs::path variant_path = entry.path() / path_from_utf8(variant); - if (safe_exists(variant_path)) { - return path_to_utf8(variant_path); - } - } - ec.clear(); - } - } - - // Try to find the exact variant in snapshots subdirectories - if (safe_exists(model_cache_path_fs)) { - for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) { - if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - if (filename == variant) { - return path_to_utf8(entry.path()); - } - } else if (entry.is_directory()) { - fs::path variant_path = entry.path() / path_from_utf8(variant); - if (safe_exists(variant_path)) { - return path_to_utf8(variant_path); - } - } - } - } - // Variant not found in checkpoint's own repo - try main repo as fallback - // (backward compat: older downloads placed all files in the main repo dir) - if (checkpoint_repo_id != main_repo_id) { - std::string main_cache_path = hf_cache + "/" + repo_id_to_cache_dir_name(main_repo_id); - fs::path main_cache_path_fs = path_from_utf8(main_cache_path); - if (fs::exists(main_cache_path_fs)) { - for (const auto& entry : fs::recursive_directory_iterator(main_cache_path_fs)) { - if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - if (filename == variant) { - return path_to_utf8(entry.path()); - } - } else 
if (entry.is_directory()) { - fs::path variant_path = entry.path() / path_from_utf8(variant); - if (fs::exists(variant_path)) { - return path_to_utf8(variant_path); - } - } - } - } - } - - // Variant not found - return empty string to indicate model not downloaded - return ""; - } + // Compute the HF cache location for this checkpoint's repo, then let the + // backend's ops find its artifact within (a .gguf file, a genai_config.json + // directory, a .bin, …) — no per-recipe switchboard here. + backends::CheckpointResolveContext ctx; + ctx.hf_cache = hf_cache; + ctx.repo_id = checkpoint_to_repo_id(checkpoint); + ctx.main_repo_id = checkpoint_to_repo_id(info.checkpoint("main")); + ctx.variant = checkpoint_to_variant(checkpoint); + ctx.model_cache_path = hf_cache + "/" + repo_id_to_cache_dir_name(ctx.repo_id); + ctx.type = type; + ctx.checkpoint = checkpoint; - // Fallback: return directory path - return model_cache_path; + return backends::ops_for(info.recipe)->resolve_checkpoint_path(info, ctx); } void ModelManager::resolve_all_model_paths(ModelInfo& info) { @@ -1729,21 +1269,6 @@ static bool has_partial_files(const fs::path& dir) { return false; } -static bool is_valid_gguf_file_for_cache(const std::string& path) { - std::ifstream in(path_from_utf8(path), std::ios::binary); - if (!in.is_open()) { - return false; - } - - char magic[4] = {}; - in.read(magic, sizeof(magic)); - return in.gcount() == static_cast(sizeof(magic)) && - magic[0] == 'G' && - magic[1] == 'G' && - magic[2] == 'U' && - magic[3] == 'F'; -} - static bool is_checkpoint_path_complete(const std::string& path_str) { if (path_str.empty()) return false; @@ -1779,20 +1304,26 @@ static bool are_required_checkpoints_complete(const ModelInfo& info) { return false; } - fs::path resolved = path_from_utf8(resolved_path); - if (info.recipe == "llamacpp" && - !safe_is_directory(resolved) && - gguf_reader_detail::ends_with_ignore_case(resolved_path, ".gguf") && - !is_valid_gguf_file_for_cache(resolved_path)) { + 
// Per-backend file validation (e.g. llamacpp checks GGUF magic). + std::string invalid = backends::ops_for(info.recipe)->validate_checkpoint_file(resolved_path); + if (!invalid.empty()) { LOG(WARNING, "ModelManager") - << "Invalid GGUF cache file; marking model as not downloaded: " - << resolved_path << std::endl; + << invalid << "; marking model as not downloaded: " << resolved_path << std::endl; return false; } } return true; } +bool ModelManager::checkpoints_complete(const ModelInfo& info) const { + return are_required_checkpoints_complete(info); +} + +void ModelManager::download_from_huggingface_engine(const ModelInfo& info, + DownloadProgressCallback progress_callback) { + download_from_huggingface(info, progress_callback); +} + void ModelManager::build_cache() { std::lock_guard lock(models_cache_mutex_); @@ -1816,11 +1347,9 @@ void ModelManager::build_cache() { parse_components(info, value); info.recipe = JsonUtils::get_or_default(value, "recipe", ""); info.suggested = JsonUtils::get_or_default(value, "suggested", false); - info.hf_load = JsonUtils::get_or_default(value, "hf_load", false); info.source = JsonUtils::get_or_default(value, "source", ""); info.size = JsonUtils::get_or_default(value, "size", 0.0); info.cloud_provider = JsonUtils::get_or_default(value, "cloud_provider", ""); - info.moonshine_arch = JsonUtils::get_or_default(value, "moonshine_arch", -1); // HF-backed collections store their components on Hugging Face — the // cached manifest is the single source of truth. 
Rebuild the component @@ -1842,6 +1371,7 @@ void ModelManager::build_cache() { } parse_image_defaults(info, value); + parse_extras(info, value); // Parse recipe_options if present (for per-model runtime config like sdcpp_args) if (value.contains("recipe_options") && value["recipe_options"].is_object()) { @@ -1850,7 +1380,7 @@ void ModelManager::build_cache() { // Populate type and device fields (multi-model support) info.type = get_model_type_from_labels(info.labels); - info.device = get_device_type_from_recipe(info.recipe); + info.device = device_type_for_recipe(info.recipe); try { resolve_all_model_paths(info); @@ -1870,11 +1400,9 @@ void ModelManager::build_cache() { parse_components(info, value); info.recipe = JsonUtils::get_or_default(value, "recipe", ""); info.suggested = JsonUtils::get_or_default(value, "suggested", true); - info.hf_load = JsonUtils::get_or_default(value, "hf_load", false); info.source = JsonUtils::get_or_default(value, "source", ""); info.size = JsonUtils::get_or_default(value, "size", 0.0); info.cloud_provider = JsonUtils::get_or_default(value, "cloud_provider", ""); - info.moonshine_arch = JsonUtils::get_or_default(value, "moonshine_arch", -1); // HF-backed user collections (created by `lemonade pull /`) // keep only a repo pointer in user_models.json; their components live in @@ -1895,6 +1423,7 @@ void ModelManager::build_cache() { } parse_image_defaults(info, value); + parse_extras(info, value); // Parse recipe_options if present (for per-model runtime config like sdcpp_args) if (value.contains("recipe_options") && value["recipe_options"].is_object()) { @@ -1903,7 +1432,7 @@ void ModelManager::build_cache() { // Populate type and device fields (multi-model support) info.type = get_model_type_from_labels(info.labels); - info.device = get_device_type_from_recipe(info.recipe); + info.device = device_type_for_recipe(info.recipe); try { resolve_all_model_paths(info); @@ -1928,56 +1457,20 @@ void ModelManager::build_cache() { all_models[name] 
= info; } - // Step 1.6: Discover FLM models from 'flm list --json' - // Only discover FLM models if FLM is fully installed - // Precedence: server_models.json > user_models.json > extra_models > flm_list - auto flm_status = SystemInfoCache::get_flm_status(); - if (flm_status.is_ready()) { - auto flm_available = get_flm_available_models(); - for (const auto& info : flm_available) { - // Use emplace to only add if key doesn't exist (respect precedence) - all_models.emplace(info.model_name, info); - } - } - - // Cloud-offload discovery is server-side and automatic. For each - // installed cloud provider with a resolvable credential (env var or - // runtime-auth POST), call discover_models and merge the results into - // all_models. Per AGENTS.md invariant #11, the registry persists only - // {provider, base_url} pairs — API keys live in env vars or process - // memory, never on disk. Failures are logged, never propagated, so a - // single offline provider can't block the rest of cache build. 
- if (cloud_registry_ != nullptr) { - auto installed = cloud_registry_->list_installed(); - for (const auto& rec : installed) { - const std::string api_key = cloud_registry_->resolve_key(rec.name); - if (api_key.empty() || rec.base_url.empty()) { - LOG(INFO, "ModelManager") << "Skipping cloud discovery for '" - << rec.name << "': no API key resolvable" - << " (set " << CloudProviderRegistry::env_var_name(rec.name) - << " or POST /v1/cloud/auth)" << std::endl; - continue; - } - if (CloudProviderRegistry::is_http_base_url(rec.base_url) && - !rec.allow_insecure_http) { - LOG(WARNING, "ModelManager") << "Skipping cloud discovery for '" - << rec.name << "': http:// with API key " - << "requires allow_insecure_http=true" - << std::endl; - continue; - } - std::vector discovered; - try { - discovered = backends::CloudServer::discover_models(rec.name, api_key, rec.base_url); - } catch (const std::exception& e) { - LOG(WARNING, "ModelManager") << "Cloud discovery threw for '" - << rec.name << "': " << e.what() - << std::endl; + // Step 1.6: Dynamic discovery. Backends whose models are supplied at runtime + // (descriptor dynamic_models = true — flm from `flm list`, cloud from each + // provider) contribute their models via ops->discover_models(). Each carries + // its own downloaded status. Precedence: server/user/extra models win, so we + // emplace (don't overwrite). Failures are handled inside each backend's ops. + { + backends::BackendOpsContext octx; + octx.model_manager = this; + octx.cloud_registry = cloud_registry_; + for (const auto* desc : backends::all_descriptors()) { + if (!desc->dynamic_models) { continue; } - for (auto& m : discovered) { - if (m.recipe != "cloud" || m.model_name.empty()) continue; - // Same merge precedence as FLM: emplace, don't overwrite. 
+ for (auto& m : backends::ops_for(desc->recipe)->discover_models(octx)) { all_models.emplace(m.model_name, std::move(m)); } } @@ -1994,21 +1487,21 @@ void ModelManager::build_cache() { // Step 2: Filter by backend availability all_models = filter_models_by_backend(all_models); - // Step 3: Check download status ONCE for all models - auto flm_models = get_flm_installed_models(); - std::unordered_set flm_set(flm_models.begin(), flm_models.end()); + // Step 3: Check download status for all models. Dynamic-discovery backends + // (flm, cloud) already set downloaded during discovery; everyone else asks + // its backend ops (default = shared HF completeness check). + backends::BackendOpsContext status_ctx; + status_ctx.model_manager = this; int downloaded_count = 0; // First pass: determine download status for non-collection models for (auto& [name, info] : all_models) { if (is_collection_recipe(info.recipe)) { continue; // Handled in second pass after components are resolved - } else if (info.recipe == "flm") { - info.downloaded = flm_set.count(info.checkpoint()) > 0; - } else if (info.recipe == "cloud") { - info.downloaded = true; // Cloud-offloaded models have no local artifacts - } else { - info.downloaded = are_required_checkpoints_complete(info); + } + const auto* desc = backends::descriptor_for(info.recipe); + if (!(desc && desc->dynamic_models)) { + info.downloaded = backends::ops_for(info.recipe)->is_downloaded(info, status_ctx); } if (info.downloaded) { @@ -2076,12 +1569,12 @@ void ModelManager::add_model_to_cache(const std::string& model_name) { info.cloud_provider = JsonUtils::get_or_default(*model_json, "cloud_provider", ""); parse_image_defaults(info, *model_json); + parse_extras(info, *model_json); json jro = (model_json->contains("recipe_options") && (*model_json)["recipe_options"].is_object()) ? 
(*model_json)["recipe_options"] : json(nullptr); info.recipe_options = build_recipe_options(info, jro, cache_key_to_canonical_id(model_name), recipe_options_); info.suggested = JsonUtils::get_or_default(*model_json, "suggested", is_user_model); - info.hf_load = JsonUtils::get_or_default(*model_json, "hf_load", false); info.source = JsonUtils::get_or_default(*model_json, "source", ""); if (model_json->contains("labels") && (*model_json)["labels"].is_array()) { @@ -2092,7 +1585,7 @@ void ModelManager::add_model_to_cache(const std::string& model_name) { // Populate type and device fields (multi-model support) info.type = get_model_type_from_labels(info.labels); - info.device = get_device_type_from_recipe(info.recipe); + info.device = device_type_for_recipe(info.recipe); resolve_all_model_paths(info); @@ -2105,16 +1598,14 @@ void ModelManager::add_model_to_cache(const std::string& model_name) { return; // Backend not available, don't add to cache } - // Check download status + // Check download status (collections aggregate their components; everyone + // else asks its backend ops). 
if (is_collection_recipe(info.recipe)) { info.downloaded = check_component_downloaded(info, models_cache_); - } else if (info.recipe == "flm") { - auto flm_models = get_flm_installed_models(); - info.downloaded = std::find(flm_models.begin(), flm_models.end(), info.checkpoint()) != flm_models.end(); - } else if (info.recipe == "cloud") { - info.downloaded = true; // Cloud-offloaded models have no local artifacts } else { - info.downloaded = are_required_checkpoints_complete(info); + backends::BackendOpsContext octx; + octx.model_manager = this; + info.downloaded = backends::ops_for(info.recipe)->is_downloaded(info, octx); } populate_model_metadata(info); @@ -2153,10 +1644,10 @@ void ModelManager::update_model_in_cache(const std::string& model_name, bool dow // The path changes now that files exist on disk if (downloaded) { resolve_all_model_paths(it->second); - if (it->second.recipe == "flm") { + if (backends::ops_for(it->second.recipe)->invalidates_cache_after_download()) { cache_valid_ = false; - LOG(INFO, "ModelManager") << "Invalidated model cache after FLM download for '" - << model_name << "'" << std::endl; + LOG(INFO, "ModelManager") << "Invalidated model cache after download for '" + << model_name << "' (backend rebuilds its model list)" << std::endl; return; } populate_model_metadata(it->second); @@ -2668,16 +2159,12 @@ void ModelManager::register_user_model(const std::string& model_name, // loop above; this local is just for the label inference below. std::string recipe = model_data.value("recipe", ""); - if (recipe == "sd-cpp") { - labels.insert("image"); - } - if (recipe == "whispercpp") { - labels.insert("transcription"); - labels.insert("realtime-transcription"); - } - if (recipe == "moonshine") { - labels.insert("transcription"); - labels.insert("realtime-transcription"); + // Inject the backend's default labels for models that omit them (e.g. sd-cpp + // -> image, whispercpp/moonshine -> transcription). Sourced from the descriptor. 
+ if (const auto* desc = lemon::backends::descriptor_for(recipe)) { + for (const auto& label : desc->default_labels) { + labels.insert(label); + } } model_entry["labels"] = labels; @@ -2738,188 +2225,8 @@ void ModelManager::unregister_user_model(const std::string& model_name) { cache_valid_ = false; } -// Find the FLM executable: install dir on Windows, system PATH on Linux. -// Returns empty string if not found. -static std::string find_flm_binary() { - try { - return backends::BackendUtils::get_backend_binary_path( - backends::FastFlowLMServer::SPEC, "npu"); - } catch (...) { -#ifndef _WIN32 - return utils::find_flm_executable(); -#else - return ""; -#endif - } -} - -// Helper function to get FLM installed models by calling 'flm list --filter installed --quiet' -std::vector ModelManager::get_flm_installed_models() { - std::vector installed_models; - - std::string flm_path = find_flm_binary(); - if (flm_path.empty()) return installed_models; - // Run 'flm list --filter installed --quiet --json' to get only installed models - std::string output; -#ifdef _WIN32 - std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>NUL"; - int rc = lemon::utils::ProcessManager::run_command(command, output); -#else - std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>/dev/null"; - FILE* pipe = popen(command.c_str(), "r"); - if (!pipe) { - return installed_models; - } - - char buffer[256]; - while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { - output += buffer; - } - - pclose(pipe); -#endif - - // Parse output: { "models": [ { "name": "modelname:tag", ... }, ... ] } - try { - json j = JsonUtils::parse(output); - if (j.contains("models") && j["models"].is_array()) { - for (const auto& model : j["models"]) { - if (model.contains("name") && model["name"].is_string()) { - installed_models.push_back(model["name"].get()); - } - } - return installed_models; - } - } catch (...) 
{ - // Fallback to legacy parsing if JSON parsing fails - } - - // Legacy parsing - cleaner format without emojis - // Expected format: - // Models: - // - modelname:tag - // - another:model - std::istringstream stream(output); - std::string line; - while (std::getline(stream, line)) { - // Trim whitespace - line.erase(0, line.find_first_not_of(" \t\r\n")); - line.erase(line.find_last_not_of(" \t\r\n") + 1); - - // Skip the "Models:" header line or empty lines - if (line == "Models:" || line.empty()) { - continue; - } - - // Parse model checkpoint (format: " - modelname:tag") - if (line.find("- ") == 0) { - std::string checkpoint = line.substr(2); - // Trim any remaining whitespace - checkpoint.erase(0, checkpoint.find_first_not_of(" \t")); - checkpoint.erase(checkpoint.find_last_not_of(" \t") + 1); - if (!checkpoint.empty()) { - installed_models.push_back(checkpoint); - } - } - } - - return installed_models; -} - -std::vector ModelManager::get_flm_available_models() { - std::vector flm_models; - - std::string flm_path = find_flm_binary(); - if (flm_path.empty()) return flm_models; - - LOG(INFO, "ModelManager") << "FLM binary found at: " << flm_path << std::endl; - - // Run 'flm list --json' to get all available models - std::string output; -#ifdef _WIN32 - std::string command = "\"" + flm_path + "\" list --json"; - int rc = lemon::utils::ProcessManager::run_command(command, output); - LOG(INFO, "ModelManager") << "flm list --json exit code: " << rc - << ", output length: " << output.size() << std::endl; - if (rc != 0 || output.empty()) { - LOG(WARNING, "ModelManager") << "flm list --json failed or returned empty. 
" - << "Output: " << output.substr(0, 200) << std::endl; - } -#else - std::string command = "\"" + flm_path + "\" list --json 2>/dev/null"; - FILE* pipe = popen(command.c_str(), "r"); - if (!pipe) { - return flm_models; - } - - char buffer[256]; - while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { - output += buffer; - } - - pclose(pipe); -#endif - - // Parse output: { "models": [ { "name": "modelname:tag", "footprint": 1.23, ... }, ... ] } - try { - json j = JsonUtils::parse(output); - if (j.contains("models") && j["models"].is_array()) { - for (const auto& m : j["models"]) { - if (m.contains("name") && m["name"].is_string()) { - std::string checkpoint = m["name"].get(); - - // Format display name: replace : with -, append -FLM - // e.g., "llama3.2:1b" -> "llama3.2-1b-FLM" - std::string display_name = checkpoint; - // Replace : with - - std::replace(display_name.begin(), display_name.end(), ':', '-'); - - std::string model_name = display_name + "-FLM"; - - ModelInfo info; - info.model_name = model_name; - info.checkpoints["main"] = checkpoint; - info.recipe = "flm"; - info.suggested = true; // All official FLM models are suggested - - if (JsonUtils::get_or_default(m, "installed", false) && m.contains("url") && m["url"].is_string()) { - fs::path config_path = find_flm_config_path_from_repo_dir(repo_dir_from_url(m["url"].get())); - if (!config_path.empty()) { - info.resolved_paths["config"] = path_to_utf8(config_path); - } - } - - // Size in GB (footprint field contains disk size in GB) - if (m.contains("footprint") && m["footprint"].is_number()) { - info.size = m["footprint"].get(); - } - - // Labels from FLM metadata - if (m.contains("label") && m["label"].is_array()) { - for (const auto& l : m["label"]) { - if (l.is_string()) { - info.labels.push_back(l.get()); - } - } - } - - // Populate type and device fields (multi-model support) - info.type = get_model_type_from_labels(info.labels); - info.device = get_device_type_from_recipe(info.recipe); - - 
flm_models.push_back(info); - } - } - } - } catch (const std::exception& e) { - LOG(WARNING, "ModelManager") << "FLM model discovery failed: " << e.what() << std::endl; - } catch (...) { - LOG(WARNING, "ModelManager") << "FLM model discovery failed with unknown error" << std::endl; - } - return flm_models; -} bool ModelManager::is_model_downloaded(const std::string& model_name) { // Build cache if needed @@ -2943,19 +2250,17 @@ bool ModelManager::is_model_downloaded(const std::string& model_name) { return false; } -void ModelManager::download_registered_model(const ModelInfo& info, bool do_not_upgrade, DownloadProgressCallback progress_callback) { - // Cloud models have no local artifacts; "downloading" is a no-op. - if (info.recipe == "cloud") { - update_model_in_cache(info.model_name, true); - return; - } +bool ModelManager::backend_self_manages_downloads(const std::string& recipe) const { + const auto* desc = backends::descriptor_for(recipe); + return desc && desc->self_manages_downloads; +} - // Use recipe-specific download paths - if (info.recipe == "flm") { - download_from_flm(info.checkpoint(), do_not_upgrade, progress_callback); - } else { - download_from_huggingface(info, progress_callback); - } +void ModelManager::download_registered_model(const ModelInfo& info, bool do_not_upgrade, DownloadProgressCallback progress_callback) { + // The backend's ops own the download (shared HF engine by default; flm pulls + // via the flm CLI; cloud is a no-op). 
+ backends::BackendOpsContext octx; + octx.model_manager = this; + backends::ops_for(info.recipe)->download_model(info, do_not_upgrade, progress_callback, octx); // Update cache after successful download update_model_in_cache(info.model_name, true); @@ -3349,20 +2654,11 @@ void ModelManager::download_model(const std::string& model_name, ); } - // Validate GGUF models (llamacpp recipe) require a variant - if (actual_recipe == "llamacpp") { - std::string checkpoint_lower = actual_checkpoint; - std::transform(checkpoint_lower.begin(), checkpoint_lower.end(), - checkpoint_lower.begin(), ::tolower); - if (checkpoint_lower.find("gguf") != std::string::npos && - actual_checkpoint.find(':') == std::string::npos) { - throw std::runtime_error( - "You are required to provide a 'variant' in the checkpoint field when " - "registering a GGUF model. The variant is provided as CHECKPOINT:VARIANT. " - "For example: Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or " - "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf" - ); - } + // Backend-specific checkpoint validation (llamacpp: GGUF needs :variant). + if (auto err = backends::ops_for(actual_recipe)->validate_registration_checkpoint( + actual_checkpoint); + !err.empty()) { + throw std::runtime_error(err); } LOG(INFO, "ModelManager") << "Registering new user model: " << model_name << std::endl; @@ -4185,7 +3481,11 @@ void ModelManager::download_from_huggingface(const ModelInfo& info, bool is_direct_file = ends_with(main_variant, ".safetensors") || ends_with(main_variant, ".pth") || ends_with(main_variant, ".ckpt"); - bool is_moonshine = info.recipe == "moonshine"; + + // Backends with a bespoke artifact layout (moonshine = a directory of + // files) select their own download set; nullopt = the default paths. 
+ auto backend_files = + backends::ops_for(info.recipe)->select_checkpoint_files(main_variant, repo_files); if (is_direct_file) { // For non-GGUF model files, download the specified file directly @@ -4195,22 +3495,10 @@ void ModelManager::download_from_huggingface(const ModelInfo& info, } else { throw std::runtime_error("Model file not found in repository: " + main_variant); } - } else if (is_moonshine) { - // Moonshine variant is a directory path (e.g., "medium-streaming-en/quantized") - // Download all files under that directory - std::string folder_prefix = main_variant; - if (!folder_prefix.empty() && folder_prefix.back() != '/') { - folder_prefix += "/"; - } - for (const auto& file : repo_files) { - if (gguf_reader_detail::starts_with_ignore_case(file, folder_prefix)) { - files_to_download[main_repo_id].push_back(file); - } - } - if (files_to_download[main_repo_id].empty()) { - throw std::runtime_error("No Moonshine model files found in folder: " + main_variant); - } - LOG(INFO, "ModelManager") << "Moonshine: downloading " << files_to_download[main_repo_id].size() + } else if (backend_files) { + files_to_download[main_repo_id] = std::move(*backend_files); + LOG(INFO, "ModelManager") << info.recipe << ": downloading " + << files_to_download[main_repo_id].size() << " files from " << main_variant << std::endl; } else { // GGUF model: Use identify_gguf_models to determine which files to download @@ -4440,224 +3728,6 @@ void ModelManager::download_from_huggingface(const ModelInfo& info, LOG(INFO, "ModelManager") << "Download location: " << reported_download_path << std::endl; } -void ModelManager::download_from_flm(const std::string& checkpoint, - bool do_not_upgrade, - DownloadProgressCallback progress_callback) { - LOG(INFO, "ModelManager") << "Pulling FLM model: " << checkpoint << std::endl; - - // Ensure FLM is ready (single source of truth) - auto status = SystemInfoCache::get_flm_status(); - if (!status.is_ready()) { - throw 
std::runtime_error(status.error_string()); - } - - std::string flm_path = find_flm_binary(); - if (flm_path.empty()) { - throw std::runtime_error("FLM executable not found"); - } - - // Prepare arguments - std::vector args = {"pull", checkpoint}; - if (!do_not_upgrade) { - args.push_back("--force"); - } - - LOG(INFO, "ProcessManager") << "Starting process: \"" << flm_path << "\""; - for (const auto& arg : args) { - LOG(INFO, "ProcessManager") << " \"" << arg << "\""; - } - LOG(INFO, "ProcessManager") << std::endl; - - // State for parsing FLM output - int total_files = 0; - int current_file_index = 0; - std::string current_filename; - bool cancelled = false; - - // Run flm pull command and parse output - int exit_code = utils::ProcessManager::run_process_with_output( - flm_path, args, - [&](const std::string& line) -> bool { - // Always print the line to console - LOG(INFO, "FLM") << line << std::endl; - - // Parse FLM output to extract progress information - // Pattern: "[FLM] Downloading X/Y: filename" - if (line.find("[FLM] Downloading ") != std::string::npos && - line.find("/") != std::string::npos && - line.find(":") != std::string::npos) { - - // Extract "X/Y: filename" from "[FLM] Downloading X/Y: filename" - size_t start = line.find("Downloading ") + 12; - size_t slash = line.find("/", start); - size_t colon = line.find(":", slash); - - if (slash != std::string::npos && colon != std::string::npos) { - try { - current_file_index = std::stoi(line.substr(start, slash - start)); - total_files = std::stoi(line.substr(slash + 1, colon - slash - 1)); - current_filename = line.substr(colon + 2); // Skip ": " - - // Send progress update - if (progress_callback) { - DownloadProgress progress; - progress.file = current_filename; - progress.file_index = current_file_index; - progress.total_files = total_files; - progress.bytes_downloaded = 0; - progress.bytes_total = 0; - progress.percent = (total_files > 0) ? 
- ((current_file_index - 1) * 100 / total_files) : 0; - - if (!progress_callback(progress)) { - cancelled = true; - return false; // Kill the process - } - } - } catch (...) { - // Ignore parse errors - } - } - } - // Pattern: "[FLM] Downloading: XX.X% (XXX.XMB / XXX.XMB)" - else if (line.find("[FLM] Downloading: ") != std::string::npos && - line.find("%") != std::string::npos) { - - // Extract percentage and bytes - size_t start = line.find("Downloading: ") + 13; - size_t pct_end = line.find("%", start); - - if (pct_end != std::string::npos) { - try { - std::string pct_str = line.substr(start, pct_end - start); - double file_percent = std::stod(pct_str); - - // Try to extract bytes (XXX.XMB / XXX.XMB) - size_t open_paren = line.find("(", pct_end); - size_t slash = line.find("/", open_paren); - size_t close_paren = line.find(")", slash); - - size_t bytes_downloaded = 0; - size_t bytes_total = 0; - - if (open_paren != std::string::npos && slash != std::string::npos) { - std::string downloaded_str = line.substr(open_paren + 1, slash - open_paren - 1); - std::string total_str = line.substr(slash + 1, close_paren - slash - 1); - - // Parse "XXX.XMB" format - auto parse_size = [](const std::string& s) -> size_t { - double val = 0; - size_t mb_pos = s.find("MB"); - size_t gb_pos = s.find("GB"); - size_t kb_pos = s.find("KB"); - - if (mb_pos != std::string::npos) { - val = std::stod(s.substr(0, mb_pos)); - return static_cast(val * 1024 * 1024); - } else if (gb_pos != std::string::npos) { - val = std::stod(s.substr(0, gb_pos)); - return static_cast(val * 1024 * 1024 * 1024); - } else if (kb_pos != std::string::npos) { - val = std::stod(s.substr(0, kb_pos)); - return static_cast(val * 1024); - } - return 0; - }; - - bytes_downloaded = parse_size(downloaded_str); - bytes_total = parse_size(total_str); - } - - // Send progress update with byte-level info - if (progress_callback) { - DownloadProgress progress; - progress.file = current_filename; - progress.file_index = 
current_file_index; - progress.total_files = total_files; - progress.bytes_downloaded = bytes_downloaded; - progress.bytes_total = bytes_total; - // Use intra-file percent when we have byte-level progress - progress.percent = static_cast(file_percent); - - if (!progress_callback(progress)) { - cancelled = true; - return false; // Kill the process - } - } - } catch (...) { - // Ignore parse errors - } - } - } - // Pattern: "[FLM] Overall progress: XX.X% (X/Y files)" - else if (line.find("[FLM] Overall progress: ") != std::string::npos) { - size_t start = line.find("progress: ") + 10; - size_t pct_end = line.find("%", start); - - if (pct_end != std::string::npos) { - try { - int overall_percent = static_cast(std::stod(line.substr(start, pct_end - start))); - - if (progress_callback) { - DownloadProgress progress; - progress.file = current_filename; - progress.file_index = current_file_index; - progress.total_files = total_files; - progress.bytes_downloaded = 0; // Not available for overall progress - progress.bytes_total = 0; - progress.percent = overall_percent; - - if (!progress_callback(progress)) { - cancelled = true; - return false; // Kill the process - } - } - } catch (...) { - // Ignore parse errors - } - } - } - // Pattern: "[FLM] Missing files (N):" - else if (line.find("[FLM] Missing files (") != std::string::npos) { - size_t start = line.find("(") + 1; - size_t end = line.find(")", start); - if (end != std::string::npos) { - try { - total_files = std::stoi(line.substr(start, end - start)); - } catch (...) 
{ - // Ignore parse errors - } - } - } - - return true; // Continue - }, - "", // Working directory - 3600 // 1 hour timeout for large model downloads - ); - - if (cancelled) { - LOG(INFO, "ModelManager") << "FLM download cancelled by client" << std::endl; - throw std::runtime_error("Download cancelled"); - } - - if (exit_code != 0) { - LOG(ERROR, "ModelManager") << "FLM pull failed with exit code: " << exit_code << std::endl; - throw std::runtime_error("FLM pull failed with exit code: " + std::to_string(exit_code)); - } - - // Send completion event - if (progress_callback) { - DownloadProgress progress; - progress.complete = true; - progress.file_index = total_files; - progress.total_files = total_files; - progress.percent = 100; - (void)progress_callback(progress); // Ignore return - download already complete - } - - LOG(INFO, "ModelManager") << "FLM model pull completed successfully" << std::endl; -} void ModelManager::delete_model(const std::string& model_name) { auto info = get_model_info(model_name); @@ -4673,55 +3743,9 @@ void ModelManager::delete_model(const std::string& model_name) { "Delete the file directly from: " + info.checkpoint()); } - // Handle FLM models separately + // FLM models have no local HF cache; deletion is the backend's `flm remove`. if (info.recipe == "flm") { - LOG(INFO, "ModelManager") << "Deleting FLM model: " << info.checkpoint() << std::endl; - - // Validate checkpoint is not empty - if (info.checkpoint().empty()) { - throw std::runtime_error("FLM model has empty checkpoint field, cannot delete"); - } - - // Find flm executable — on Windows flm.exe lives under the lemonade - // cache dir, not on PATH, so we must resolve the full path. 
- std::string flm_path = find_flm_binary(); - if (flm_path.empty()) { - throw std::runtime_error("FLM executable not found"); - } - - // Prepare arguments for 'flm remove' command - std::vector args = {"remove", info.checkpoint()}; - - LOG(INFO, "ProcessManager") << "Starting process: \"" << flm_path << "\""; - for (const auto& arg : args) { - LOG(INFO, "ProcessManager") << " \"" << arg << "\""; - } - LOG(INFO, "ProcessManager") << std::endl; - - // Run flm remove command - auto handle = utils::ProcessManager::start_process(flm_path, args, "", false); - - // Wait for process to complete - int timeout_seconds = 60; // 1 minute timeout for removal - for (int i = 0; i < timeout_seconds * 10; ++i) { - if (!utils::ProcessManager::is_running(handle)) { - int exit_code = utils::ProcessManager::get_exit_code(handle); - if (exit_code != 0) { - LOG(ERROR, "ModelManager") << "FLM remove failed with exit code: " << exit_code << std::endl; - throw std::runtime_error("Failed to delete FLM model " + canonical_model_name + ": FLM remove failed with exit code " + std::to_string(exit_code)); - } - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - - // Check if process is still running (timeout) - if (utils::ProcessManager::is_running(handle)) { - LOG(ERROR, "ModelManager") << "FLM remove timed out" << std::endl; - throw std::runtime_error("Failed to delete FLM model " + canonical_model_name + ": FLM remove timed out"); - } - - LOG(INFO, "ModelManager") << "Successfully deleted FLM model: " << canonical_model_name << std::endl; + backends::fastflowlm::flm_remove(info.checkpoint()); // Remove from user models if it's a user model if (is_user_model_name(canonical_model_name)) { @@ -5214,7 +4238,6 @@ ModelInfo ModelManager::get_model_info_unfiltered(const std::string& model_name) parse_components(info, *model_json); info.recipe = JsonUtils::get_or_default(*model_json, "recipe", ""); info.suggested = JsonUtils::get_or_default(*model_json, "suggested", 
false); - info.hf_load = JsonUtils::get_or_default(*model_json, "hf_load", false); info.source = JsonUtils::get_or_default(*model_json, "source", ""); // Parse labels array @@ -5233,10 +4256,7 @@ ModelInfo ModelManager::get_model_info_unfiltered(const std::string& model_name) } } - // Parse moonshine_arch - if (model_json->contains("moonshine_arch") && (*model_json)["moonshine_arch"].is_number_integer()) { - info.moonshine_arch = (*model_json)["moonshine_arch"].get(); - } + parse_extras(info, *model_json); return info; } diff --git a/src/cpp/server/ollama_api.cpp b/src/cpp/server/ollama_api.cpp index 7687caab4..0604a3935 100644 --- a/src/cpp/server/ollama_api.cpp +++ b/src/cpp/server/ollama_api.cpp @@ -238,8 +238,9 @@ void OllamaApi::auto_load_model(const std::string& model) { auto info = model_manager_->get_model_info(name); - // Download if not cached - if (info.recipe != "flm" && !model_manager_->is_model_downloaded(name)) { + // Download if not cached (backends that self-manage downloads pull on load) + if (!model_manager_->backend_self_manages_downloads(info.recipe) && + !model_manager_->is_model_downloaded(name)) { LOG(INFO, "OllamaApi") << "Model not cached, downloading..." 
<< std::endl; model_manager_->download_registered_model(info, true); info = model_manager_->get_model_info(name); diff --git a/src/cpp/server/prometheus_metrics.cpp b/src/cpp/server/prometheus_metrics.cpp index 8ecfdb288..88f7bdaf3 100644 --- a/src/cpp/server/prometheus_metrics.cpp +++ b/src/cpp/server/prometheus_metrics.cpp @@ -1,5 +1,6 @@ #include "lemon/prometheus_metrics.h" +#include "lemon/backends/backend_descriptor_registry.h" #include "lemon/version.h" #include @@ -274,7 +275,8 @@ void append_llamacpp_backend_metrics(PrometheusBuilder& metrics, const json& model, const std::map& labels, std::set& described_backend_metrics) { - if (model.value("recipe", "") != "llamacpp") { + const auto* desc = backends::descriptor_for(model.value("recipe", "")); + if (desc == nullptr || !desc->exposes_prometheus_metrics) { return; } diff --git a/src/cpp/server/recipe_options.cpp b/src/cpp/server/recipe_options.cpp index 65d4bb676..70c188e34 100644 --- a/src/cpp/server/recipe_options.cpp +++ b/src/cpp/server/recipe_options.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -12,78 +13,68 @@ namespace lemon { using json = nlohmann::json; -static const json DEFAULTS = { - {"ctx_size", -1}, // -1 triggers auto-resolution (memory + arch metadata) - {"merge_args", true}, - {"llamacpp_device", ""}, - {"llamacpp_backend", ""}, // Will be overridden dynamically - {"llamacpp_args", ""}, - {"sd-cpp_backend", ""}, // "" means auto-detect (mapped from "auto" in config.json) - {"sdcpp_args", ""}, - {"whispercpp_backend", ""}, // "" means auto-detect (mapped from "auto" in config.json) - {"whispercpp_args", ""}, - {"moonshine_args", ""}, // Custom arguments to pass to moonshine-server - // Image generation defaults (for sd-cpp recipe) - // These are recipe-level defaults only, not CLI arguments — per reviewer guidance, - // there are too many image gen params for CLI flags, and no universal defaults. 
- {"steps", 20}, - {"cfg_scale", 7.0}, - {"width", 512}, - {"height", 512}, - {"sampling_method", ""}, - {"flow_shift", 0.0}, - // vLLM-specific options - {"vllm_backend", ""}, // "" means auto-detect - {"vllm_args", ""}, // Custom arguments to pass to vllm-server - // Cloud recipe has no backend variants (provider selection lives on the - // per-model cloud_provider field). The empty string satisfies Router's - // per-backend-args lookup; cloud reads no backend-specific config. - {"cloud_backend", ""}, - - // Auto-eviction options - {"auto_evict", nullptr}, // nullptr means fallback to global config - {"evict_idle_timeout", 300}, // Default hard idle timeout (5 mins) - {"downsize_idle_timeout", 60}, // Default soft idle timeout (1 min) - {"evict_weight_factor", 1.0}, // Eviction-protection weight (higher = more protected) - {"pinned", false} -}; - - -// Mapping from flat option names to CLI flags (used by to_cli_options) -// Note: Image generation params (steps, cfg_scale, width, height, sampling_method, -// flow_shift) are recipe-level defaults only — not exposed as CLI arguments. -// Runtime options (diffusion_fa, offload_to_cpu) go through --sdcpp-args. -static const std::map OPTION_TO_CLI_FLAG = { - {"ctx_size", "--ctx-size"}, - {"merge_args", "--merge-args"}, - {"llamacpp_backend", "--llamacpp"}, - {"llamacpp_device", "--llamacpp-device"}, - {"llamacpp_args", "--llamacpp-args"}, - {"sd-cpp_backend", "--sdcpp"}, - {"sdcpp_args", "--sdcpp-args"}, - {"whispercpp_backend", "--whispercpp"}, - {"whispercpp_args", "--whispercpp-args"}, - {"moonshine_args", "--moonshine-args"}, - {"vllm_backend", "--vllm"}, - {"vllm_args", "--vllm-args"} -}; +// Options shared by every backend. Per-backend options (and ctx_size opt-in) +// come from each backend's descriptor; these are the universal kit. 
+static const json& common_defaults() { + static const json d = { + {"ctx_size", -1}, // -1 triggers auto-resolution (memory + arch metadata) + {"merge_args", true}, + // Auto-eviction options (apply to every recipe) + {"auto_evict", nullptr}, // nullptr means fallback to global config + {"evict_idle_timeout", 300}, // Default hard idle timeout (5 mins) + {"downsize_idle_timeout", 60}, // Default soft idle timeout (1 min) + {"evict_weight_factor", 1.0}, // Eviction-protection weight (higher = more protected) + {"pinned", false}, + }; + return d; +} + +// Defaults for every option: the common kit plus each backend descriptor's +// declared options. Built once from the registry so config defaults, CLI flags, +// and load-time resolution can never drift from the descriptors. +static const json& get_defaults() { + static const json defaults = [] { + json d = common_defaults(); + for (const auto* desc : lemon::backends::all_descriptors()) { + for (const auto& opt : desc->options) { + d[opt.name] = opt.default_value; + } + } + return d; + }(); + return defaults; +} + +// Flat option name -> CLI flag, for to_cli_options(). ctx_size/merge_args are +// the common flags; the rest come from descriptor options that declare a flag. 
+static const std::map& get_option_to_cli_flag() { + static const std::map mapping = [] { + std::map m{ + {"ctx_size", "--ctx-size"}, + {"merge_args", "--merge-args"}, + }; + for (const auto* desc : lemon::backends::all_descriptors()) { + for (const auto& opt : desc->options) { + if (!opt.cli_flag.empty()) { + m[opt.name] = opt.cli_flag; + } + } + } + return m; + }(); + return mapping; +} static std::vector get_keys_for_recipe(const std::string& recipe) { std::vector keys; - if (recipe == "llamacpp") { - keys = {"ctx_size", "llamacpp_device", "llamacpp_backend", "llamacpp_args", "merge_args"}; - } else if (recipe == "whispercpp") { - keys = {"whispercpp_backend", "whispercpp_args", "merge_args"}; - } else if (recipe == "moonshine") { - keys = {"moonshine_args", "merge_args"}; - } else if (recipe == "flm") { - return {"ctx_size", "merge_args"}; - } else if (recipe == "ryzenai-llm") { - keys = {"ctx_size"}; - } else if (recipe == "sd-cpp") { - keys = {"sd-cpp_backend", "sdcpp_args", "steps", "cfg_scale", "width", "height", "sampling_method", "flow_shift", "merge_args"}; - } else if (recipe == "vllm") { - keys = {"ctx_size", "vllm_backend", "vllm_args", "merge_args"}; + if (const auto* desc = lemon::backends::descriptor_for(recipe)) { + if (desc->uses_ctx_size) { + keys.push_back("ctx_size"); + } + for (const auto& opt : desc->options) { + keys.push_back(opt.name); + } + keys.push_back("merge_args"); } // Add auto-eviction options for all recipes @@ -125,7 +116,7 @@ static bool try_get_backend_options(const std::string& opt_name, SystemInfo::Sup std::vector RecipeOptions::to_cli_options(const json& raw_options) { std::vector cli; - for (auto& [opt_name, cli_flag] : OPTION_TO_CLI_FLAG) { + for (auto& [opt_name, cli_flag] : get_option_to_cli_flag()) { if (raw_options.contains(opt_name)) { auto val = raw_options[opt_name]; if (!val.is_null() && val != "") { @@ -146,7 +137,7 @@ std::vector RecipeOptions::to_cli_options(const json& raw_options) std::vector 
RecipeOptions::known_keys() { std::vector keys; - for (auto& [key, value] : DEFAULTS.items()) { + for (auto& [key, value] : get_defaults().items()) { keys.push_back(key); } return keys; @@ -239,7 +230,7 @@ json RecipeOptions::get_option(const std::string& opt) const { } } #endif - return DEFAULTS.contains(opt) ? DEFAULTS[opt] : json(); + return get_defaults().contains(opt) ? get_defaults()[opt] : json(); } void RecipeOptions::set_option(const std::string& opt, const json& value) { @@ -247,29 +238,38 @@ void RecipeOptions::set_option(const std::string& opt, const json& value) { } #ifdef LEMONADE_CLI -// CLI_OPTIONS used only by the lemonade CLI client for add_cli_options -static const json CLI_OPTIONS = { - {"--ctx-size", {{"option_name", "ctx_size"}, {"type_name", "SIZE"}, {"help", "Context size for the model"}, {"group", "General Options"}}}, - {"--merge-args", {{"option_name", "merge_args"}, {"type_name", "BOOL"}, {"help", "Merge global and model arguments when loading the model"}, {"group", "General Options"}}}, - {"--llamacpp", {{"option_name", "llamacpp_backend"}, {"type_name", "BACKEND"}, {"help", "LlamaCpp backend to use"}, {"group", "Llama.cpp Backend Options"}}}, - {"--llamacpp-device", {{"option_name", "llamacpp_device"}, {"type_name", "DEVICES"}, {"help", "Comma-separated list of accelerator devices to use (e.g. 
Vulkan0)"}, {"group", "Llama.cpp Backend Options"}}}, - {"--llamacpp-args", {{"option_name", "llamacpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to llama-server"}, {"group", "Llama.cpp Backend Options"}}}, - {"--sdcpp", {{"option_name", "sd-cpp_backend"}, {"type_name", "BACKEND"}, {"help", "SD.cpp backend to use"}, {"group", "Stable Diffusion Options"}}}, - {"--sdcpp-args", {{"option_name", "sdcpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to sd-server (must not conflict with managed args)"}, {"group", "Stable Diffusion Options"}}}, - {"--whispercpp", {{"option_name", "whispercpp_backend"}, {"type_name", "BACKEND"}, {"help", "WhisperCpp backend to use"}, {"group", "Whisper.cpp Options"}}}, - {"--whispercpp-args", {{"option_name", "whispercpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to whisper-server"}, {"group", "Whisper.cpp Options"}}}, - {"--moonshine-args", {{"option_name", "moonshine_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to moonshine-server"}}}, - {"--vllm", {{"option_name", "vllm_backend"}, {"type_name", "BACKEND"}, {"help", "vLLM backend to use"}, {"group", "vLLM Options"}}}, - {"--vllm-args", {{"option_name", "vllm_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to vllm-server"}, {"group", "vLLM Options"}}}, - // Note: Image gen params (--steps, --cfg-scale, --width, --height) removed — recipe-level only. - // Runtime options (--diffusion-fa, --offload-to-cpu) go through --sdcpp-args. -}; +// CLI_OPTIONS used only by the lemonade CLI client for add_cli_options. +// ctx_size/merge_args are the common flags; everything else is derived from +// descriptor options that declare a CLI flag, so the CLI never needs editing +// when a backend is added. Image-gen params (steps/cfg_scale/width/height) have +// no cli_flag in their descriptor, so they stay recipe-level only as before. 
+static const json& get_cli_options() { + static const json cli_options = [] { + json o = json::object(); + o["--ctx-size"] = {{"option_name", "ctx_size"}, {"type_name", "SIZE"}, {"help", "Context size for the model"}, {"group", "General Options"}}; + o["--merge-args"] = {{"option_name", "merge_args"}, {"type_name", "BOOL"}, {"help", "Merge global and model arguments when loading the model"}, {"group", "General Options"}}; + for (const auto* desc : lemon::backends::all_descriptors()) { + for (const auto& opt : desc->options) { + if (opt.cli_flag.empty()) { + continue; + } + json entry = {{"option_name", opt.name}, {"type_name", opt.type_name}, {"help", opt.help}}; + if (!opt.group.empty()) { + entry["group"] = opt.group; + } + o[opt.cli_flag] = entry; + } + } + return o; + }(); + return cli_options; +} void RecipeOptions::add_cli_options(CLI::App& app, json& storage) { - for (auto& [key, opt] : CLI_OPTIONS.items()) { + for (auto& [key, opt] : get_cli_options().items()) { const std::string opt_name = opt["option_name"]; CLI::Option* o; - json defval = DEFAULTS[opt_name]; + json defval = get_defaults()[opt_name]; if (defval.is_number_float()) { o = app.add_option_function(key, [opt_name, &storage = storage](double val) { storage[opt_name] = val; }, opt["help"]); diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp index b3ec22c3b..514a9773e 100644 --- a/src/cpp/server/router.cpp +++ b/src/cpp/server/router.cpp @@ -1,14 +1,15 @@ #include "lemon/router.h" #include "lemon/cloud_provider_registry.h" -#include "lemon/backends/cloud_server.h" -#include "lemon/backends/llamacpp_server.h" -#include "lemon/backends/fastflowlm_server.h" -#include "lemon/backends/ryzenaiserver.h" -#include "lemon/backends/whisper_server.h" -#include "lemon/backends/moonshine_server.h" -#include "lemon/backends/kokoro_server.h" -#include "lemon/backends/sd_server.h" -#include "lemon/backends/vllm_server.h" +#include "lemon/backends/backend_registry.h" +#include 
"lemon/backends/cloud/cloud_server.h" +#include "lemon/backends/llamacpp/llamacpp_server.h" +#include "lemon/backends/fastflowlm/fastflowlm_server.h" +#include "lemon/backends/ryzenai/ryzenai_server.h" +#include "lemon/backends/whispercpp/whispercpp_server.h" +#include "lemon/backends/moonshine/moonshine_server.h" +#include "lemon/backends/kokoro/kokoro_server.h" +#include "lemon/backends/sdcpp/sdcpp_server.h" +#include "lemon/backends/vllm/vllm_server.h" #include "lemon/server_capabilities.h" #include "lemon/error_types.h" #include "lemon/recipe_options.h" @@ -143,12 +144,26 @@ bool Router::reload_model_after_watchdog_reset(const std::string& requested_mode } } +// Slot/eviction policy for a recipe, from its descriptor (default Standard). +// This is the recipe-static policy used for pre-load slot decisions, mirroring +// the historical use of get_device_type_from_recipe at load time. +static SlotPolicy slot_policy_for_recipe(const std::string& recipe) { + if (const auto* desc = backends::descriptor_for(recipe)) { + return desc->slot_policy; + } + return SlotPolicy::Standard; +} + +static bool is_unmetered_recipe(const std::string& recipe) { + return slot_policy_for_recipe(recipe) == SlotPolicy::Unmetered; +} + int Router::count_servers_by_type(ModelType type) const { int count = 0; for (const auto& server : loaded_servers_) { - // Cloud servers consume no local memory and stay loaded for free, so - // they are excluded from the slot accounting that drives LRU eviction. - if (server->get_recipe_options().get_recipe() == "cloud") { + // Unmetered backends (cloud) consume no local memory and stay loaded for + // free, so they are excluded from the slot accounting that drives LRU eviction. 
+ if (is_unmetered_recipe(server->get_recipe_options().get_recipe())) { continue; } if (server->is_backend_alive() && server->get_model_type() == type) { @@ -162,10 +177,10 @@ WrappedServer* Router::find_lru_server_by_type(ModelType type) const { WrappedServer* lru = nullptr; for (const auto& server : loaded_servers_) { - // Cloud servers are not eviction candidates; they have no memory cost - // and reloading them is essentially free, but evicting them throws - // away the cached api key/upstream-id binding for no benefit. - if (server->get_recipe_options().get_recipe() == "cloud") { + // Unmetered backends (cloud) are not eviction candidates; they have no + // memory cost and reloading them is essentially free, but evicting them + // throws away the cached api key/upstream-id binding for no benefit. + if (is_unmetered_recipe(server->get_recipe_options().get_recipe())) { continue; } if (server->is_backend_alive() && server->get_model_type() == type) { @@ -210,10 +225,11 @@ WrappedServer* Router::find_npu_server_by_recipe(const std::string& recipe) cons return nullptr; } -WrappedServer* Router::find_flm_server_by_type(ModelType type) const { +WrappedServer* Router::find_coexisting_server_by_type(ModelType type) const { for (const auto& server : loaded_servers_) { if (server->is_backend_alive() && - server->get_recipe_options().get_recipe() == "flm" && + slot_policy_for_recipe(server->get_recipe_options().get_recipe()) == + SlotPolicy::CoexistByType && server->get_model_type() == type) { return server.get(); } @@ -299,49 +315,28 @@ void Router::simulate_vram_pressure(double pct) { } std::unique_ptr Router::create_backend_server(const ModelInfo& model_info) { - std::unique_ptr new_server; std::string log_level = config_->log_level(); - if (model_info.recipe == "cloud") { - LOG(DEBUG, "Router") << "Creating CloudServer backend (provider: " - << model_info.cloud_provider << ")" << std::endl; - new_server = std::make_unique(model_info.cloud_provider, log_level, - 
model_manager_, backend_manager_, - cloud_registry_); - } else if (model_info.recipe == "whispercpp") { - LOG(DEBUG, "Router") << "Creating WhisperServer backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); - } else if (model_info.recipe == "moonshine") { - LOG(DEBUG, "Router") << "Creating MoonshineServer backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); - } else if (model_info.recipe == "kokoro") { - LOG(DEBUG, "Router") << "Creating Kokoro backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); - } else if (model_info.recipe == "sd-cpp") { - LOG(DEBUG, "Router") << "Creating SDServer backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); - } else if (model_info.recipe == "flm") { - LOG(DEBUG, "Router") << "Creating FastFlowLM backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); - } else if (model_info.recipe == "ryzenai-llm") { - LOG(DEBUG, "Router") << "Creating RyzenAI-Server backend" << std::endl; - - std::string model_path = model_info.resolved_path(); - LOG(DEBUG, "Router") << "Using model path: " << model_path << std::endl; - - auto* ryzenai_server = new RyzenAIServer(model_info.model_name, - log_level == "debug", model_manager_, backend_manager_); - ryzenai_server->set_model_path(model_path); - new_server.reset(ryzenai_server); - } else if (model_info.recipe == "vllm") { - LOG(DEBUG, "Router") << "Creating vLLM backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); - } else { - LOG(DEBUG, "Router") << "Creating LlamaCpp backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); + backends::BackendContext ctx; + ctx.log_level = log_level; + ctx.model_manager = model_manager_; + ctx.backend_manager = backend_manager_; + 
ctx.cloud_registry = cloud_registry_; + ctx.model_info = &model_info; + + // The backend registry binds each recipe's descriptor to its create(). It is + // the single source of truth for backend construction (see LEMON_BACKENDS). + std::unique_ptr new_server = backends::create_server(model_info.recipe, ctx); + if (new_server) { + LOG(DEBUG, "Router") << "Created backend for recipe '" << model_info.recipe + << "' via registry" << std::endl; + return new_server; } - return new_server; + // Unknown recipe: fall back to llamacpp, preserving the historical default. + LOG(DEBUG, "Router") << "No registered backend for recipe '" << model_info.recipe + << "', defaulting to LlamaCpp" << std::endl; + return std::make_unique(log_level, model_manager_, backend_manager_); } void Router::load_model(const std::string& model_name, @@ -427,52 +422,61 @@ void Router::load_model(const std::string& model_name, // Get max models for this type (same limit for all types) int max_models = config_->max_loaded_models(); - // NPU EXCLUSIVITY CHECK (recipe-aware rules) - // FLM can run up to 3 concurrent NPU processes (1 LLM + 1 transcription + 1 embedding) - // RyzenAI and WhisperCpp lock the entire NPU exclusively - if (device_type & DEVICE_NPU) { - if (model_info.recipe == "ryzenai-llm" || model_info.recipe == "whispercpp") { - // Exclusive NPU recipes - evict ALL NPU servers + // NPU EXCLUSIVITY CHECK — driven by the backend's slot policy (descriptor). + // ExclusiveNpu (ryzenai-llm, whisper-on-npu): lock the entire NPU, + // evicting ALL NPU servers first. + // CoexistByType (flm): coexist with other FLM types (max 1 per type), + // but evict exclusive-NPU peers. + // Standard/Unmetered backends share no device exclusivity. + switch (slot_policy_for_recipe(model_info.recipe)) { + case SlotPolicy::ExclusiveNpu: { if (has_npu_server()) { LOG(INFO, "Router") << model_info.recipe << " requires exclusive NPU access, evicting all NPU servers..." 
<< std::endl; evict_all_npu_servers(); } - } else if (model_info.recipe == "flm") { - // FLM can coexist with other FLM types, but not with exclusive-NPU recipes - // 1. Evict any exclusive-NPU server (mutually exclusive) - for (const std::string& exclusive_recipe : {"ryzenai-llm", "whispercpp"}) { - WrappedServer* exclusive_server = find_npu_server_by_recipe(exclusive_recipe); - if (exclusive_server) { - LOG(INFO, "Router") << "FLM cannot coexist with " << exclusive_recipe - << ", evicting: " << exclusive_server->get_model_name() << std::endl; - evict_server(exclusive_server); + break; + } + case SlotPolicy::CoexistByType: { + // 1. Evict every NPU holder that is not itself a coexisting (FLM) + // backend — i.e. exclusive-NPU peers like ryzenai-llm and + // whisper-on-npu. Collect first; evict_server mutates loaded_servers_. + std::vector exclusive_peers; + for (const auto& server : loaded_servers_) { + if (server->is_backend_alive() && (server->get_device_type() & DEVICE_NPU) && + slot_policy_for_recipe(server->get_recipe_options().get_recipe()) != + SlotPolicy::CoexistByType) { + exclusive_peers.push_back(server.get()); } } + for (auto* peer : exclusive_peers) { + LOG(INFO, "Router") << "FLM cannot coexist with " + << peer->get_recipe_options().get_recipe() + << ", evicting: " << peer->get_model_name() << std::endl; + evict_server(peer); + } // 2. Evict FLM of the SAME model type (max 1 per type: 1 LLM, 1 transcription, 1 embed) - WrappedServer* same_type_flm = find_flm_server_by_type(model_type); + WrappedServer* same_type_flm = find_coexisting_server_by_type(model_type); if (same_type_flm) { LOG(INFO, "Router") << "FLM " << model_type_to_string(model_type) << " slot occupied by: " << same_type_flm->get_model_name() << ", evicting..." << std::endl; evict_server(same_type_flm); } - } else { - // Unknown NPU recipe - default to exclusive access - if (has_npu_server()) { - LOG(INFO, "Router") << "Unknown NPU recipe, evicting all NPU servers..." 
<< std::endl; - evict_all_npu_servers(); - } + break; } + case SlotPolicy::Standard: + case SlotPolicy::Unmetered: + break; } // LRU EVICTION CHECK (from spec: Least Recently Used Cache) - // Skip eviction if unlimited (-1). Cloud-recipe loads also skip the + // Skip eviction if unlimited (-1). Unmetered (cloud) loads also skip the // check entirely: they consume no local resources, so they have no // business kicking a warm local model out of memory. - bool is_cloud_load = (model_info.recipe == "cloud"); + bool is_unmetered_load = is_unmetered_recipe(model_info.recipe); int current_count = count_servers_by_type(model_type); - if (!is_cloud_load && max_models != -1 && current_count >= max_models) { + if (!is_unmetered_load && max_models != -1 && current_count >= max_models) { WrappedServer* lru = find_lru_server_by_type(model_type); if (lru) { LOG(INFO, "Router") << "Slot limit reached for type " @@ -1446,7 +1450,8 @@ void Router::responses_stream(const std::string& request_body, httplib::DataSink int Router::count_pinned_servers_by_type(ModelType type) const { int count = 0; for (const auto& server : loaded_servers_) { - if (server->get_recipe_options().get_recipe() == "cloud") { + // Unmetered servers (cloud) never occupy a slot, so they don't count. 
+ if (is_unmetered_recipe(server->get_recipe_options().get_recipe())) { continue; } if (server->is_backend_alive() && server->get_model_type() == type && server->is_pinned()) { diff --git a/src/cpp/server/runtime_config.cpp b/src/cpp/server/runtime_config.cpp index 261de4477..cc9bd6189 100644 --- a/src/cpp/server/runtime_config.cpp +++ b/src/cpp/server/runtime_config.cpp @@ -1,4 +1,5 @@ #include "lemon/runtime_config.h" +#include "lemon/backends/backend_descriptor_registry.h" #include "lemon/system_info.h" #include "lemon/utils/aixlog.hpp" #include "lemon/utils/path_utils.h" @@ -29,22 +30,26 @@ RuntimeConfig* RuntimeConfig::global() { return s_global_instance.load(std::memory_order_acquire); } -static const std::vector s_backend_names = { - "llamacpp", "whispercpp", "moonshine", "sdcpp", "flm", "vllm", "ryzenai", "kokoro" -}; - +// A valid config.json backend section is the config_section of any descriptor +// that runs a local subprocess (binary != ""). Cloud has no binary, so it is not +// a backend section. Derived from descriptors — no hand-maintained list. static bool is_backend_name(const std::string& key) { - return std::find(s_backend_names.begin(), s_backend_names.end(), key) != s_backend_names.end(); + for (const auto* desc : lemon::backends::all_descriptors()) { + if (!desc->binary.empty() && desc->effective_config_section() == key) { + return true; + } + } + return false; } -// Backends that have a selectable "backend" key -static const std::vector s_selectable_backends = { - "llamacpp", "whispercpp", "sdcpp", "vllm" -}; - +// A config section has a selectable "backend" key iff its descriptor opts in. 
static bool has_backend_selection(const std::string& config_section) { - return std::find(s_selectable_backends.begin(), s_selectable_backends.end(), - config_section) != s_selectable_backends.end(); + for (const auto* desc : lemon::backends::all_descriptors()) { + if (desc->selectable_backend && desc->effective_config_section() == config_section) { + return true; + } + } + return false; } static std::pair normalize_config_set_changes(const json& changes) { @@ -106,12 +111,18 @@ static std::pair normalize_config_set_changes(const json& cha } std::string RuntimeConfig::config_section_to_recipe(const std::string& config_section) { - if (config_section == "sdcpp") return "sd-cpp"; + for (const auto* desc : lemon::backends::all_descriptors()) { + if (desc->effective_config_section() == config_section) { + return desc->recipe; + } + } return config_section; } std::string RuntimeConfig::recipe_to_config_section(const std::string& recipe) { - if (recipe == "sd-cpp") return "sdcpp"; + if (const auto* desc = lemon::backends::descriptor_for(recipe)) { + return desc->effective_config_section(); + } return recipe; } @@ -278,9 +289,16 @@ std::string RuntimeConfig::rocm_channel() const { std::string RuntimeConfig::rocm_channel_for_recipe(const std::string& recipe) const { std::string channel = rocm_channel(); - // sd-cpp currently has no nightly artifacts; use stable builds. - if (recipe == "sd-cpp" && channel == "nightly") { - return "stable"; + // Clamp to a channel the backend actually publishes. A backend that lists + // only {"stable"} (e.g. sd-cpp, which has no nightly artifacts) falls back to + // its first channel when "nightly" is requested. Driven by the descriptor's + // rocm_channels, so no per-recipe special case lives here. 
+ const auto* desc = lemon::backends::descriptor_for(recipe); + if (desc && !desc->rocm_channels.empty()) { + const auto& channels = desc->rocm_channels; + if (std::find(channels.begin(), channels.end(), channel) == channels.end()) { + return channels.front(); + } } return channel; } @@ -340,56 +358,43 @@ json RuntimeConfig::recipe_options(const std::string& backend) const { return val; }; - const std::string backend_args = backend + "_args"; - - if (config_.contains("llamacpp")) { - const auto& lc = config_["llamacpp"]; - if (lc.contains("backend")) result["llamacpp_backend"] = resolve_auto(lc["backend"]); - if (lc.contains(backend_args) && lc[backend_args] != "") { - result["llamacpp_args"] = lc[backend_args]; - } else if (lc.contains("args")) { - result["llamacpp_args"] = lc["args"]; - } - if (lc.contains("device")) result["llamacpp_device"] = lc["device"]; - } - - if (config_.contains("whispercpp")) { - const auto& wc = config_["whispercpp"]; - if (wc.contains("backend")) result["whispercpp_backend"] = resolve_auto(wc["backend"]); - if (wc.contains(backend_args) && wc[backend_args] != "") { - result["whispercpp_args"] = wc[backend_args]; - } else if (wc.contains("args")) { - result["whispercpp_args"] = wc["args"]; - } - } + auto ends_with = [](const std::string& s, const std::string& suf) { + return s.size() >= suf.size() && s.compare(s.size() - suf.size(), suf.size(), suf) == 0; + }; - if (config_.contains("moonshine")) { - const auto& ms = config_["moonshine"]; - if (ms.contains(backend_args) && ms[backend_args] != "") { - result["moonshine_args"] = ms[backend_args]; - } else if (ms.contains("args")) { - result["moonshine_args"] = ms["args"]; - } - } + const std::string backend_args = backend + "_args"; - if (config_.contains("sdcpp")) { - const auto& sd = config_["sdcpp"]; - if (sd.contains("backend")) result["sd-cpp_backend"] = resolve_auto(sd["backend"]); - if (sd.contains(backend_args) && sd[backend_args] != "") { - result["sdcpp_args"] = 
sd[backend_args]; - } else if (sd.contains("args")) { - result["sdcpp_args"] = sd["args"]; + // Translate each backend's nested config.json section into the flat + // recipe_options format, driven by the descriptor's option list — no + // per-recipe block. The flat key is the descriptor option name; the + // config.json key is derived from the option's role (its name suffix): + // *_backend -> "backend" *_args -> variant "_args" then "args" + // *_device -> "device" everything else -> the option name verbatim + // (sd-cpp's steps/cfg_scale/width/height/…) + for (const auto* desc : lemon::backends::all_descriptors()) { + const std::string section = desc->effective_config_section(); + if (!config_.contains(section) || !config_[section].is_object()) { + continue; + } + const auto& cfg = config_[section]; + for (const auto& opt : desc->options) { + if (ends_with(opt.name, "_backend")) { + if (cfg.contains("backend")) { + result[opt.name] = resolve_auto(cfg["backend"]); + } + } else if (ends_with(opt.name, "_args")) { + if (cfg.contains(backend_args) && cfg[backend_args] != "") { + result[opt.name] = cfg[backend_args]; + } else if (cfg.contains("args")) { + result[opt.name] = cfg["args"]; + } + } else { + const std::string ckey = ends_with(opt.name, "_device") ? 
"device" : opt.name; + if (cfg.contains(ckey)) { + result[opt.name] = cfg[ckey]; + } + } } - if (sd.contains("steps")) result["steps"] = sd["steps"]; - if (sd.contains("cfg_scale")) result["cfg_scale"] = sd["cfg_scale"]; - if (sd.contains("width")) result["width"] = sd["width"]; - if (sd.contains("height")) result["height"] = sd["height"]; - } - - if (config_.contains("vllm")) { - const auto& vl = config_["vllm"]; - if (vl.contains("backend")) result["vllm_backend"] = resolve_auto(vl["backend"]); - if (vl.contains("args")) result["vllm_args"] = vl["args"]; } if (config_.contains("ctx_size")) result["ctx_size"] = config_["ctx_size"]; diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp index b2b0327ab..511aa080c 100644 --- a/src/cpp/server/server.cpp +++ b/src/cpp/server/server.cpp @@ -5,8 +5,8 @@ #include "lemon/config_file.h" #include "lemon/mcp_server.h" #include "lemon/ollama_api.h" -#include "lemon/backends/cloud_server.h" -#include "lemon/backends/sd_server.h" +#include "lemon/backends/cloud/cloud_server.h" +#include "lemon/backends/sdcpp/sdcpp_server.h" #include "lemon/backends/backend_utils.h" #include #include "lemon/utils/json_utils.h" @@ -677,6 +677,9 @@ void Server::setup_routes(httplib::Server &web_server) { web_server.Get("/internal/config", [this](const httplib::Request& req, httplib::Response& res) { handle_config_get(req, res); }); + web_server.Get("/internal/config/defaults", [this](const httplib::Request& req, httplib::Response& res) { + handle_config_defaults_get(req, res); + }); web_server.Post("/internal/cleanup-cache", [this](const httplib::Request& req, httplib::Response& res) { handle_cleanup_cache(req, res); }); @@ -1698,7 +1701,8 @@ void Server::auto_load_model_if_needed(const std::string& requested_model) { // - If model is NOT downloaded: Download it from HuggingFace // - If model IS downloaded: Skip HuggingFace API check entirely (use cached version) // Only the /pull endpoint should check for updates (uses 
do_not_upgrade=false) - if (info.recipe != "flm" && !model_manager_->is_model_downloaded(requested_model)) { + if (!model_manager_->backend_self_manages_downloads(info.recipe) && + !model_manager_->is_model_downloaded(requested_model)) { LOG(INFO, "Server") << "Model not cached, downloading from Hugging Face..." << std::endl; LOG(INFO, "Server") << "This may take several minutes for large models." << std::endl; model_manager_->download_registered_model(info, true); @@ -3244,7 +3248,7 @@ void Server::handle_image_upscale(const httplib::Request& req, httplib::Response // as a separate request from generation, which lets the frontend show // the original and upscaled images side by side with independent timing. std::string exe_dir = lemon::backends::BackendUtils::get_backend_binary_path( - lemon::backends::SDServer::SPEC, backend); + *lemon::backends::try_get_spec_for_recipe("sd-cpp"), backend); std::filesystem::path cli_exe = std::filesystem::path(exe_dir).parent_path() / #ifdef _WIN32 "sd-cli.exe"; @@ -4146,60 +4150,12 @@ void Server::resolve_and_register_local_model( std::string recipe = model_data.value("recipe", ""); bool vision = model_data.value("vision", false); - std::string resolved_checkpoint; + // The backend's ops locate its primary artifact within the imported + // directory (.gguf / .bin file, genai_config.json dir, …); "" means register + // the directory itself. 
+ std::string resolved_checkpoint = backends::ops_for(recipe)->find_imported_checkpoint(dest_path); std::string resolved_mmproj; - // For RyzenAI LLM models, find genai_config.json - if (recipe == "ryzenai-llm") { - for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) { - if (entry.is_regular_file() && entry.path().filename() == "genai_config.json") { - resolved_checkpoint = entry.path().parent_path().string(); - break; - } - } - if (resolved_checkpoint.empty()) { - resolved_checkpoint = dest_path; - } - } - // For llamacpp models, find the GGUF file - else if (recipe == "llamacpp") { - std::string gguf_file_found; - - // If no variant or variant not found, search for any .gguf file (excluding mmproj) - if (gguf_file_found.empty()) { - for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) { - if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - std::string filename_lower = filename; - std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower); - - if (filename_lower.find(".gguf") != std::string::npos && - filename_lower.find("mmproj") == std::string::npos) { - gguf_file_found = entry.path().string(); - break; - } - } - } - } - - resolved_checkpoint = gguf_file_found.empty() ? 
dest_path : gguf_file_found; - } - // For whispercpp, find .bin file - else if (recipe == "whispercpp") { - for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) { - if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - if (filename.find(".bin") != std::string::npos) { - resolved_checkpoint = entry.path().string(); - break; - } - } - } - if (resolved_checkpoint.empty()) { - resolved_checkpoint = dest_path; - } - } - // Search for mmproj file if vision is enabled or mmproj hint provided if (vision || !mmproj.empty()) { for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) { @@ -4515,6 +4471,20 @@ void Server::handle_config_get(const httplib::Request& /*req*/, httplib::Respons } } +void Server::handle_config_defaults_get(const httplib::Request& /*req*/, httplib::Response& res) { + try { + // The canonical default config (global keys + descriptor-derived per-recipe + // sections), independent of this host's config.json or deployment overrides. + // gen_backend_boilerplate.py reads this to regenerate resources/defaults.json. + res.set_content(ConfigFile::base_defaults().dump(2), "application/json"); + } catch (const std::exception& e) { + LOG(ERROR, "Server") << "ERROR in handle_config_defaults_get: " << e.what() << std::endl; + res.status = 500; + nlohmann::json error = {{"error", e.what()}}; + res.set_content(error.dump(), "application/json"); + } +} + void Server::handle_bin_change(const std::string& section, const std::string& bin_key, const std::string& new_value) { @@ -4525,9 +4495,8 @@ void Server::handle_bin_change(const std::string& section, std::string backend = bin_key.substr(0, bin_key.size() - 4); // The "server_bin" key (as in ryzenai.server_bin) is not consumed by the - // current install flow — find_external_backend_binary uses recipe-based - // section lookup and there is no recipe whose section equals "ryzenai". 
- // Skip the hot-swap rather than attempt an install that won't help. + // current install flow, so skip the hot-swap rather than attempt an install + // that won't help. if (backend == "server") { LOG(WARNING, "Server") << section << "." << bin_key << " is not consumed by the install flow; " diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp index cf0adfc52..f7cccc162 100644 --- a/src/cpp/server/system_info.cpp +++ b/src/cpp/server/system_info.cpp @@ -7,6 +7,9 @@ #include "lemon/utils/json_utils.h" #include "lemon/utils/process_manager.h" #include "lemon/backends/backend_utils.h" +#include "lemon/backends/backend_descriptor_registry.h" +#include "lemon/backends/backend_registry.h" +#include "lemon/recipe_backend_def.h" #include #include #include @@ -404,15 +407,8 @@ std::vector query_dxg_amd_gpus(const std::string& gpu_type) { // Recipe/Backend definition table - single source of truth for support matrix // ============================================================================ -// Device constraints: device_type -> set of allowed families (empty = all families) -using DeviceConstraints = std::map>; - -struct RecipeBackendDef { - std::string recipe; - std::string backend; - std::set supported_os; - DeviceConstraints devices; -}; +// RecipeBackendDef and DeviceConstraints are declared in lemon/recipe_backend_def.h +// so backend descriptors can carry their own support rows. // Recipe definitions table - single source of truth for all recipe/backend support // Format: {recipe, backend, {supported_os}, {{device_type, {allowed_families}}}} @@ -422,115 +418,23 @@ struct RecipeBackendDef { // Example: metal is listed before vulkan on macOS, vulkan before cpu elsewhere. 
// // Empty family set {} means "all families of that device type" -static const std::vector RECIPE_DEFS = { - // llamacpp with multiple backends (order = preference) - {"llamacpp", "system", {"linux"}, { - {"cpu", {"x86_64", "arm64"}}, // Placeholder, actual check is PATH-based - }}, - {"llamacpp", "metal", {"macos"}, - { - {"metal", {}}, - }}, - {"llamacpp", "cuda", {"windows", "linux"}, { - {"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}, - }}, - {"llamacpp", "vulkan", {"windows", "linux"}, { - {"cpu", {"x86_64", "arm64"}}, - {"amd_gpu", {}}, // all AMD GPU families - }}, - {"llamacpp", "rocm", {"windows", "linux"}, { - {"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}, // STX iGPUs + RDNA2/3/4 dGPUs - }}, - {"llamacpp", "cpu", {"windows", "linux"}, { - {"cpu", {"x86_64", "arm64"}}, - }}, - - // whisper.cpp - NPU, ROCm GPU, Vulkan, CPU, Metal - {"whispercpp", "npu", {"windows"}, { - {"amd_npu", {"XDNA2"}}, - }}, - {"whispercpp", "rocm", {"windows", "linux"}, { - // gfx103X omitted: lemonade-sdk/whisper.cpp-rocm publishes no gfx103X - // ROCm whisper build, so advertising it would yield a 404 on install. 
- {"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}, - }}, - {"whispercpp", "vulkan", {"windows", "linux"}, { - {"cpu", {"x86_64"}}, - {"amd_gpu", {}}, - }}, - {"whispercpp", "cpu", {"windows", "linux"}, { - {"cpu", {"x86_64"}}, - }}, - {"whispercpp", "metal", {"macos"}, { - {"metal", {}}, - }}, - - // kokoro - Windows/Linux x86_64; macOS arm64 (Metal) - {"kokoro", "cpu", {"windows", "linux"}, { - {"cpu", {"x86_64"}}, - }}, - {"kokoro", "metal", {"macos"}, { - {"metal", {}}, - }}, - - // stable-diffusion.cpp - ROCm backend for AMD GPUs - {"sd-cpp", "rocm", {"windows", "linux"}, { - {"amd_gpu", { - "gfx1150", "gfx1151", "gfx1152", - "gfx103X", "gfx110X", "gfx120X" - }}, - }}, - - // stable-diffusion.cpp - CUDA backend for NVIDIA GPUs (Linux) - {"sd-cpp", "cuda", {"linux"}, { - {"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}, - }}, - - // stable-diffusion.cpp - Vulkan backend (Windows/Linux x86_64) - {"sd-cpp", "vulkan", {"windows", "linux"}, { - {"cpu", {"x86_64"}}, - {"amd_gpu", {}}, - {"nvidia_gpu", {}}, - }}, - - // stable-diffusion.cpp - CPU backend (Windows/Linux x86_64) - {"sd-cpp", "cpu", {"windows", "linux"}, { - {"cpu", {"x86_64"}}, - }}, - - // stable-diffusion.cpp - Metal backend (macOS arm64) - {"sd-cpp", "metal", {"macos"}, { - {"metal", {}}, - }}, - - // FLM - NPU (XDNA2) - {"flm", "npu", {"windows", "linux"}, { - {"amd_npu", {"XDNA2"}}, - }}, - - // RyzenAI LLM - Windows NPU (XDNA2) - {"ryzenai-llm", "npu", {"windows"}, { - {"amd_npu", {"XDNA2"}}, - }}, - - // vLLM - ROCm backend for AMD GPUs (Linux only) - {"vllm", "rocm", {"linux"}, { - {"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}, - }}, - - // Moonshine - CPU-only streaming STT. Platforms match the published - // moonshine-server-rocm bundles (moonshine-voice wheels): Windows x64, - // Linux x64/arm64, macOS arm64. No Intel macOS or Windows-arm64 wheel. 
- {"moonshine", "cpu", {"windows"}, { - {"cpu", {"x86_64"}}, - }}, - {"moonshine", "cpu", {"linux"}, { - {"cpu", {"x86_64", "arm64"}}, - }}, - {"moonshine", "cpu", {"macos"}, { - {"cpu", {"arm64"}}, - }}, -}; +// The recipe/backend support matrix is assembled from every backend descriptor's +// `support` rows (see lemon/backends/*_descriptor.cpp). Concatenated in registry +// order; within a recipe, row order is the backend preference order. This is the +// single source of truth — there is no separate hand-maintained table. +static const std::vector& recipe_defs() { + static const std::vector defs = [] { + std::vector v; + for (const auto* desc : lemon::backends::all_descriptors()) { + for (const auto& row : desc->support) { + // Fill in the recipe (the owning descriptor's) per support row. + v.push_back({desc->recipe, row.backend, row.supported_os, row.devices, row.device_summary}); + } + } + return v; + }(); + return defs; +} // ============================================================================ // Device family to human-readable name mapping @@ -592,7 +496,7 @@ std::string SystemInfo::get_unsupported_backend_error(const std::string& recipe, std::string error; // Find the recipe/backend in RECIPE_DEFS - for (const auto& def : RECIPE_DEFS) { + for (const auto& def : recipe_defs()) { if (def.recipe == recipe && def.backend == backend) { // Collect all required family names std::vector family_names; @@ -674,76 +578,49 @@ static bool device_matches_constraint(const std::string& device_family, // Generic installation check static bool is_recipe_installed(const std::string& recipe, const std::string& backend, std::string& error_message) { - bool is_llamacpp_rocm_backend = recipe == "llamacpp" && backend == "rocm"; - - // Special handling for ROCm backends on gfx1151 (Strix Halo) if kernel CWSR fix is missing - bool is_vllm_rocm_backend = recipe == "vllm" && backend == "rocm"; - if ((recipe == "sd-cpp" && backend == "rocm") || is_llamacpp_rocm_backend || 
is_vllm_rocm_backend) { - if (needs_gfx1151_cwsr_fix()) { - error_message = "Linux kernel missing support"; - return false; - } + // Special handling for ROCm backends on gfx1151 (Strix Halo) if the kernel + // CWSR fix is missing — which backends' rocm build needs it is a descriptor flag. + const auto* cwsr_desc = backends::descriptor_for(recipe); + if (backend == "rocm" && cwsr_desc && cwsr_desc->rocm_requires_cwsr_fix && + needs_gfx1151_cwsr_fix()) { + error_message = "Linux kernel missing support"; + return false; } - auto* spec = try_get_spec_for_recipe(recipe); - if (spec) { + // Find the managed binary, then let the backend's ops decide installed-ness + // (llamacpp "system" also needs the HIP plugin; flm can be a PATH package). + bool binary_found = false; + if (auto* spec = try_get_spec_for_recipe(recipe)) { try { BackendUtils::get_backend_binary_path(*spec, backend); - - // For system llamacpp backend, also verify the HIP plugin is available - // This is required for ROCm GPU acceleration with dynamically loaded backends - if (recipe == "llamacpp" && backend == "system") { -#ifdef __linux__ - // Check if AMD GPU driver is loaded (KFD indicates amdgpu driver) - if (fs::exists("/sys/class/kfd")) { - // System has AMD GPU(s), so we need the HIP plugin - if (!is_ggml_hip_plugin_available()) { - error_message = "HIP plugin libggml-hip.so not installed"; - return false; - } - } -#endif - } - - return true; + binary_found = true; } catch (...) 
{ -#ifndef _WIN32 - // On Linux, FLM is installed as a system package (in PATH, not install dir) - if (recipe == "flm" && !utils::find_flm_executable().empty()) { - return true; - } -#endif - return false; + binary_found = false; } } - return false; + auto check = backends::ops_for(recipe)->check_install(backend, binary_found); + if (!check.installed && !check.error.empty()) { + error_message = check.error; + } + return check.installed; } static std::string get_recipe_version(const std::string& recipe, const std::string& backend) { - if (recipe == "llamacpp" && backend == "system") { - return SystemInfo::get_system_llamacpp_version(); - } + // Read the on-disk version.txt generically, then let the backend's ops + // override (llamacpp "system" runs llama-server --version; flm queries the + // CLI when no file is present). No per-recipe branches here. auto* spec = try_get_spec_for_recipe(recipe); + std::string file_version; if (spec) { std::string version_file = BackendUtils::get_installed_version_file(*spec, backend); - if (version_file.empty()) { -#ifndef _WIN32 - // On Linux, FLM is a system package with no version.txt - query directly - if (recipe == "flm") { - return SystemInfo::get_flm_version(); - } -#endif - return "unknown"; - } - std::string version = read_version_file(version_file); -#ifndef _WIN32 - // On Linux, version.txt may not exist on disk for system-installed FLM - if (recipe == "flm" && (version.empty() || version == "unknown")) { - return SystemInfo::get_flm_version(); + if (!version_file.empty()) { + file_version = read_version_file(version_file); } -#endif - return version; } - return ""; + std::string resolved = backends::ops_for(recipe)->resolve_version(backend, file_version); + if (!spec && resolved.empty()) { + return ""; + } + return resolved.empty() ? 
"unknown" : resolved; } static std::string get_install_command(const std::string& recipe, const std::string& backend) { @@ -828,7 +705,7 @@ static std::string get_expected_backend_version(const std::string& recipe, const // version pins ("rocm-stable", "rocm-nightly") in backend_versions.json. // Mirror the resolution done by BackendUtils::get_backend_version(). std::string resolved_backend = backend; - if ((recipe == "llamacpp" || recipe == "sd-cpp") && backend == "rocm") { + if (backends::recipe_has_rocm_channels(recipe) && backend == "rocm") { std::string channel = "stable"; if (auto* cfg = RuntimeConfig::global()) { channel = cfg->rocm_channel_for_recipe(recipe); @@ -1215,12 +1092,12 @@ json SystemInfo::build_recipes_info(const json& devices) { std::map configured_default_backends; if (auto* cfg = RuntimeConfig::global()) { std::set processed_recipes; - for (const auto& def : RECIPE_DEFS) { + for (const auto& def : recipe_defs()) { if (!processed_recipes.insert(def.recipe).second) continue; std::string section = RuntimeConfig::recipe_to_config_section(def.recipe); std::string backend = cfg->backend_string(section, "backend"); if (backend.empty() || backend == "auto") continue; - bool known = std::any_of(RECIPE_DEFS.begin(), RECIPE_DEFS.end(), + bool known = std::any_of(recipe_defs().begin(), recipe_defs().end(), [&](const RecipeBackendDef& d) { return d.recipe == def.recipe && d.backend == backend; }); @@ -1280,7 +1157,7 @@ json SystemInfo::build_recipes_info(const json& devices) { } // Build recipes from the definition table - for (const auto& def : RECIPE_DEFS) { + for (const auto& def : recipe_defs()) { // Skip if not supported on current OS if (def.supported_os.count(current_os) == 0) { // Helper to format OS name nicely @@ -1439,41 +1316,21 @@ json SystemInfo::build_recipes_info(const json& devices) { backend["message"] = message; backend["action"] = ""; } else if (!available) { - // FLM on Linux needs richer state to guide users through manual setup - // 
(installing .deb, xrt drivers, etc.) - if (def.recipe == "flm") { - bool is_not_installed = install_error.empty() - || install_error.find("not installed") != std::string::npos - || install_error.find("not found") != std::string::npos; - bool is_version_mismatch = install_error.find("requires") != std::string::npos; - - if (is_not_installed) { - backend["state"] = "installable"; - } else if (is_version_mismatch) { - backend["state"] = "update_required"; - } else { - backend["state"] = "action_required"; - } - backend["message"] = install_error; - - if (!is_not_installed) { + // Backends with bespoke unavailable-state guidance (flm: a system .deb + // + drivers needing manual setup) classify themselves; everyone else + // uses the generic installable/no-fetch default below. + const std::string default_install_command = get_install_command(def.recipe, def.backend); + if (auto st = backends::ops_for(def.recipe)->classify_unavailable( + def.backend, install_error, default_install_command)) { + backend["state"] = st->state; + backend["message"] = st->message; + backend["action"] = st->action; + if (st->attach_installed_version) { std::string installed_version = get_recipe_version(def.recipe, def.backend); if (!installed_version.empty() && installed_version != "unknown") { backend["version"] = installed_version; } } - -#ifdef __linux__ - backend["action"] = "Visit https://lemonade-server.ai/flm_npu_linux.html?mode=troubleshoot"; -#elif defined(_WIN32) - if (!is_not_installed && !is_version_mismatch) { - backend["action"] = "Visit https://lemonade-server.ai/driver_install.html"; - } else { - backend["action"] = get_install_command(def.recipe, def.backend); - } -#else - backend["action"] = get_install_command(def.recipe, def.backend); -#endif } else { auto* cfg = RuntimeConfig::global(); bool no_fetch = cfg && cfg->no_fetch_executables(); @@ -1483,16 +1340,16 @@ json SystemInfo::build_recipes_info(const json& devices) { : "Backend is supported but not installed."; 
backend["message"] = install_error.empty() ? default_message : install_error; - bool is_rocm_backend = (def.recipe == "sd-cpp" && def.backend == "rocm") || - (def.recipe == "llamacpp" && def.backend == "rocm") || - (def.recipe == "vllm" && def.backend == "rocm"); + const auto* cwsr_desc = backends::descriptor_for(def.recipe); + bool is_rocm_backend = def.backend == "rocm" && cwsr_desc && + cwsr_desc->rocm_requires_cwsr_fix; - // Special action for ROCm backends on llamacpp/sd-cpp/vllm if CWSR fix is missing + // Special action for ROCm backends that need the gfx1151 CWSR fix. if (is_rocm_backend && !install_error.empty() && needs_gfx1151_cwsr_fix()) { backend["action"] = "Visit https://lemonade-server.ai/gfx1151_linux.html"; } else { - backend["action"] = get_install_command(def.recipe, def.backend); + backend["action"] = default_install_command; } } } else { @@ -1537,9 +1394,10 @@ json SystemInfo::build_recipes_info(const json& devices) { return installed.compare(0, prefix.size(), prefix) == 0; }; #if !defined(_WIN32) - // On non-Windows, FLM is a system-managed package; a version newer - // than the minimum required is acceptable. - if (def.recipe == "flm") { + // System-managed packages (e.g. flm on Linux) accept a version newer + // than the minimum required. + const auto* ver_desc = backends::descriptor_for(def.recipe); + if (ver_desc && ver_desc->version_policy == VersionPolicy::AtLeast) { auto installed_ver = utils::Version::parse(installed_version); auto expected_ver = utils::Version::parse(expected_version); // If either version cannot be parsed, fall back to exact equality check @@ -1611,6 +1469,60 @@ json SystemInfo::build_recipes_info(const json& devices) { } } + // Enrich each recipe entry with descriptor metadata so clients (the desktop + // app, the docs generator) can render display names and per-recipe option + // schemas without hardcoding them. This is the single source the frontend + // reads instead of its own per-recipe TypeScript tables. 
+ int recipe_order = 0; + for (const auto* desc : lemon::backends::all_descriptors()) { + auto it = recipes.find(desc->recipe); + if (it == recipes.end()) { + ++recipe_order; + continue; // recipe not surfaced on this system (e.g. cloud has no support rows) + } + json& entry = it.value(); + entry["order"] = recipe_order++; // descriptor registry order, for deterministic doc rendering + entry["display_name"] = desc->display_name; + entry["selectable_backend"] = desc->selectable_backend; + entry["uses_ctx_size"] = desc->uses_ctx_size; + entry["modality"] = desc->modality; + entry["experimental"] = desc->experimental; + entry["web_display_name"] = desc->web_display_name.empty() ? desc->display_name : desc->web_display_name; + entry["web_priority"] = desc->web_priority; + entry["slot_policy"] = slot_policy_to_string(desc->slot_policy); + // Machine-independent support matrix (OS + device families + friendly + // device summary per backend), straight from the descriptor — used by the + // docs generator to render the README support matrix etc. 
+ json support = json::array(); + for (const auto& row : desc->support) { + json devices = json::array(); + for (const auto& [device, families] : row.devices) { + devices.push_back({{"device", device}, + {"families", std::vector(families.begin(), families.end())}}); + } + support.push_back({ + {"backend", row.backend}, + {"os", std::vector(row.supported_os.begin(), row.supported_os.end())}, + {"devices", devices}, + {"device_summary", row.device_summary}, + }); + } + entry["support"] = support; + json options = json::array(); + for (const auto& opt : desc->options) { + json o = { + {"name", opt.name}, + {"cli_flag", opt.cli_flag}, + {"default", opt.default_value}, + {"type_name", opt.type_name}, + {"help", opt.help}, + {"group", opt.group}, + }; + options.push_back(o); + } + entry["options"] = options; + } + return recipes; } @@ -1643,7 +1555,7 @@ SystemInfo::SupportedBackendsResult SystemInfo::get_supported_backends(const std } // Collect remaining supported backends and capture first error (in preference order from RECIPE_DEFS) - for (const auto& def : RECIPE_DEFS) { + for (const auto& def : recipe_defs()) { if (def.recipe == recipe) { // Skip the default_backend since we already added it if (def.backend == default_backend) { @@ -1672,11 +1584,12 @@ SystemInfo::SupportedBackendsResult SystemInfo::get_supported_backends(const std } std::string SystemInfo::check_recipe_supported(const std::string& recipe) { - // Cloud offload has no local hardware/OS requirements; availability is - // gated by the CloudProviderRegistry (config.json "cloud_providers") and - // a resolvable API key (env var or runtime auth), checked elsewhere in - // filter_models_by_backend / CloudServer::load. - if (recipe == "cloud") { + // A backend whose descriptor declares no support rows has no local + // hardware/OS gating (e.g. 
cloud offload): availability is determined at + // runtime (provider creds via the CloudProviderRegistry / API key), checked + // elsewhere in filter_models_by_backend / CloudServer::load. + const auto* desc = lemon::backends::descriptor_for(recipe); + if (desc && desc->support.empty()) { return ""; } auto result = get_supported_backends(recipe); @@ -1697,7 +1610,7 @@ std::vector SystemInfo::get_all_recipe_statuses() { if (recipe_info.contains("backends") && recipe_info["backends"].is_object()) { // Iterate in preference order (from RECIPE_DEFS table) - for (const auto& def : RECIPE_DEFS) { + for (const auto& def : recipe_defs()) { if (def.recipe != recipe_name) continue; if (!recipe_info["backends"].contains(def.backend)) continue; @@ -1736,43 +1649,6 @@ static std::string read_version_file(const fs::path& version_file) { return "unknown"; } -std::string SystemInfo::get_system_llamacpp_version() { - std::string output; - #ifdef _WIN32 - std::string command = "llama-server --version 2>NUL"; - int rc = lemon::utils::ProcessManager::run_command(command, output); - #else - FILE* pipe = popen("llama-server --version 2>/dev/null", "r"); - if (!pipe) { - return "unknown"; - } - - char buffer[256]; - if (fgets(buffer, sizeof(buffer), pipe) != nullptr) { - output = buffer; - } - - pclose(pipe); - #endif - - // Parse version from output like "version: 3432 (e2b2a632)" or "llama.cpp version b3432" - if (!output.empty()) { - // Try to find a version number - std::regex version_regex(R"(version:\s*(\d+)|version\s+b?(\d+))"); - std::smatch match; - if (std::regex_search(output, match, version_regex)) { - for (size_t i = 1; i < match.size(); ++i) { - if (match[i].matched) { - return "b" + match[i].str(); - } - } - } - return "detected"; - } - - return "unknown"; -} - // Map a CUDA Compute Capability "MAJOR.MINOR" string (as reported by nvidia-smi // --query-gpu=compute_cap) to the sm_XX token used in llamacpp-cuda release filenames. 
// Returns empty if the value cannot be parsed. @@ -2321,74 +2197,6 @@ bool SystemInfo::get_has_igpu() { return false; // No iGPU detected } -std::string SystemInfo::get_flm_version() { - // Cache real version strings to avoid spawning the subprocess twice per - // build_recipes_info() pass. "unknown" is NOT cached so that post-install - // verification in fastflowlm_server.cpp gets a fresh result after FLM is installed. - static std::string cached_version; - if (!cached_version.empty()) { - return cached_version; - } - - // Find the flm executable using shared utility - std::string flm_path = utils::find_flm_executable(); - if (flm_path.empty() || !utils::is_safe_executable_path(flm_path)) { - return "unknown"; - } - - std::string output; - #ifdef _WIN32 - std::string command = "\"" + flm_path + "\" version --json 2>NUL"; - int rc = lemon::utils::ProcessManager::run_command(command, output); - #else - std::string command = "\"" + flm_path + "\" version --json 2>/dev/null"; - FILE* pipe = popen(command.c_str(), "r"); - if (!pipe) { - return "unknown"; - } - - char buffer[256]; - while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { - output += buffer; - } - - pclose(pipe); - #endif - - // Parse JSON output: { "version": "0.9.34" } - try { - json j = JsonUtils::parse(output); - if (j.contains("version") && j["version"].is_string()) { - std::string version = j["version"].get(); - // If the version doesn't start with 'v', prepend it - // for backend_versions.json compatibility (e.g. "v0.9.34"). - if (!version.empty() && version[0] != 'v') { - version = "v" + version; - } - cached_version = version; - return cached_version; - } - } catch (...) { - // Fallback to legacy parsing if JSON parsing fails - } - - // Legacy parsing from output like "FLM v0.9.4" - if (output.find("FLM v") != std::string::npos) { - size_t pos = output.find("FLM v"); - // Keep the 'v' prefix so it matches backend_versions.json (e.g. "v0.9.34"). 
- std::string version = output.substr(pos + 4); - // Trim whitespace and newlines - size_t end = version.find_first_of(" \t\n\r"); - if (end != std::string::npos) { - version = version.substr(0, end); - } - cached_version = version; - return cached_version; - } - - return "unknown"; -} - // ============================================================================ // Factory function // ============================================================================ diff --git a/src/cpp/server/utils/path_utils.cpp b/src/cpp/server/utils/path_utils.cpp index dc7492295..fb8591337 100644 --- a/src/cpp/server/utils/path_utils.cpp +++ b/src/cpp/server/utils/path_utils.cpp @@ -103,30 +103,6 @@ bool looks_like_path(const std::string& v) { } } -std::string find_flm_executable() { -#ifdef _WIN32 - // On Windows, only check the Lemonade install directory (auto-installed zip). - // No system PATH fallback - FLM should be installed via install_backend(). - std::string install_dir = (fs::path(get_downloaded_bin_dir()) / "flm" / "npu").make_preferred().string(); - if (fs::exists(install_dir)) { - for (const auto& entry : fs::recursive_directory_iterator(install_dir)) { - if (entry.is_regular_file() && entry.path().filename().string() == "flm.exe") { - std::string path = entry.path().string(); - if (is_safe_executable_path(path)) { - return path; - } - } - } - } - return ""; -#else - // Walk PATH directly — minimal Fedora/openSUSE containers do not ship `which`. - if (!find_executable_in_path("flm").empty()) { - return "flm"; - } - return ""; -#endif -} std::string find_executable_in_path(const std::string& executable_name) { if (!is_safe_executable_path(executable_name)) { @@ -180,50 +156,6 @@ std::string find_executable_in_path(const std::string& executable_name) { #endif } -bool is_ggml_hip_plugin_available() { -#ifdef __linux__ - // Allow distros/packagers that install outside the FHS paths below - // (e.g. NixOS, custom prefixes) to point directly at libggml-hip.so. 
- if (const char* env = std::getenv("LEMONADE_GGML_HIP_PATH"); env && *env) { - // Require the basename to look like the HIP plugin (libggml-hip*.so*, - // case-insensitive, versioned sonames allowed). This is a sanity check, - // not a security boundary: the path is not forwarded to ggml's loader, - // so we cannot verify it is actually loadable. It only guards against an - // accidental override pointing at an unrelated existing file. - std::string name = fs::path(env).filename().string(); - std::transform(name.begin(), name.end(), name.begin(), - [](unsigned char c) { return std::tolower(c); }); - const bool name_matches = name.rfind("libggml-hip", 0) == 0 && - name.find(".so") != std::string::npos; - // LEMONADE_GGML_HIP_PATH is user-controlled, so use the non-throwing - // filesystem overload: an odd or malformed path resolves to "not a - // regular file" (ec set) instead of raising a filesystem_error. - std::error_code hip_path_ec; - if (name_matches && fs::is_regular_file(env, hip_path_ec)) { - return true; - } - } - // On Linux x86_64, check common system library paths for the HIP plugin - std::vector possible_paths = { - // Debian/Ubuntu multiarch path (most common) - "/usr/lib/x86_64-linux-gnu/ggml/backends0/libggml-hip.so", - // Arch AUR path - "/usr/lib/libggml-hip.so", - // Standard Linux paths - "/usr/lib/ggml/backends0/libggml-hip.so", - "/usr/lib64/ggml/backends0/libggml-hip.so" - }; - - // Check all possible paths - for (const auto& path : possible_paths) { - if (fs::exists(path)) { - return true; - } - } -#endif - - return false; -} std::string get_cache_dir() { // If set_cache_dir() was called at startup, use that @@ -295,98 +227,5 @@ std::string get_downloaded_bin_dir() { return bin_dir; } -bool run_flm_validate(const std::string& flm_path, std::string& error_message) { - std::string flm_exe = flm_path.empty() ? 
find_flm_executable() : flm_path; - if (flm_exe.empty()) { - error_message = "FLM executable not found"; - return false; - } - if (!is_safe_executable_path(flm_exe)) { - error_message = "FLM path contains invalid characters"; - return false; - } - - std::string command = "\"" + flm_exe + "\" validate --json"; - std::string output; - int exit_code; -#ifdef _WIN32 - exit_code = ProcessManager::run_command(command, output); -#else - FILE* pipe = popen(command.c_str(), "r"); - if (!pipe) { - error_message = "Failed to execute " + flm_exe; - return false; - } - - char buffer[1024]; - while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { - output += buffer; - } - - exit_code = pclose(pipe); - if (exit_code != -1) { - exit_code = WEXITSTATUS(exit_code); - } -#endif - - try { - if (!output.empty()) { - json j = JsonUtils::parse(output); - if (j.is_object()) { - // Check for overall status - bool validation_ok = false; - if (j.contains("ready")) { - validation_ok = j["ready"].get(); - } - - if (validation_ok) { - error_message.clear(); - return true; - } - - std::vector errors; - - if (j.contains("amd_device_found") && !j["amd_device_found"].get()) { - errors.push_back("No AMD NPU device found."); - } - - if (j.contains("all_fw_ok") && !j["all_fw_ok"].get()) { - errors.push_back("NPU firmware is incompatible."); - } - if (j.contains("kernel_ok") && !j["kernel_ok"].get()) { - errors.push_back("Kernel version is incompatible."); - } - - if (j.contains("memlock_ok") && !j["memlock_ok"].get()) { - errors.push_back("Memlock limits are too low."); - } - - if (j.contains("npu_driver_ok") && !j["npu_driver_ok"].get()) { - errors.push_back("NPU driver version is too old."); - } - - if (errors.empty()) { - error_message = "NPU validation failed."; - } else { - error_message = ""; - for (size_t i = 0; i < errors.size(); ++i) { - error_message += errors[i] + (i == errors.size() - 1 ? "" : " "); - } - } - return false; - } - } - } catch (...) 
{ - // Fallback for non-JSON output or parsing error - } - - if (exit_code != 0) { - error_message = "flm validate failed with exit code " + std::to_string(exit_code); - return false; - } - - error_message.clear(); - return true; -} } // namespace utils::lemon