diff --git a/CMakeLists.txt b/CMakeLists.txt index 74463508e..919e6028b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,6 +63,7 @@ set(LEMON_BACKENDS "flm|fastflowlm" "ryzenai-llm|ryzenai" "vllm|vllm" + "vllm-omni|vllm_omni" "cloud|cloud" ) diff --git a/src/app/src/renderer/ChatWindow.tsx b/src/app/src/renderer/ChatWindow.tsx index a35e45ed2..8c81762cd 100644 --- a/src/app/src/renderer/ChatWindow.tsx +++ b/src/app/src/renderer/ChatWindow.tsx @@ -104,6 +104,20 @@ const ChatWindow: React.FC = ({ isVisible, width }) => { return labels.includes('chat-transcription'); }, [selectedModel, modelsData]); + // A chat model that produces audio (native voice) directly in the + // /chat/completions response (message.audio) — e.g. vLLM-Omni's Qwen2.5-Omni. + // Distinct from TTS models that serve /audio/speech (the "tts" label). Enables + // the voice-output toggle in the chat input. + const isAudioOutput = useMemo(() => { + if (!selectedModel) return false; + const info = modelsData[selectedModel]; + if (isCollectionModel(info)) { + return getCollectionComponents(info).some(component => + (modelsData[component]?.labels || []).includes('chat-speech')); + } + return (info?.labels || []).includes('chat-speech'); + }, [selectedModel, modelsData]); + const isCollectionSelected = useMemo(() => { if (!selectedModel) return false; return isCollectionModel(modelsData[selectedModel]); @@ -319,6 +333,7 @@ const ChatWindow: React.FC = ({ isVisible, width }) => { {...sharedProps} isVision={isVision} isAudioChat={isAudioChat} + isAudioOutput={isAudioOutput} currentLoadedModel={currentLoadedModel} setCurrentLoadedModel={setCurrentLoadedModel} collectionMode={collectionMode} diff --git a/src/app/src/renderer/components/panels/LLMChatPanel.tsx b/src/app/src/renderer/components/panels/LLMChatPanel.tsx index c18f3a625..4e89147eb 100644 --- a/src/app/src/renderer/components/panels/LLMChatPanel.tsx +++ b/src/app/src/renderer/components/panels/LLMChatPanel.tsx @@ -142,6 +142,7 @@ interface 
LLMChatPanelProps { appSettings: AppSettings | null; isVision: boolean; isAudioChat?: boolean; + isAudioOutput?: boolean; collectionMode?: boolean; currentLoadedModel: string | null; setCurrentLoadedModel: React.Dispatch>; @@ -152,7 +153,7 @@ interface LLMChatPanelProps { const LLMChatPanel: React.FC = ({ isBusy, isPreFlight, isInferring, activeModality, runPreFlight, reset, showError, appSettings, - isVision, isAudioChat = false, collectionMode = false, currentLoadedModel, setCurrentLoadedModel, + isVision, isAudioChat = false, isAudioOutput = false, collectionMode = false, currentLoadedModel, setCurrentLoadedModel, onNewChat, onUnloadCollection, }) => { const { selectedModel, modelsData } = useModels(); @@ -165,6 +166,10 @@ const LLMChatPanel: React.FC = ({ const [messages, setMessages] = useState([]); const [inputValue, setInputValue] = useState(''); + // Native audio (voice) output for omni chat models. Opt-in: generating speech + // is slow/expensive, so it defaults off even when the model supports it. + const [voiceEnabled, setVoiceEnabled] = useState(false); + const [selectedVoice, setSelectedVoice] = useState('Chelsie'); const [editingIndex, setEditingIndex] = useState(null); const [editingValue, setEditingValue] = useState(''); const [editingImages, setEditingImages] = useState([]); @@ -901,6 +906,64 @@ const LLMChatPanel: React.FC = ({ if (!accumulatedContent) throw new Error('No content received from stream'); }; + /** + * Non-streaming chat for omni models with native voice output. The audio + * comes back as `message.audio` in the response — vLLM-Omni splits text and + * audio across separate choices; OpenAI puts audio on the same choice, so we + * scan all choices. Rendered via buildFinalContent -> the MessageAudio player. + * Streaming is disabled here because the SSE path only assembles text deltas. 
+   */
+  const handleAudioChat = async (messageHistory: Message[]): Promise<void> => {
+    // Captured before the request so the load-end event is only fired when this
+    // call is the one that switched the loaded model.
+    const isNewModelLoad = currentLoadedModel !== chatModelName;
+    const requestBody = {
+      ...buildChatRequestBody(messageHistory),
+      stream: false,
+      modalities: ['text', 'audio'],
+      voice: selectedVoice, // vLLM-Omni reads top-level voice
+      audio: { voice: selectedVoice, format: 'wav' }, // OpenAI-standard (ignored if unsupported)
+    };
+
+    const response = await serverFetch('/chat/completions', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(requestBody),
+      signal: abortControllerRef.current?.signal,
+    });
+    if (!response.ok) throw new Error(`HTTP error! status: ${response.status}`);
+    const data = await response.json();
+    if (data.error) throw new Error(data.error.message || 'LLM returned error');
+    if (!data.choices?.length) throw new Error('LLM returned empty response');
+
+    setCurrentLoadedModel(chatModelName);
+    if (isNewModelLoad) {
+      window.dispatchEvent(new CustomEvent('modelLoadEnd', { detail: { modelId: selectedModel } }));
+    }
+
+    // Text and audio may arrive on different choices, so take the first
+    // non-empty text and the first audio payload found across all of them.
+    let text = '';
+    let audio: { data?: string; format?: string } | undefined;
+    for (const choice of data.choices) {
+      const msg = choice?.message ?? {};
+      if (!text && typeof msg.content === 'string' && msg.content) text = msg.content;
+      if (!audio && msg.audio?.data) audio = msg.audio;
+    }
+
+    // Same contract as the streaming path, which throws when the stream carried
+    // no content: a reply with neither text nor audio is an error rather than
+    // something to store silently as the assistant message.
+    if (!text && !audio) throw new Error('No content received from model');
+
+    const { content: cleanText, thinking } = extractThinking(text);
+    const artifacts: Artifact[] = [];
+    if (audio?.data) {
+      // An empty/missing format falls back to wav, the format requested above.
+      artifacts.push({ type: 'audio', data: audio.data, mime: mimeForFormat(audio.format || 'wav') });
+    }
+    const finalContent = buildFinalContent(cleanText, artifacts);
+    // Overwrite the last message in the list with the finished assistant reply.
+    setMessages(prev => {
+      const newMessages = [...prev];
+      newMessages[newMessages.length - 1] = {
+        role: 'assistant',
+        content: finalContent,
+        thinking: thinking || undefined,
+      };
+      return newMessages;
+    });
+  };
+
   const sendMessage = async (textOverride?: string) => {
     const textToSend = typeof textOverride === 'string' ?
textOverride : inputValue; // When called from voice auto-submit, `isBusy` may still be stale-true @@ -952,6 +1015,8 @@ const LLMChatPanel: React.FC = ({ try { if (collectionMode && lemonadeTools) { await handleCollectionChat(messageHistory); + } else if (isAudioOutput && voiceEnabled) { + await handleAudioChat(messageHistory); } else { await handleStreamingResponse(messageHistory); } @@ -1031,6 +1096,8 @@ const LLMChatPanel: React.FC = ({ try { if (collectionMode && lemonadeTools) { await handleCollectionChat(messageHistory); + } else if (isAudioOutput && voiceEnabled) { + await handleAudioChat(messageHistory); } else { await handleStreamingResponse(messageHistory); } @@ -1437,15 +1504,45 @@ const LLMChatPanel: React.FC = ({ sendDisabled={!inputValue.trim() && uploadedImages.length === 0 && uploadedAudio.length === 0} modelSelector={} rightControls={ - + <> + {isAudioOutput && ( +
+ + {voiceEnabled && ( + + )} +
+ )} + + } leftControls={ <> diff --git a/src/cpp/include/lemon/backends/vllm_omni/vllm_omni.h b/src/cpp/include/lemon/backends/vllm_omni/vllm_omni.h new file mode 100644 index 000000000..61165af5d --- /dev/null +++ b/src/cpp/include/lemon/backends/vllm_omni/vllm_omni.h @@ -0,0 +1,54 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { +namespace vllm_omni { + +// The vllm-omni backend descriptor (plain data). Header-only `inline const` so +// it links into both the lemonade CLI and lemond without a separate source file. +// +// vLLM-Omni serves omni / any-to-any multimodal models (Qwen-Omni today) with +// ROCm acceleration. It is a pure-Python layer on top of the same base vLLM + +// PyTorch + Triton the plain `vllm` backend uses, shipped as a SEPARATE release +// artifact (vllm-omni*), so it gets its own recipe + version pin. gfx1151 only: +// that is the qualified, hardware-validated omni target. +inline const BackendDescriptor descriptor = { + /*recipe*/ "vllm-omni", + /*display_name*/ "vLLM-Omni ROCm (experimental)", + /*binary*/ "vllm-omni-server", + /*config_section*/ "", // defaults to recipe + /*default_device*/ DEVICE_GPU, + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ true, + /*uses_ctx_size*/ true, + /*dynamic_models*/ false, + /*options*/ { + {"vllm-omni_backend", "--vllm-omni", "", "BACKEND", + "vLLM-Omni backend to use", "vLLM-Omni Options"}, + {"vllm_omni_args", "--vllm-omni-args", "", "ARGS", + "Custom arguments to pass to vllm-omni-server", "vLLM-Omni Options"}, + }, + /*support*/ { + {"rocm", {"linux"}, {{"amd_gpu", {"gfx1151"}}}, "Strix Halo iGPU (gfx1151)"}, + }, + /*default_labels*/ {}, + /*required_checkpoints*/ {"main"}, + /*modality*/ "Omni (text, audio, vision)", + /*experimental*/ true, + /*web_display_name*/ "", + /*rocm_channels*/ {}, // single rocm artifact, no stable/nightly channels + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ true, + 
/*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + /*takes_args*/ true, + /*arg_variants*/ {}, + /*bin_variants*/ {}, + /*config_extra*/ nlohmann::json::object(), +}; + +} // namespace vllm_omni +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/vllm_omni/vllm_omni_server.h b/src/cpp/include/lemon/backends/vllm_omni/vllm_omni_server.h new file mode 100644 index 000000000..7b8fc34fc --- /dev/null +++ b/src/cpp/include/lemon/backends/vllm_omni/vllm_omni_server.h @@ -0,0 +1,52 @@ +#pragma once + +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include "lemon/backends/backend_utils.h" +#include +#include +#include + +namespace lemon { +namespace backends { + +// vLLM-Omni backend. Launches the bundle's `vllm-omni-server` on an omni model +// with a single-GPU deploy config and forwards OpenAI-compatible chat requests. +// Native voice / vision ride through transparently: audio output comes back as +// a second choice (`choices[1].message.audio.data`) in the chat response. +class VLLMOmniServer : public WrappedServer { +public: + static InstallParams get_install_params(const std::string& backend, const std::string& version); + + VLLMOmniServer(const std::string& log_level, + ModelManager* model_manager, + BackendManager* backend_manager); + + ~VLLMOmniServer() override; + + void load(const std::string& model_name, + const ModelInfo& model_info, + const RecipeOptions& options, + bool do_not_upgrade = false) override; + + void unload() override; + + // ICompletionServer implementation (audio/vision flow through the chat body). + json chat_completion(const json& request) override; + json completion(const json& request) override; + json responses(const json& request) override; + +private: + std::filesystem::path rocm_shim_dir_; + int64_t max_model_len_ = 0; +}; + +namespace vllm_omni { +// Factory for the vllm-omni backend (constructs the server class — lemond only). 
+std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); +const BackendOps* ops(); +} // namespace vllm_omni +} // namespace backends +} // namespace lemon diff --git a/src/cpp/resources/backend_versions.json b/src/cpp/resources/backend_versions.json index 30fe27d1f..723e44db3 100644 --- a/src/cpp/resources/backend_versions.json +++ b/src/cpp/resources/backend_versions.json @@ -92,6 +92,9 @@ "vllm": { "rocm": "vllm0.20.1-rocm7.12.0" }, + "vllm-omni": { + "rocm": "vllm-omni0.23.0rc1-rocm7.14.0" + }, "rocm_asset_families": { "comment": "Maps a concrete ROCm ISA (as detected on the device) to the family target name the GitHub release repos publish assets under. ISAs not listed here (e.g. gfx908, gfx90a, which ship as individual assets) are used verbatim. Add a new GPU's ISA here when its release asset is published under a family name.", "gfx1200": "gfx120X", diff --git a/src/cpp/resources/omni_deploy/qwen2_5_omni_1gpu.yaml b/src/cpp/resources/omni_deploy/qwen2_5_omni_1gpu.yaml new file mode 100644 index 000000000..7912f6f88 --- /dev/null +++ b/src/cpp/resources/omni_deploy/qwen2_5_omni_1gpu.yaml @@ -0,0 +1,59 @@ +# Single-GPU deploy config for Qwen2.5-Omni on the vllm-omni backend. +# +# vLLM-Omni's upstream default places the talker stage on cuda:1, which is fatal +# on a single-GPU box (that engine core sees 0 devices -> "DP adjusted local +# rank out of bounds"). This colocates all three stages (thinker / talker / +# code2wav) on device 0 with rebalanced gpu_memory_utilization summing < 1.0. 
+async_chunk: false + +stages: + - stage_id: 0 # thinker (AR LLM) + max_num_batched_tokens: 32768 + max_num_seqs: 1 + gpu_memory_utilization: 0.5 + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + mm_processor_cache_gb: 0 + devices: "0" + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 2048 + seed: 42 + repetition_penalty: 1.1 + + - stage_id: 1 # talker (AR) — colocated on device 0 + max_num_batched_tokens: 32768 + max_num_seqs: 1 + gpu_memory_utilization: 0.25 + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + devices: "0" + default_sampling_params: + temperature: 0.9 + top_p: 0.8 + top_k: 40 + max_tokens: 2048 + seed: 42 + repetition_penalty: 1.05 + + - stage_id: 2 # code2wav (DiT) — colocated on device 0 + max_num_batched_tokens: 32768 + max_num_seqs: 1 + gpu_memory_utilization: 0.15 + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + enable_flashinfer_autotune: false + async_scheduling: false + devices: "0" + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 2048 + seed: 42 + repetition_penalty: 1.1 diff --git a/src/cpp/resources/server_models.json b/src/cpp/resources/server_models.json index e88923c79..42cc086f8 100644 --- a/src/cpp/resources/server_models.json +++ b/src/cpp/resources/server_models.json @@ -1658,6 +1658,19 @@ ], "size": 1.77 }, + "Qwen2.5-Omni-3B-vLLM-Omni": { + "checkpoint": "Qwen/Qwen2.5-Omni-3B", + "recipe": "vllm-omni", + "deploy_config": "resources/omni_deploy/qwen2_5_omni_1gpu.yaml", + "suggested": true, + "labels": [ + "vision", + "chat-transcription", + "chat-speech", + "omni" + ], + "size": 7.0 + }, "Qwen3.5-2B-FP16-vLLM": { "checkpoint": "Qwen/Qwen3.5-2B", "recipe": "vllm", diff --git a/src/cpp/server/backends/vllm_omni/vllm_omni_server.cpp b/src/cpp/server/backends/vllm_omni/vllm_omni_server.cpp new file mode 100644 index 000000000..aa8a46bc6 --- /dev/null +++ 
b/src/cpp/server/backends/vllm_omni/vllm_omni_server.cpp @@ -0,0 +1,307 @@ +#include "lemon/backends/vllm_omni/vllm_omni_server.h" +#include "lemon/backends/vllm_omni/vllm_omni.h" +#include "lemon/backends/backend_registry.h" +#include "lemon/backends/backend_utils.h" +#include "lemon/model_manager.h" +#include "lemon/runtime_config.h" +#include "lemon/system_info.h" +#include "lemon/utils/http_client.h" +#include "lemon/utils/path_utils.h" +#include "lemon/utils/process_manager.h" +#include +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; +using namespace lemon::utils; + +namespace lemon { +namespace backends { + +// Omni first-boot is slow: 3 stage engine cores + multimodal weight load + +// per-kernel Triton JIT for the gfx target. Give generous headroom. +static constexpr long VLLM_OMNI_READY_TIMEOUT_SECONDS = 1800; + +// --- ROCm startup shim (mirrors the vllm backend) --------------------------- +// Disables vLLM's pynvml probe so it does not activate the CUDA platform plugin +// on hybrid AMD/NVIDIA hosts. ROCm PyTorch exposes HIP devices via +// torch.cuda/torch.accelerator, so no CUDA_VISIBLE_DEVICES is set. + +static std::string with_prepended_pythonpath(const fs::path& dir) { + const char separator = ':'; + const char* current = std::getenv("PYTHONPATH"); + return dir.string() + ((current && current[0] != '\0') + ? 
std::string(1, separator) + current
+        : "");
+}
+
+// Creates a uniquely named "vllm-omni-rocm-shim-<nonce>-<hex>" directory under
+// the runtime directory and returns its path. The name combines a steady-clock
+// tick count with a random value; up to 8 candidate names are tried before
+// giving up with std::runtime_error.
+// NOTE(review): the static_cast below has no target type and the distribution
+// has no template argument in this copy of the patch — confirm against the
+// original source.
+static fs::path create_omni_rocm_shim_dir() {
+    fs::path runtime_base = path_from_utf8(get_runtime_dir());
+    std::random_device rd;
+    std::uniform_int_distribution dis(0, 0xFFFFFF);
+
+    std::error_code ec;
+    for (int attempt = 0; attempt < 8; ++attempt) {
+        auto nonce = static_cast(
+            std::chrono::steady_clock::now().time_since_epoch().count());
+        std::ostringstream suffix;
+        suffix << "vllm-omni-rocm-shim-" << nonce << "-" << std::hex << dis(rd);
+        fs::path candidate = runtime_base / suffix.str();
+        ec.clear();
+        // create_directory reports false when nothing was created; try another name.
+        if (fs::create_directory(candidate, ec)) {
+            return candidate;
+        }
+    }
+    throw std::runtime_error("Failed to create temporary directory for vLLM-Omni ROCm startup shim");
+}
+
+// Writes sitecustomize.py into a freshly created shim directory. If the file
+// cannot be opened, the directory is removed again before throwing.
+static fs::path write_omni_rocm_startup_shim() {
+    fs::path dir = create_omni_rocm_shim_dir();
+    fs::path file = dir / "sitecustomize.py";
+
+    std::ofstream out(file, std::ios::trunc);
+    if (!out.is_open()) {
+        std::error_code ec;
+        fs::remove_all(dir, ec);
+        throw std::runtime_error("Unable to write vLLM-Omni ROCm startup shim: " + file.string());
+    }
+
+    out << R"PY(# Auto-generated by Lemonade for the vLLM-Omni ROCm backend. 
+import sys + +try: + import vllm.utils.import_utils as _vllm_import_utils + + def _disabled_import_pynvml(): + raise ModuleNotFoundError( + "vLLM NVML probe disabled by Lemonade for the ROCm backend" + ) + + _vllm_import_utils.import_pynvml = _disabled_import_pynvml +except (ImportError, AttributeError) as exc: + print( + f"Lemonade warning: unable to disable vLLM NVML probe: {exc}", + file=sys.stderr, + ) +)PY";
+
+    // A failed write leaves nothing usable behind: drop the directory and report.
+    if (!out.good()) {
+        std::error_code ec;
+        fs::remove_all(dir, ec);
+        throw std::runtime_error("Failed to write vLLM-Omni ROCm startup shim: " + file.string());
+    }
+    return dir;
+}
+
+// Best-effort removal of the shim directory. Does nothing when shim_dir is
+// empty; a removal failure is only logged as a warning. shim_dir is cleared in
+// either case, so calling this again afterwards is a no-op.
+static void cleanup_omni_rocm_shim_dir(fs::path& shim_dir) {
+    if (shim_dir.empty()) {
+        return;
+    }
+    std::error_code ec;
+    fs::remove_all(shim_dir, ec);
+    if (ec) {
+        LOG(WARNING, "vLLM-Omni") << "Failed to remove ROCm startup shim directory: "
+                                  << shim_dir << " (" << ec.message() << ")" << std::endl;
+    }
+    shim_dir.clear();
+}
+
+// For the "rocm" backend only: discards any previous shim directory, writes a
+// new one, stores its path in shim_dir and appends a PYTHONPATH entry to
+// env_vars that lists the shim directory first. Any other backend leaves both
+// env_vars and shim_dir untouched.
+// NOTE(review): the element type of env_vars is missing in this copy of the
+// patch — confirm against the original source.
+static void configure_omni_rocm_env(
+    const std::string& backend,
+    std::vector>& env_vars,
+    fs::path& shim_dir) {
+    if (backend != "rocm") {
+        return;
+    }
+    cleanup_omni_rocm_shim_dir(shim_dir);
+    shim_dir = write_omni_rocm_startup_shim();
+    env_vars.push_back({"PYTHONPATH", with_prepended_pythonpath(shim_dir)});
+}
+
+// Split a free-form user args string on whitespace. MVP: does not honor quotes. 
+static std::vector split_user_args(const std::string& args) { + std::vector out; + std::istringstream stream(args); + std::string token; + while (stream >> token) { + out.push_back(token); + } + return out; +} + +// ---------------------------------------------------------------------------- + +InstallParams VLLMOmniServer::get_install_params(const std::string& backend, const std::string& version) { + InstallParams params; + + if (backend == "rocm") { + params.repo = "lemonade-sdk/vllm-rocm"; + std::string target_arch = + SystemInfo::rocm_asset_family(SystemInfo::get_rocm_arch()); + if (target_arch.empty()) { + throw std::runtime_error( + SystemInfo::get_unsupported_backend_error("vllm-omni", "rocm") + ); + } +#ifdef __linux__ + // One release per GPU target: release tag is {version}-{target_arch}, + // e.g. vllm-omni0.23.0rc1-rocm7.14.0-gfx1151. The version pin in + // backend_versions.json already carries the vllm-omni prefix. + std::string release_tag = version + "-" + target_arch; + params.version_override = release_tag; + params.filename = release_tag + "-x64.tar.gz"; +#else + throw std::runtime_error("vLLM-Omni ROCm is only supported on Linux"); +#endif + } else { + throw std::runtime_error("vLLM-Omni backend '" + backend + "' is not supported. 
Supported: rocm"); + } + + return params; +} + +VLLMOmniServer::VLLMOmniServer(const std::string& log_level, ModelManager* model_manager, BackendManager* backend_manager) + : WrappedServer("vllm-omni-server", log_level, model_manager, backend_manager) { +} + +VLLMOmniServer::~VLLMOmniServer() { + unload(); +} + +void VLLMOmniServer::load(const std::string& model_name, + const ModelInfo& model_info, + const RecipeOptions& options, + bool do_not_upgrade) { + LOG(INFO, "vLLM-Omni") << "Loading model: " << model_name << std::endl; + + std::string omni_backend = options.get_option("vllm-omni_backend"); + std::string omni_args = options.get_option("vllm_omni_args"); + int ctx_size = options.get_option("ctx_size"); + max_model_len_ = ctx_size; + + RuntimeConfig::validate_backend_choice("vllm-omni", omni_backend); + + backend_manager_->install_backend(vllm_omni::spec()->recipe, omni_backend); + + // vLLM-Omni uses HuggingFace model IDs, not local file paths. + std::string model_id = model_info.checkpoint(); + if (model_id.empty()) { + throw std::runtime_error("Model checkpoint (HuggingFace ID) not found for: " + model_name); + } + + // Resolve the single-GPU deploy config. Per-model override via the + // `deploy_config` key in server_models.json; falls back to the Qwen2.5-Omni + // config. The upstream deploy defaults target multi-GPU hosts and put a + // stage on cuda:1, which is fatal on a single-GPU box — so a colocating + // deploy config is required. 
+ std::string deploy_rel = model_info.extra( + "deploy_config", "resources/omni_deploy/qwen2_5_omni_1gpu.yaml"); + std::string deploy_path = utils::get_resource_path(deploy_rel); + if (!fs::exists(path_from_utf8(deploy_path))) { + throw std::runtime_error("vLLM-Omni deploy config not found: " + deploy_path); + } + + port_ = choose_port(); + + std::string executable = BackendUtils::get_backend_binary_path(*vllm_omni::spec(), omni_backend); + + std::vector args; + args.push_back("serve"); + args.push_back(model_id); + args.push_back("--omni"); + args.push_back("--deploy-config"); + args.push_back(deploy_path); + args.push_back("--host"); + args.push_back("127.0.0.1"); + args.push_back("--port"); + args.push_back(std::to_string(port_)); + // Serve using the Lemonade model name so forwarded requests match. + args.push_back("--served-model-name"); + args.push_back(model_name); + args.push_back("--max-model-len"); + args.push_back(std::to_string(ctx_size)); + + if (!omni_args.empty()) { + LOG(DEBUG, "vLLM-Omni") << "Adding user arguments: " << omni_args << std::endl; + for (const auto& tok : split_user_args(omni_args)) { + args.push_back(tok); + } + } + + LOG(INFO, "vLLM-Omni") << "Starting vllm-omni-server on port " << port_ << "..." << std::endl; + + std::vector> env_vars; + configure_omni_rocm_env(omni_backend, env_vars, rocm_shim_dir_); + // Enable ROCm flash attention (the launcher script handles LD_LIBRARY_PATH). + env_vars.push_back({"FLASH_ATTENTION_TRITON_AMD_ENABLE", "TRUE"}); + // Prevent system/user Python packages from leaking into the bundled env. + env_vars.push_back({"PYTHONNOUSERSITE", "1"}); + + bool inherit_output = (log_level_ == "info") || is_debug(); + set_process_handle(ProcessManager::start_process(executable, args, "", inherit_output, true, env_vars)); + + // Omni serves /v1/models once all stages register; it has no reliable + // /health. Multi-stage boot + JIT can take minutes on cold start. 
+    if (!wait_for_ready("/v1/models", VLLM_OMNI_READY_TIMEOUT_SECONDS)) {
+        // Startup failed: stop the child process if one is tracked, remove the
+        // shim directory and reset the cached context length before throwing.
+        const ProcessHandle handle = consume_process_handle_for_cleanup();
+        if (has_process_handle(handle)) {
+            ProcessManager::stop_process(handle);
+        }
+        cleanup_omni_rocm_shim_dir(rocm_shim_dir_);
+        max_model_len_ = 0;
+        std::string err = "vllm-omni-server failed to start within timeout";
+        // Append the kernel hint only when needs_gfx1151_cwsr_fix() reports true.
+        if (needs_gfx1151_cwsr_fix()) {
+            err += ". Your kernel may be missing the gfx1151 CWSR fix — "
+                   "see https://lemonade-server.ai/gfx1151_linux.html";
+        }
+        throw std::runtime_error(err);
+    }
+
+    LOG(DEBUG, "vLLM-Omni") << "Model loaded on port " << port_ << std::endl;
+}
+
+// Stops the backend watchdog, stops the server process if a handle is still
+// tracked, removes the ROCm shim directory and resets max_model_len_. Also
+// called from the destructor.
+void VLLMOmniServer::unload() {
+    stop_backend_watchdog();
+    LOG(INFO, "vLLM-Omni") << "Unloading model..." << std::endl;
+
+    const ProcessHandle handle = consume_process_handle_for_cleanup();
+    if (has_process_handle(handle)) {
+        ProcessManager::stop_process(handle);
+    }
+    cleanup_omni_rocm_shim_dir(rocm_shim_dir_);
+    max_model_len_ = 0;
+}
+
+// The three completion entry points pass the request body through unchanged to
+// the matching /v1 endpoint via forward_request().
+json VLLMOmniServer::chat_completion(const json& request) {
+    return forward_request("/v1/chat/completions", request);
+}
+
+json VLLMOmniServer::completion(const json& request) {
+    return forward_request("/v1/completions", request);
+}
+
+json VLLMOmniServer::responses(const json& request) {
+    return forward_request("/v1/responses", request);
+}
+
+} // namespace backends
+} // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace vllm_omni {
+
+// Registry hooks for the vllm-omni backend: create() builds the server from the
+// backend context, spec() derives the spec from `descriptor`, and ops() returns
+// the default backend operations.
+std::unique_ptr create(const BackendContext& ctx) {
+    return make_server(ctx);
+}
+
+const BackendSpec* spec() { return make_spec(descriptor, /*split=*/true); }
+const BackendOps* ops() { return default_backend_ops(); }
+
+} // namespace vllm_omni
+} // namespace backends
+} // namespace lemon