lemonade-sdk · ramkrishna2910 · Jul 2, 2026 · Jul 2, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -63,6 +63,7 @@ set(LEMON_BACKENDS
     "flm|fastflowlm"
     "ryzenai-llm|ryzenai"
     "vllm|vllm"
+    "vllm-omni|vllm_omni"
     "cloud|cloud"
 )
 

diff --git a/src/app/src/renderer/ChatWindow.tsx b/src/app/src/renderer/ChatWindow.tsx
@@ -104,6 +104,20 @@ const ChatWindow: React.FC<ChatWindowProps> = ({ isVisible, width }) => {
     return labels.includes('chat-transcription');
   }, [selectedModel, modelsData]);
 
+  // True when the selected chat model can reply with native voice, i.e. audio
+  // returned inside the /chat/completions response (message.audio), as with
+  // vLLM-Omni's Qwen2.5-Omni. Separate from TTS models behind /audio/speech
+  // (the "tts" label). Gates the voice-output toggle in the chat input.
+  const isAudioOutput = useMemo(() => {
+    if (!selectedModel) return false;
+    const selectedInfo = modelsData[selectedModel];
+    // A collection qualifies if any component is labelled; else check the model itself.
+    const modelNames = isCollectionModel(selectedInfo)
+      ? getCollectionComponents(selectedInfo)
+      : [selectedModel];
+    return modelNames.some(name => (modelsData[name]?.labels || []).includes('chat-speech'));
+  }, [selectedModel, modelsData]);
+
   const isCollectionSelected = useMemo(() => {
     if (!selectedModel) return false;
     return isCollectionModel(modelsData[selectedModel]);
@@ -319,6 +333,7 @@ const ChatWindow: React.FC<ChatWindowProps> = ({ isVisible, width }) => {
           {...sharedProps}
           isVision={isVision}
           isAudioChat={isAudioChat}
+          isAudioOutput={isAudioOutput}
           currentLoadedModel={currentLoadedModel}
           setCurrentLoadedModel={setCurrentLoadedModel}
           collectionMode={collectionMode}

diff --git a/src/app/src/renderer/components/panels/LLMChatPanel.tsx b/src/app/src/renderer/components/panels/LLMChatPanel.tsx
@@ -142,6 +142,7 @@ interface LLMChatPanelProps {
   appSettings: AppSettings | null;
   isVision: boolean;
   isAudioChat?: boolean;
+  isAudioOutput?: boolean;
   collectionMode?: boolean;
   currentLoadedModel: string | null;
   setCurrentLoadedModel: React.Dispatch<React.SetStateAction<string | null>>;
@@ -152,7 +153,7 @@ interface LLMChatPanelProps {
 const LLMChatPanel: React.FC<LLMChatPanelProps> = ({
   isBusy, isPreFlight, isInferring, activeModality,
   runPreFlight, reset, showError, appSettings,
-  isVision, isAudioChat = false, collectionMode = false, currentLoadedModel, setCurrentLoadedModel,
+  isVision, isAudioChat = false, isAudioOutput = false, collectionMode = false, currentLoadedModel, setCurrentLoadedModel,
   onNewChat, onUnloadCollection,
 }) => {
   const { selectedModel, modelsData } = useModels();
@@ -165,6 +166,10 @@ const LLMChatPanel: React.FC<LLMChatPanelProps> = ({
 
   const [messages, setMessages] = useState<Message[]>([]);
   const [inputValue, setInputValue] = useState('');
+  // Native audio (voice) output for omni chat models. Opt-in: generating speech
+  // is slow/expensive, so it defaults off even when the model supports it.
+  const [voiceEnabled, setVoiceEnabled] = useState(false);
+  const [selectedVoice, setSelectedVoice] = useState('Chelsie');
   const [editingIndex, setEditingIndex] = useState<number | null>(null);
   const [editingValue, setEditingValue] = useState('');
   const [editingImages, setEditingImages] = useState<string[]>([]);
@@ -901,6 +906,64 @@ const LLMChatPanel: React.FC<LLMChatPanelProps> = ({
     if (!accumulatedContent) throw new Error('No content received from stream');
   };
 
+  /**
+   * Non-streaming chat for omni models with native voice output. Audio arrives
+   * as `message.audio`; vLLM-Omni splits text and audio across choices, OpenAI
+   * keeps them on one, so all choices are scanned. Rendered via
+   * buildFinalContent -> the MessageAudio player. Not streamed: the SSE path
+   * only assembles text deltas. Throws if the reply has neither text nor audio.
+   */
+  const handleAudioChat = async (messageHistory: Message[]): Promise<void> => {
+    const isNewModelLoad = currentLoadedModel !== chatModelName;
+    const requestBody = {
+      ...buildChatRequestBody(messageHistory),
+      stream: false,
+      modalities: ['text', 'audio'],
+      voice: selectedVoice,                           // vLLM-Omni reads top-level voice
+      audio: { voice: selectedVoice, format: 'wav' }, // OpenAI-standard (ignored if unsupported)
+    };
+
+    const response = await serverFetch('/chat/completions', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(requestBody),
+      signal: abortControllerRef.current?.signal,
+    });
+    if (!response.ok) throw new Error(`HTTP error! status: ${response.status}`);
+    const data = await response.json();
+    if (data.error) throw new Error(data.error.message || 'LLM returned error');
+    if (!data.choices?.length) throw new Error('LLM returned empty response');
+
+    setCurrentLoadedModel(chatModelName);
+    if (isNewModelLoad) window.dispatchEvent(new CustomEvent('modelLoadEnd', { detail: { modelId: selectedModel } }));
+
+    let text = '';
+    let audio: { data?: string; format?: string } | undefined;
+    for (const choice of data.choices) {
+      const msg = choice.message || {};
+      if (!text && typeof msg.content === 'string' && msg.content) text = msg.content;
+      if (!audio && msg.audio?.data) audio = msg.audio;
+    }
+    // An all-empty reply would render as a blank bubble; report it as an error.
+    if (!text && !audio) throw new Error('LLM returned empty response');
+
+    const { content: cleanText, thinking } = extractThinking(text);
+    const artifacts: Artifact[] = [];
+    if (audio?.data) {
+      artifacts.push({ type: 'audio', data: audio.data, mime: mimeForFormat(audio.format || 'wav') });
+    }
+    const finalContent = buildFinalContent(cleanText, artifacts);
+    setMessages(prev => {
+      const newMessages = [...prev];
+      newMessages[newMessages.length - 1] = {
+        role: 'assistant',
+        content: finalContent,
+        thinking: thinking || undefined,
+      };
+      return newMessages;
+    });
+  };
+
   const sendMessage = async (textOverride?: string) => {
     const textToSend = typeof textOverride === 'string' ? textOverride : inputValue;
     // When called from voice auto-submit, `isBusy` may still be stale-true
@@ -952,6 +1015,8 @@ const LLMChatPanel: React.FC<LLMChatPanelProps> = ({
     try {
       if (collectionMode && lemonadeTools) {
         await handleCollectionChat(messageHistory);
+      } else if (isAudioOutput && voiceEnabled) {
+        await handleAudioChat(messageHistory);
       } else {
         await handleStreamingResponse(messageHistory);
       }
@@ -1031,6 +1096,8 @@ const LLMChatPanel: React.FC<LLMChatPanelProps> = ({
     try {
       if (collectionMode && lemonadeTools) {
         await handleCollectionChat(messageHistory);
+      } else if (isAudioOutput && voiceEnabled) {
+        await handleAudioChat(messageHistory);
       } else {
         await handleStreamingResponse(messageHistory);
       }
@@ -1437,15 +1504,45 @@ const LLMChatPanel: React.FC<LLMChatPanelProps> = ({
             sendDisabled={!inputValue.trim() && uploadedImages.length === 0 && uploadedAudio.length === 0}
             modelSelector={<ModelSelector disabled={isBusy} />}
             rightControls={
-              <RecordButton
-                disabled={isBusy}
-                inputValue={inputValue}
-                setInputValue={setInputValue}
-                textareaRef={inputTextareaRef}
-                onError={showError}
-                runPreFlight={runPreFlight}
-                reset={reset}
-              />
+              <>
+                {isAudioOutput && (
+                  <div className="voice-output-controls" style={{ display: 'flex', alignItems: 'center', gap: 4 }}>
+                    <button
+                      type="button"
+                      className="voice-toggle-button"
+                      onClick={() => setVoiceEnabled(v => !v)}
+                      disabled={isBusy}
+                      title={voiceEnabled
+                        ? 'Voice output on — click to disable'
+                        : 'Voice output off — click to enable (generates speech; slower)'}
+                      aria-pressed={voiceEnabled}
+                    >
+                      {voiceEnabled ? '🔊' : '🔈'}
+                    </button>
+                    {voiceEnabled && (
+                      <select
+                        className="voice-select"
+                        value={selectedVoice}
+                        onChange={(e) => setSelectedVoice(e.target.value)}
+                        disabled={isBusy}
+                        title="Voice"
+                      >
+                        <option value="Chelsie">Chelsie</option>
+                        <option value="Ethan">Ethan</option>
+                      </select>
+                    )}
+                  </div>
+                )}
+                <RecordButton
+                  disabled={isBusy}
+                  inputValue={inputValue}
+                  setInputValue={setInputValue}
+                  textareaRef={inputTextareaRef}
+                  onError={showError}
+                  runPreFlight={runPreFlight}
+                  reset={reset}
+                />
+              </>
             }
             leftControls={
               <>

diff --git a/src/cpp/include/lemon/backends/vllm_omni/vllm_omni.h b/src/cpp/include/lemon/backends/vllm_omni/vllm_omni.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+namespace vllm_omni {
+
+// The vllm-omni backend descriptor (plain data). Header-only `inline const` so
+// it links into both the lemonade CLI and lemond without a separate source file.
+//
+// vLLM-Omni serves omni / any-to-any multimodal models (Qwen-Omni today) with
+// ROCm acceleration. It is a pure-Python layer on top of the same base vLLM +
+// PyTorch + Triton the plain `vllm` backend uses, shipped as a SEPARATE release
+// artifact (vllm-omni*), so it gets its own recipe + version pin. gfx1151 only:
+// that is the qualified, hardware-validated omni target.
+inline const BackendDescriptor descriptor = {
+    /*recipe*/          "vllm-omni",
+    /*display_name*/    "vLLM-Omni ROCm (experimental)",
+    /*binary*/          "vllm-omni-server",
+    /*config_section*/  "",  // defaults to recipe
+    /*default_device*/  DEVICE_GPU,
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ true,
+    /*uses_ctx_size*/   true,
+    /*dynamic_models*/  false,
+    /*options*/ {
+        {"vllm-omni_backend", "--vllm-omni", "", "BACKEND",  // NOTE(review): hyphenated key, unlike "vllm_omni_args" below — confirm intended
+         "vLLM-Omni backend to use", "vLLM-Omni Options"},
+        {"vllm_omni_args", "--vllm-omni-args", "", "ARGS",
+         "Custom arguments to pass to vllm-omni-server", "vLLM-Omni Options"},
+    },
+    /*support*/ {
+        {"rocm", {"linux"}, {{"amd_gpu", {"gfx1151"}}}, "Strix Halo iGPU (gfx1151)"},
+    },
+    /*default_labels*/  {},
+    /*required_checkpoints*/ {"main"},
+    /*modality*/        "Omni (text, audio, vision)",
+    /*experimental*/    true,
+    /*web_display_name*/ "",
+    /*rocm_channels*/   {},  // single rocm artifact, no stable/nightly channels
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ true,
+    /*version_policy*/  VersionPolicy::Exact,  // presumably pins the exact version from backend_versions.json — verify
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      true,
+    /*arg_variants*/    {},
+    /*bin_variants*/    {},
+    /*config_extra*/    nlohmann::json::object(),
+};
+
+}  // namespace vllm_omni
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/vllm_omni/vllm_omni_server.h b/src/cpp/include/lemon/backends/vllm_omni/vllm_omni_server.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/backends/backend_utils.h"
+#include <filesystem>
+#include <cstdint>
+#include <string>
+
+namespace lemon {
+namespace backends {
+
+// vLLM-Omni backend. Launches the bundle's `vllm-omni-server` on an omni model
+// with a single-GPU deploy config and forwards OpenAI-compatible chat requests.
+// Native voice / vision ride through transparently: audio output comes back as
+// a second choice (`choices[1].message.audio.data`) in the chat response.
+class VLLMOmniServer : public WrappedServer {
+public:
+    static InstallParams get_install_params(const std::string& backend, const std::string& version);
+
+    VLLMOmniServer(const std::string& log_level,
+                   ModelManager* model_manager,
+                   BackendManager* backend_manager);
+
+    ~VLLMOmniServer() override;
+
+    void load(const std::string& model_name,
+             const ModelInfo& model_info,
+             const RecipeOptions& options,
+             bool do_not_upgrade = false) override;
+
+    void unload() override;
+
+    // ICompletionServer implementation (audio/vision flow through the chat body).
+    json chat_completion(const json& request) override;
+    json completion(const json& request) override;
+    json responses(const json& request) override;
+
+private:
+    std::filesystem::path rocm_shim_dir_;  // presumably a ROCm shim directory prepared for the server process — TODO confirm
+    int64_t max_model_len_ = 0;  // starts at 0; presumably the loaded model's context limit — TODO confirm
+};
+
+namespace vllm_omni {
+// Factory for the vllm-omni backend (constructs the server class — lemond only).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);  // NOTE(review): uses std::unique_ptr but <memory> is not included directly in this header
+const BackendSpec* spec();
+const BackendOps* ops();
+}  // namespace vllm_omni
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/resources/backend_versions.json b/src/cpp/resources/backend_versions.json
@@ -92,6 +92,9 @@
   "vllm": {
     "rocm": "vllm0.20.1-rocm7.12.0"
   },
+  "vllm-omni": {
+    "rocm": "vllm-omni0.23.0rc1-rocm7.14.0"
+  },
   "rocm_asset_families": {
     "comment": "Maps a concrete ROCm ISA (as detected on the device) to the family target name the GitHub release repos publish assets under. ISAs not listed here (e.g. gfx908, gfx90a, which ship as individual assets) are used verbatim. Add a new GPU's ISA here when its release asset is published under a family name.",
     "gfx1200": "gfx120X",

diff --git a/src/cpp/resources/omni_deploy/qwen2_5_omni_1gpu.yaml b/src/cpp/resources/omni_deploy/qwen2_5_omni_1gpu.yaml
@@ -0,0 +1,59 @@
+# Single-GPU deploy config for Qwen2.5-Omni on the vllm-omni backend.
+#
+# vLLM-Omni's upstream default places the talker stage on cuda:1, which is fatal
+# on a single-GPU box (that engine core sees 0 devices -> "DP adjusted local
+# rank out of bounds"). This config colocates all three stages (thinker / talker /
+# code2wav) on device 0, with gpu_memory_utilization 0.5 + 0.25 + 0.15 = 0.9 (< 1.0).
+async_chunk: false
+
+stages:
+  - stage_id: 0          # thinker (AR LLM)
+    max_num_batched_tokens: 32768
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.5
+    enforce_eager: true
+    trust_remote_code: true
+    enable_prefix_caching: false
+    mm_processor_cache_gb: 0
+    devices: "0"
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 2048
+      seed: 42
+      repetition_penalty: 1.1
+
+  - stage_id: 1          # talker (AR) — colocated on device 0
+    max_num_batched_tokens: 32768
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.25
+    enforce_eager: true
+    trust_remote_code: true
+    enable_prefix_caching: false
+    devices: "0"
+    default_sampling_params:
+      temperature: 0.9
+      top_p: 0.8
+      top_k: 40
+      max_tokens: 2048
+      seed: 42
+      repetition_penalty: 1.05
+
+  - stage_id: 2          # code2wav (DiT) — colocated on device 0
+    max_num_batched_tokens: 32768
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.15
+    enforce_eager: true
+    trust_remote_code: true
+    enable_prefix_caching: false
+    enable_flashinfer_autotune: false
+    async_scheduling: false
+    devices: "0"
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 2048
+      seed: 42
+      repetition_penalty: 1.1