Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ set(LEMON_BACKENDS
"flm|fastflowlm"
"ryzenai-llm|ryzenai"
"vllm|vllm"
"vllm-omni|vllm_omni"
"cloud|cloud"
)

Expand Down
15 changes: 15 additions & 0 deletions src/app/src/renderer/ChatWindow.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,20 @@ const ChatWindow: React.FC<ChatWindowProps> = ({ isVisible, width }) => {
return labels.includes('chat-transcription');
}, [selectedModel, modelsData]);

// Whether the selected chat model can speak natively, i.e. return audio inline
// in the /chat/completions response (message.audio) — e.g. vLLM-Omni's
// Qwen2.5-Omni. Signalled by the 'chat-speech' label; this is separate from
// TTS models served over /audio/speech (the "tts" label). For a collection,
// one speech-capable component is enough. Drives the voice-output toggle in
// the chat input.
const isAudioOutput = useMemo(() => {
  if (!selectedModel) return false;

  const info = modelsData[selectedModel];
  // Single place for the label test so both branches below stay in sync.
  const hasSpeechLabel = (entry: typeof info): boolean =>
    (entry?.labels || []).includes('chat-speech');

  return isCollectionModel(info)
    ? getCollectionComponents(info).some(component => hasSpeechLabel(modelsData[component]))
    : hasSpeechLabel(info);
}, [selectedModel, modelsData]);

const isCollectionSelected = useMemo(() => {
if (!selectedModel) return false;
return isCollectionModel(modelsData[selectedModel]);
Expand Down Expand Up @@ -319,6 +333,7 @@ const ChatWindow: React.FC<ChatWindowProps> = ({ isVisible, width }) => {
{...sharedProps}
isVision={isVision}
isAudioChat={isAudioChat}
isAudioOutput={isAudioOutput}
currentLoadedModel={currentLoadedModel}
setCurrentLoadedModel={setCurrentLoadedModel}
collectionMode={collectionMode}
Expand Down
117 changes: 107 additions & 10 deletions src/app/src/renderer/components/panels/LLMChatPanel.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ interface LLMChatPanelProps {
appSettings: AppSettings | null;
isVision: boolean;
isAudioChat?: boolean;
isAudioOutput?: boolean;
collectionMode?: boolean;
currentLoadedModel: string | null;
setCurrentLoadedModel: React.Dispatch<React.SetStateAction<string | null>>;
Expand All @@ -152,7 +153,7 @@ interface LLMChatPanelProps {
const LLMChatPanel: React.FC<LLMChatPanelProps> = ({
isBusy, isPreFlight, isInferring, activeModality,
runPreFlight, reset, showError, appSettings,
isVision, isAudioChat = false, collectionMode = false, currentLoadedModel, setCurrentLoadedModel,
isVision, isAudioChat = false, isAudioOutput = false, collectionMode = false, currentLoadedModel, setCurrentLoadedModel,
onNewChat, onUnloadCollection,
}) => {
const { selectedModel, modelsData } = useModels();
Expand All @@ -165,6 +166,10 @@ const LLMChatPanel: React.FC<LLMChatPanelProps> = ({

const [messages, setMessages] = useState<Message[]>([]);
const [inputValue, setInputValue] = useState('');
// Native audio (voice) output for omni chat models. Opt-in: generating speech
// is slow/expensive, so it defaults off even when the model supports it.
const [voiceEnabled, setVoiceEnabled] = useState(false);
const [selectedVoice, setSelectedVoice] = useState('Chelsie');
const [editingIndex, setEditingIndex] = useState<number | null>(null);
const [editingValue, setEditingValue] = useState('');
const [editingImages, setEditingImages] = useState<string[]>([]);
Expand Down Expand Up @@ -901,6 +906,64 @@ const LLMChatPanel: React.FC<LLMChatPanelProps> = ({
if (!accumulatedContent) throw new Error('No content received from stream');
};

/**
 * Non-streaming chat for omni models with native voice output.
 *
 * Sends the regular chat request body with `stream: false` and audio
 * modalities enabled, then replaces the last entry of `messages` (the pending
 * assistant placeholder) with the reply text plus an audio artifact when the
 * server returned one. The audio comes back as `message.audio` — vLLM-Omni
 * splits text and audio across separate choices while OpenAI puts audio on the
 * same choice, so every choice is scanned and the first text / first audio
 * found wins. Streaming is disabled here because the SSE path only assembles
 * text deltas.
 *
 * @param messageHistory Conversation so far, passed to buildChatRequestBody.
 * @throws Error on a non-2xx status, an `error` payload, an empty `choices`
 *   array, or choices that carry neither text nor audio.
 */
const handleAudioChat = async (messageHistory: Message[]): Promise<void> => {
  // Captured before the request so we know whether this call triggered a load.
  const isNewModelLoad = currentLoadedModel !== chatModelName;
  const requestBody = {
    ...buildChatRequestBody(messageHistory),
    stream: false,
    modalities: ['text', 'audio'],
    voice: selectedVoice, // vLLM-Omni reads top-level voice
    audio: { voice: selectedVoice, format: 'wav' }, // OpenAI-standard (ignored if unsupported)
  };

  const response = await serverFetch('/chat/completions', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(requestBody),
    signal: abortControllerRef.current?.signal,
  });
  if (!response.ok) throw new Error(`HTTP error! status: ${response.status}`);
  const data = await response.json();
  if (data.error) throw new Error(data.error.message || 'LLM returned error');
  if (!data.choices?.length) throw new Error('LLM returned empty response');

  // The server answered, so the model is loaded regardless of what the
  // choices contain; announce that before validating the payload.
  setCurrentLoadedModel(chatModelName);
  if (isNewModelLoad) {
    window.dispatchEvent(new CustomEvent('modelLoadEnd', { detail: { modelId: selectedModel } }));
  }

  let text = '';
  let audio: { data?: string; format?: string } | undefined;
  for (const choice of data.choices) {
    // Tolerate null/malformed choice entries instead of crashing on them.
    const msg = choice?.message;
    if (!msg) continue;
    if (!text && typeof msg.content === 'string' && msg.content) text = msg.content;
    if (!audio && msg.audio?.data) audio = msg.audio;
    if (text && audio) break; // both parts found; nothing left to look for
  }

  // Mirror the streaming path, which rejects an empty reply rather than
  // leaving a blank assistant bubble in the transcript.
  if (!text && !audio) throw new Error('LLM returned empty response');

  const { content: cleanText, thinking } = extractThinking(text);
  const artifacts: Artifact[] = [];
  if (audio?.data) {
    artifacts.push({ type: 'audio', data: audio.data, mime: mimeForFormat(audio.format || 'wav') });
  }
  const finalContent = buildFinalContent(cleanText, artifacts);
  setMessages(prev => {
    // Overwrite the trailing entry (the in-flight assistant message).
    const newMessages = [...prev];
    newMessages[newMessages.length - 1] = {
      role: 'assistant',
      content: finalContent,
      thinking: thinking || undefined,
    };
    return newMessages;
  });
};

const sendMessage = async (textOverride?: string) => {
const textToSend = typeof textOverride === 'string' ? textOverride : inputValue;
// When called from voice auto-submit, `isBusy` may still be stale-true
Expand Down Expand Up @@ -952,6 +1015,8 @@ const LLMChatPanel: React.FC<LLMChatPanelProps> = ({
try {
if (collectionMode && lemonadeTools) {
await handleCollectionChat(messageHistory);
} else if (isAudioOutput && voiceEnabled) {
await handleAudioChat(messageHistory);
} else {
await handleStreamingResponse(messageHistory);
}
Expand Down Expand Up @@ -1031,6 +1096,8 @@ const LLMChatPanel: React.FC<LLMChatPanelProps> = ({
try {
if (collectionMode && lemonadeTools) {
await handleCollectionChat(messageHistory);
} else if (isAudioOutput && voiceEnabled) {
await handleAudioChat(messageHistory);
} else {
await handleStreamingResponse(messageHistory);
}
Expand Down Expand Up @@ -1437,15 +1504,45 @@ const LLMChatPanel: React.FC<LLMChatPanelProps> = ({
sendDisabled={!inputValue.trim() && uploadedImages.length === 0 && uploadedAudio.length === 0}
modelSelector={<ModelSelector disabled={isBusy} />}
rightControls={
  // Voice-output controls (rendered only when the model reports native audio
  // output) followed by the microphone record button.
  <>
    {isAudioOutput && (
      <div className="voice-output-controls" style={{ display: 'flex', alignItems: 'center', gap: 4 }}>
        <button
          type="button"
          className="voice-toggle-button"
          onClick={() => setVoiceEnabled(v => !v)}
          disabled={isBusy}
          aria-label="Voice output"
          title={voiceEnabled
            ? 'Voice output on — click to disable'
            : 'Voice output off — click to enable (generates speech; slower)'}
          aria-pressed={voiceEnabled}
        >
          {voiceEnabled ? '🔊' : '🔈'}
        </button>
        {voiceEnabled && (
          <select
            className="voice-select"
            value={selectedVoice}
            onChange={(e) => setSelectedVoice(e.target.value)}
            disabled={isBusy}
            aria-label="Voice"
            title="Voice"
          >
            <option value="Chelsie">Chelsie</option>
            <option value="Ethan">Ethan</option>
          </select>
        )}
      </div>
    )}
    <RecordButton
      disabled={isBusy}
      inputValue={inputValue}
      setInputValue={setInputValue}
      textareaRef={inputTextareaRef}
      onError={showError}
      runPreFlight={runPreFlight}
      reset={reset}
    />
  </>
}
leftControls={
<>
Expand Down
54 changes: 54 additions & 0 deletions src/cpp/include/lemon/backends/vllm_omni/vllm_omni.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#pragma once

#include "lemon/backends/backend_descriptor.h"

namespace lemon {
namespace backends {
namespace vllm_omni {

// The vllm-omni backend descriptor (plain data). Header-only `inline const` so
// it links into both the lemonade CLI and lemond without a separate source file.
//
// vLLM-Omni serves omni / any-to-any multimodal models (Qwen-Omni today) with
// ROCm acceleration. It is a pure-Python layer on top of the same base vLLM +
// PyTorch + Triton the plain `vllm` backend uses, shipped as a SEPARATE release
// artifact (vllm-omni*), so it gets its own recipe + version pin. gfx1151 only:
// that is the qualified, hardware-validated omni target.
// Positional aggregate initializer. The /*name*/ tags are comments only and
// are not checked by the compiler, so the values must stay in the same order
// as the BackendDescriptor fields (presumably declared in
// lemon/backends/backend_descriptor.h — verify when adding or moving a field).
inline const BackendDescriptor descriptor = {
/*recipe*/ "vllm-omni",
/*display_name*/ "vLLM-Omni ROCm (experimental)",
/*binary*/ "vllm-omni-server",
/*config_section*/ "", // defaults to recipe
/*default_device*/ DEVICE_GPU,
/*slot_policy*/ SlotPolicy::Standard,
/*selectable_backend*/ true,
/*uses_ctx_size*/ true,
/*dynamic_models*/ false,
// NOTE(review): the first option key mixes '-' and '_' ("vllm-omni_backend")
// while the second uses only '_' ("vllm_omni_args") — confirm both spellings
// match what the option lookup code expects.
/*options*/ {
{"vllm-omni_backend", "--vllm-omni", "", "BACKEND",
"vLLM-Omni backend to use", "vLLM-Omni Options"},
{"vllm_omni_args", "--vllm-omni-args", "", "ARGS",
"Custom arguments to pass to vllm-omni-server", "vLLM-Omni Options"},
},
// Single support row: rocm on linux, amd_gpu restricted to gfx1151.
/*support*/ {
{"rocm", {"linux"}, {{"amd_gpu", {"gfx1151"}}}, "Strix Halo iGPU (gfx1151)"},
},
/*default_labels*/ {},
/*required_checkpoints*/ {"main"},
/*modality*/ "Omni (text, audio, vision)",
/*experimental*/ true,
/*web_display_name*/ "",
/*rocm_channels*/ {}, // single rocm artifact, no stable/nightly channels
/*exposes_prometheus_metrics*/ false,
/*rocm_requires_cwsr_fix*/ true,
/*version_policy*/ VersionPolicy::Exact,
/*self_manages_downloads*/ false,
/*takes_args*/ true,
/*arg_variants*/ {},
/*bin_variants*/ {},
/*config_extra*/ nlohmann::json::object(),
};

} // namespace vllm_omni
} // namespace backends
} // namespace lemon
52 changes: 52 additions & 0 deletions src/cpp/include/lemon/backends/vllm_omni/vllm_omni_server.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#pragma once

#include "lemon/backends/backend_registry.h"

#include "lemon/wrapped_server.h"
#include "lemon/backends/backend_utils.h"
#include <filesystem>
#include <cstdint>
#include <string>

namespace lemon {
namespace backends {

// vLLM-Omni backend. Launches the bundle's `vllm-omni-server` on an omni model
// with a single-GPU deploy config and forwards OpenAI-compatible chat requests.
// Native voice / vision ride through transparently: audio output comes back as
// a second choice (`choices[1].message.audio.data`) in the chat response.
// WrappedServer subclass for the vllm-omni backend. This header only declares
// the interface; the behaviour lives in the corresponding source file.
class VLLMOmniServer : public WrappedServer {
public:
// Static helper returning the InstallParams for a given backend/version pair.
// NOTE(review): presumably `backend` is a support key such as "rocm" and
// `version` the pinned release string — confirm against the implementation.
static InstallParams get_install_params(const std::string& backend, const std::string& version);

// The ModelManager / BackendManager pointers are passed in raw; ownership is
// not expressed by this signature (presumably non-owning — TODO confirm).
VLLMOmniServer(const std::string& log_level,
ModelManager* model_manager,
BackendManager* backend_manager);

~VLLMOmniServer() override;

// Overrides WrappedServer::load. `do_not_upgrade` defaults to false.
void load(const std::string& model_name,
const ModelInfo& model_info,
const RecipeOptions& options,
bool do_not_upgrade = false) override;

// Overrides WrappedServer::unload.
void unload() override;

// ICompletionServer implementation (audio/vision flow through the chat body).
// Each takes the request as JSON and returns the response as JSON.
json chat_completion(const json& request) override;
json completion(const json& request) override;
json responses(const json& request) override;

private:
// NOTE(review): presumably a directory of ROCm shim files prepared for the
// launched server process — how it is populated/cleaned is not visible here.
std::filesystem::path rocm_shim_dir_;
// Starts at 0. NOTE(review): presumably the context length resolved during
// load(), with 0 meaning "not set" — confirm in the implementation.
int64_t max_model_len_ = 0;
};

// Registration entry points for the vllm-omni backend.
// NOTE(review): std::unique_ptr is used below but <memory> is not included
// directly by this header — presumably it arrives transitively via
// backend_registry.h / wrapped_server.h; consider including it explicitly.
namespace vllm_omni {
// Factory for the vllm-omni backend (constructs the server class — lemond only).
std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
// Accessors returning pointers to the backend's BackendSpec and BackendOps.
// NOTE(review): presumably pointers to static-lifetime objects that callers
// must not free — confirm in the implementation.
const BackendSpec* spec();
const BackendOps* ops();
} // namespace vllm_omni
} // namespace backends
} // namespace lemon
3 changes: 3 additions & 0 deletions src/cpp/resources/backend_versions.json
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@
"vllm": {
"rocm": "vllm0.20.1-rocm7.12.0"
},
"vllm-omni": {
"rocm": "vllm-omni0.23.0rc1-rocm7.14.0"
},
"rocm_asset_families": {
"comment": "Maps a concrete ROCm ISA (as detected on the device) to the family target name the GitHub release repos publish assets under. ISAs not listed here (e.g. gfx908, gfx90a, which ship as individual assets) are used verbatim. Add a new GPU's ISA here when its release asset is published under a family name.",
"gfx1200": "gfx120X",
Expand Down
59 changes: 59 additions & 0 deletions src/cpp/resources/omni_deploy/qwen2_5_omni_1gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Single-GPU deploy config for Qwen2.5-Omni on the vllm-omni backend.
#
# vLLM-Omni's upstream default places the talker stage on cuda:1, which is fatal
# on a single-GPU box (that engine core sees 0 devices -> "DP adjusted local
# rank out of bounds"). This colocates all three stages (thinker / talker /
# code2wav) on device 0 with rebalanced gpu_memory_utilization summing < 1.0.
async_chunk: false

# Three stages, every one pinned to devices "0". Their gpu_memory_utilization
# values are 0.5 + 0.25 + 0.15 = 0.90. All stages run one sequence at a time
# (max_num_seqs: 1) with enforce_eager on and prefix caching off, and share
# seed 42 and max_tokens 2048 in their default sampling params.
stages:
# Stage 0 gets the largest memory share (0.5) and uses temperature 0.0.
- stage_id: 0 # thinker (AR LLM)
max_num_batched_tokens: 32768
max_num_seqs: 1
gpu_memory_utilization: 0.5
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
mm_processor_cache_gb: 0
devices: "0"
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 2048
seed: 42
repetition_penalty: 1.1

# Stage 1 is the only stage with non-zero temperature (0.9) and with
# top_p / top_k restrictions (0.8 / 40).
- stage_id: 1 # talker (AR) — colocated on device 0
max_num_batched_tokens: 32768
max_num_seqs: 1
gpu_memory_utilization: 0.25
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
devices: "0"
default_sampling_params:
temperature: 0.9
top_p: 0.8
top_k: 40
max_tokens: 2048
seed: 42
repetition_penalty: 1.05

# Stage 2 gets the smallest memory share (0.15) and additionally turns off
# flashinfer autotune and async scheduling.
- stage_id: 2 # code2wav (DiT) — colocated on device 0
max_num_batched_tokens: 32768
max_num_seqs: 1
gpu_memory_utilization: 0.15
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
enable_flashinfer_autotune: false
async_scheduling: false
devices: "0"
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 2048
seed: 42
repetition_penalty: 1.1
Loading
Loading