diff --git a/app/src/App.tsx b/app/src/App.tsx
index 8a218742..458686c8 100644
--- a/app/src/App.tsx
+++ b/app/src/App.tsx
@@ -94,10 +94,11 @@ function App() {
serverStartingRef.current = true;
const isRemote = useServerStore.getState().mode === 'remote';
+ const customModelsDir = useServerStore.getState().customModelsDir;
console.log(`Production mode: Starting bundled server... (remote: ${isRemote})`);
platform.lifecycle
- .startServer(isRemote)
+ .startServer(isRemote, customModelsDir)
.then((serverUrl) => {
console.log('Server is ready at:', serverUrl);
// Update the server URL in the store with the dynamically assigned port
diff --git a/app/src/components/AudioTab/AudioTab.tsx b/app/src/components/AudioTab/AudioTab.tsx
index 395922c8..44058bc5 100644
--- a/app/src/components/AudioTab/AudioTab.tsx
+++ b/app/src/components/AudioTab/AudioTab.tsx
@@ -23,8 +23,8 @@ import {
import { apiClient } from '@/lib/api/client';
import { BOTTOM_SAFE_AREA_PADDING } from '@/lib/constants/ui';
import { cn } from '@/lib/utils/cn';
-import { usePlayerStore } from '@/stores/playerStore';
import { usePlatform } from '@/platform/PlatformContext';
+import { usePlayerStore } from '@/stores/playerStore';
interface AudioDevice {
id: string;
@@ -129,7 +129,7 @@ export function AudioTab() {
if (await confirm('Delete this channel?')) {
deleteChannel.mutate(channelId);
}
- }
+ };
const allChannels = channels || [];
const allDevices = devices || [];
@@ -168,7 +168,7 @@ export function AudioTab() {
) : (
-
+
{allChannels.map((channel) => {
const isSelected = selectedChannelId === channel.id;
return (
@@ -343,7 +343,9 @@ export function AudioTab() {
- {platform.metadata.isTauri ? 'No audio devices found' : 'Audio device selection requires Tauri'}
+ {platform.metadata.isTauri
+ ? 'No audio devices found'
+ : 'Audio device selection requires Tauri'}
)}
diff --git a/app/src/components/ModelsTab/ModelsTab.tsx b/app/src/components/ModelsTab/ModelsTab.tsx
index 4ef9f93b..78905406 100644
--- a/app/src/components/ModelsTab/ModelsTab.tsx
+++ b/app/src/components/ModelsTab/ModelsTab.tsx
@@ -2,7 +2,7 @@ import { ModelManagement } from '@/components/ServerSettings/ModelManagement';
export function ModelsTab() {
return (
-
+
);
diff --git a/app/src/components/ServerSettings/GenerationSettings.tsx b/app/src/components/ServerSettings/GenerationSettings.tsx
index b9b45de3..048bd424 100644
--- a/app/src/components/ServerSettings/GenerationSettings.tsx
+++ b/app/src/components/ServerSettings/GenerationSettings.tsx
@@ -1,4 +1,5 @@
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card';
+import { Checkbox } from '@/components/ui/checkbox';
import { Slider } from '@/components/ui/slider';
import { useServerStore } from '@/stores/serverStore';
@@ -7,6 +8,8 @@ export function GenerationSettings() {
const setMaxChunkChars = useServerStore((state) => state.setMaxChunkChars);
const crossfadeMs = useServerStore((state) => state.crossfadeMs);
const setCrossfadeMs = useServerStore((state) => state.setCrossfadeMs);
+ const normalizeAudio = useServerStore((state) => state.normalizeAudio);
+ const setNormalizeAudio = useServerStore((state) => state.setNormalizeAudio);
return (
@@ -64,6 +67,25 @@ export function GenerationSettings() {
Blends audio between chunks to smooth transitions. Set to 0 for a hard cut.
+
+
+
+
+
+ Normalize audio
+
+
+ Adjusts output volume to a consistent level across generations.
+
+
+
diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx
index 37e8b229..b9c6c09c 100644
--- a/app/src/components/ServerSettings/ModelManagement.tsx
+++ b/app/src/components/ServerSettings/ModelManagement.tsx
@@ -7,6 +7,7 @@ import {
CircleX,
Download,
ExternalLink,
+ FolderOpen,
HardDrive,
Heart,
Loader2,
@@ -41,6 +42,8 @@ import { useToast } from '@/components/ui/use-toast';
import { apiClient } from '@/lib/api/client';
import type { ActiveDownloadTask, HuggingFaceModelInfo, ModelStatus } from '@/lib/api/types';
import { useModelDownloadToast } from '@/lib/hooks/useModelDownloadToast';
+import { usePlatform } from '@/platform/PlatformContext';
+import { useServerStore } from '@/stores/serverStore';
async function fetchHuggingFaceModelInfo(repoId: string): Promise
{
const response = await fetch(`https://huggingface.co/api/models/${repoId}`);
@@ -48,6 +51,29 @@ async function fetchHuggingFaceModelInfo(repoId: string): Promise = {
+ 'qwen-tts-1.7B':
+ 'High-quality multilingual TTS by Alibaba. Supports 10 languages with natural prosody and voice cloning from short reference audio.',
+ 'qwen-tts-0.6B':
+ 'Lightweight version of Qwen TTS. Same language support with faster inference, ideal for lower-end hardware.',
+ luxtts:
+ 'Lightweight ZipVoice-based TTS designed for high quality voice cloning and 48kHz speech generation at speeds exceeding 150x realtime.',
+ 'chatterbox-tts':
+ 'Production-grade open source TTS by Resemble AI. Supports 23 languages with voice cloning and emotion exaggeration control.',
+ 'chatterbox-turbo':
+ 'Streamlined 350M parameter TTS by Resemble AI. High-quality English speech with less compute and VRAM than larger models.',
+ 'whisper-base':
+ 'Smallest Whisper model (74M parameters). Fast transcription with moderate accuracy.',
+ 'whisper-small':
+ 'Whisper Small (244M parameters). Good balance of speed and accuracy for transcription.',
+ 'whisper-medium':
+ 'Whisper Medium (769M parameters). Higher accuracy transcription at moderate speed.',
+ 'whisper-large':
+ 'Whisper Large (1.5B parameters). Best accuracy for speech-to-text across multiple languages.',
+ 'whisper-turbo':
+ 'Whisper Large v3 Turbo. Pruned for significantly faster inference while maintaining near-large accuracy.',
+};
+
function formatDownloads(n: number): string {
if (n >= 1_000_000) return `${(n / 1_000_000).toFixed(1)}M`;
if (n >= 1_000) return `${(n / 1_000).toFixed(1)}k`;
@@ -85,6 +111,18 @@ function formatBytes(bytes: number): string {
export function ModelManagement() {
const { toast } = useToast();
const queryClient = useQueryClient();
+ const platform = usePlatform();
+ const customModelsDir = useServerStore((state) => state.customModelsDir);
+ const setCustomModelsDir = useServerStore((state) => state.setCustomModelsDir);
+ const [migrating, setMigrating] = useState(false);
+ const [migrationProgress, setMigrationProgress] = useState<{
+ current: number;
+ total: number;
+ progress: number;
+ filename?: string;
+ status: string;
+ } | null>(null);
+ const [pendingMigrateDir, setPendingMigrateDir] = useState(null);
const [downloadingModel, setDownloadingModel] = useState(null);
const [downloadingDisplayName, setDownloadingDisplayName] = useState(null);
const [consoleOpen, setConsoleOpen] = useState(false);
@@ -104,6 +142,12 @@ export function ModelManagement() {
refetchInterval: 5000,
});
+ const { data: cacheDir } = useQuery({
+ queryKey: ['modelsCacheDir'],
+ queryFn: () => apiClient.getModelsCacheDir(),
+ staleTime: 1000 * 60 * 5,
+ });
+
const { data: activeTasks } = useQuery({
queryKey: ['activeTasks'],
queryFn: () => apiClient.getActiveTasks(),
@@ -382,6 +426,87 @@ export function ModelManagement() {
+ {/* Model storage location */}
+ {platform.metadata.isTauri && cacheDir && (
+
+
+
+
Storage location
+
+ {cacheDir.path}
+
+
+
+ {
+ try {
+ const { open } = await import('@tauri-apps/plugin-shell');
+ await open(cacheDir.path);
+ } catch {
+ toast({ title: 'Failed to open model folder', variant: 'destructive' });
+ }
+ }}
+ >
+
+ Open
+
+ {
+ try {
+ const { open: openDialog } = await import('@tauri-apps/plugin-dialog');
+ const selected = await openDialog({
+ directory: true,
+ title: 'Choose model storage folder',
+ });
+ if (!selected) return;
+ const newDir =
+ typeof selected === 'string' ? selected : (selected as { path: string }).path;
+ if (!newDir) return;
+ setPendingMigrateDir(newDir);
+ } catch {
+ toast({ title: 'Failed to open folder picker', variant: 'destructive' });
+ }
+ }}
+ disabled={migrating}
+ >
+ {migrating ? (
+
+ ) : (
+
+ )}
+ {migrating ? 'Migrating...' : 'Change'}
+
+ {customModelsDir && (
+ {
+ setCustomModelsDir(null);
+ toast({ title: 'Reset to default location. Restarting server...' });
+ await platform.lifecycle.restartServer('');
+ queryClient.invalidateQueries();
+ }}
+ >
+
+ Reset
+
+ )}
+
+
+
+ )}
+
{/* Model list */}
{isLoading ? (
@@ -457,9 +582,7 @@ export function ModelManagement() {
{formatSize(model.size_mb)}
)}
- {!model.downloaded && !isDownloading && !hasError && (
- Not downloaded
- )}
+
@@ -571,13 +694,6 @@ export function ModelManagement() {
Error
)}
- {!freshSelectedModel.downloaded &&
- !selectedState?.isDownloading &&
- !selectedState?.hasError && (
-
- Not downloaded
-
- )}
{/* HuggingFace model card info */}
@@ -588,6 +704,13 @@ export function ModelManagement() {
)}
+ {/* Description */}
+ {MODEL_DESCRIPTIONS[freshSelectedModel.model_name] && (
+
+ {MODEL_DESCRIPTIONS[freshSelectedModel.model_name]}
+
+ )}
+
{hfModelInfo && (
{/* Pipeline tag + author */}
@@ -810,6 +933,126 @@ export function ModelManagement() {
+
+ {/* Migration confirmation dialog */}
+
!open && setPendingMigrateDir(null)}
+ >
+
+
+ Move models to new location?
+
+ The server will shut down while models are being moved to the new folder. It will
+ restart automatically once the migration is complete.
+
+
+
+ {pendingMigrateDir}
+
+
+ Cancel
+ {
+ if (!pendingMigrateDir) return;
+ const newDir = pendingMigrateDir;
+ setPendingMigrateDir(null);
+ setMigrating(true);
+ setMigrationProgress({
+ current: 0,
+ total: 0,
+ progress: 0,
+ status: 'downloading',
+ filename: 'Preparing...',
+ });
+ try {
+ // Start the migration (background task)
+ await apiClient.migrateModels(newDir);
+
+ // Connect to SSE for progress
+ await new Promise((resolve, reject) => {
+ const es = new EventSource(apiClient.getMigrationProgressUrl());
+ es.onmessage = (event) => {
+ try {
+ const data = JSON.parse(event.data);
+ setMigrationProgress(data);
+ if (data.status === 'complete') {
+ es.close();
+ resolve();
+ } else if (data.status === 'error') {
+ es.close();
+ reject(new Error(data.error || 'Migration failed'));
+ }
+ } catch {
+ /* ignore parse errors */
+ }
+ };
+ es.onerror = () => {
+ es.close();
+ reject(new Error('Lost connection during migration'));
+ };
+ });
+
+ setCustomModelsDir(newDir);
+ setMigrationProgress({
+ current: 1,
+ total: 1,
+ progress: 100,
+ status: 'complete',
+ filename: 'Restarting server...',
+ });
+ await platform.lifecycle.restartServer(newDir);
+ queryClient.invalidateQueries();
+ toast({ title: 'Models moved successfully' });
+ } catch (e) {
+ toast({
+ title: 'Migration failed',
+ description: e instanceof Error ? e.message : 'Failed to migrate models',
+ variant: 'destructive',
+ });
+ } finally {
+ setMigrating(false);
+ setMigrationProgress(null);
+ }
+ }}
+ >
+ Move Models
+
+
+
+
+
+ {/* Migration progress overlay */}
+ {migrating && migrationProgress && (
+
+
+
+
+
Moving models
+
+ {migrationProgress.status === 'complete'
+ ? 'Restarting server...'
+ : 'The server is offline while models are being moved.'}
+
+
+ {migrationProgress.total > 0 && (
+
+
+
+ {migrationProgress.filename}
+
+ {formatBytes(migrationProgress.current)} /{' '}
+ {formatBytes(migrationProgress.total)}
+
+
+
+ )}
+
+
+ )}
);
}
diff --git a/app/src/components/ServerTab/ServerTab.tsx b/app/src/components/ServerTab/ServerTab.tsx
index 000ec5b7..d9954c90 100644
--- a/app/src/components/ServerTab/ServerTab.tsx
+++ b/app/src/components/ServerTab/ServerTab.tsx
@@ -2,12 +2,18 @@ import { ConnectionForm } from '@/components/ServerSettings/ConnectionForm';
import { GenerationSettings } from '@/components/ServerSettings/GenerationSettings';
import { GpuAcceleration } from '@/components/ServerSettings/GpuAcceleration';
import { UpdateStatus } from '@/components/ServerSettings/UpdateStatus';
+import { BOTTOM_SAFE_AREA_PADDING } from '@/lib/constants/ui';
+import { cn } from '@/lib/utils/cn';
import { usePlatform } from '@/platform/PlatformContext';
+import { usePlayerStore } from '@/stores/playerStore';
export function ServerTab() {
const platform = usePlatform();
+ const isPlayerVisible = !!usePlayerStore((state) => state.audioUrl);
return (
-
+
diff --git a/app/src/lib/api/client.ts b/app/src/lib/api/client.ts
index e522ef48..dbc4cbdc 100644
--- a/app/src/lib/api/client.ts
+++ b/app/src/lib/api/client.ts
@@ -316,6 +316,21 @@ class ApiClient {
return this.request
('/models/status');
}
+ async getModelsCacheDir(): Promise<{ path: string }> {
+ return this.request<{ path: string }>('/models/cache-dir');
+ }
+
+ async migrateModels(destination: string): Promise<{ source: string; destination: string }> {
+ return this.request('/models/migrate', {
+ method: 'POST',
+ body: JSON.stringify({ destination }),
+ });
+ }
+
+ getMigrationProgressUrl(): string {
+ return `${this.getBaseUrl()}/models/migrate/progress`;
+ }
+
async triggerModelDownload(modelName: string): Promise<{ message: string }> {
console.log(
'[API] triggerModelDownload called for:',
diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts
index af5a6c15..3a17eba3 100644
--- a/app/src/lib/api/types.ts
+++ b/app/src/lib/api/types.ts
@@ -38,6 +38,7 @@ export interface GenerationRequest {
instruct?: string;
max_chunk_chars?: number;
crossfade_ms?: number;
+ normalize?: boolean;
}
export interface GenerationResponse {
diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts
index 66effd22..f79f7ab2 100644
--- a/app/src/lib/hooks/useGenerationForm.ts
+++ b/app/src/lib/hooks/useGenerationForm.ts
@@ -34,6 +34,7 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
const setIsGenerating = useGenerationStore((state) => state.setIsGenerating);
const maxChunkChars = useServerStore((state) => state.maxChunkChars);
const crossfadeMs = useServerStore((state) => state.crossfadeMs);
+ const normalizeAudio = useServerStore((state) => state.normalizeAudio);
const [downloadingModelName, setDownloadingModelName] = useState(null);
const [downloadingDisplayName, setDownloadingDisplayName] = useState(null);
@@ -115,6 +116,7 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
instruct: isQwen ? data.instruct || undefined : undefined,
max_chunk_chars: maxChunkChars,
crossfade_ms: crossfadeMs,
+ normalize: normalizeAudio,
});
toast({
diff --git a/app/src/lib/hooks/useRestoreActiveTasks.tsx b/app/src/lib/hooks/useRestoreActiveTasks.tsx
index 063e6bcb..191cca94 100644
--- a/app/src/lib/hooks/useRestoreActiveTasks.tsx
+++ b/app/src/lib/hooks/useRestoreActiveTasks.tsx
@@ -1,23 +1,23 @@
import { useCallback, useEffect, useRef, useState } from 'react';
import { apiClient } from '@/lib/api/client';
-import { useGenerationStore } from '@/stores/generationStore';
import type { ActiveDownloadTask } from '@/lib/api/types';
+import { useGenerationStore } from '@/stores/generationStore';
// Polling interval in milliseconds
-const POLL_INTERVAL = 2000;
+const POLL_INTERVAL = 30000;
/**
* Hook to monitor active tasks (downloads and generations).
* Polls the server periodically to catch downloads triggered from anywhere
* (transcription, generation, explicit download, etc.).
- *
+ *
* Returns the active downloads so components can render download toasts.
*/
export function useRestoreActiveTasks() {
const [activeDownloads, setActiveDownloads] = useState([]);
const setIsGenerating = useGenerationStore((state) => state.setIsGenerating);
const setActiveGenerationId = useGenerationStore((state) => state.setActiveGenerationId);
-
+
// Track which downloads we've seen to detect new ones
const seenDownloadsRef = useRef>(new Set());
@@ -41,14 +41,14 @@ export function useRestoreActiveTasks() {
// Update active downloads
// Keep track of all active downloads (including new ones)
const currentDownloadNames = new Set(tasks.downloads.map((d) => d.model_name));
-
+
// Remove completed downloads from our seen set
for (const name of seenDownloadsRef.current) {
if (!currentDownloadNames.has(name)) {
seenDownloadsRef.current.delete(name);
}
}
-
+
// Add new downloads to seen set
for (const download of tasks.downloads) {
seenDownloadsRef.current.add(download.model_name);
diff --git a/app/src/platform/types.ts b/app/src/platform/types.ts
index 23e99da5..eeba4c99 100644
--- a/app/src/platform/types.ts
+++ b/app/src/platform/types.ts
@@ -49,9 +49,9 @@ export interface PlatformAudio {
}
export interface PlatformLifecycle {
- startServer(remote?: boolean): Promise;
+ startServer(remote?: boolean, modelsDir?: string | null): Promise;
stopServer(): Promise;
- restartServer(): Promise;
+ restartServer(modelsDir?: string | null): Promise;
setKeepServerRunning(keep: boolean): Promise;
setupWindowCloseHandler(): Promise;
onServerReady?: () => void;
diff --git a/app/src/stores/serverStore.ts b/app/src/stores/serverStore.ts
index 1795b61c..586e1e8c 100644
--- a/app/src/stores/serverStore.ts
+++ b/app/src/stores/serverStore.ts
@@ -19,6 +19,12 @@ interface ServerStore {
crossfadeMs: number;
setCrossfadeMs: (value: number) => void;
+
+ normalizeAudio: boolean;
+ setNormalizeAudio: (value: boolean) => void;
+
+ customModelsDir: string | null;
+ setCustomModelsDir: (dir: string | null) => void;
}
export const useServerStore = create()(
@@ -41,6 +47,12 @@ export const useServerStore = create()(
crossfadeMs: 50,
setCrossfadeMs: (value) => set({ crossfadeMs: value }),
+
+ normalizeAudio: true,
+ setNormalizeAudio: (value) => set({ normalizeAudio: value }),
+
+ customModelsDir: null,
+ setCustomModelsDir: (dir) => set({ customModelsDir: dir }),
}),
{
name: 'voicebox-server',
diff --git a/backend/main.py b/backend/main.py
index cb9a2bd3..9b3aa334 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -844,6 +844,10 @@ async def download_chatterbox_turbo_background():
trim_fn=trim_fn,
)
+ if data.normalize:
+ from .utils.audio import normalize_audio
+ audio = normalize_audio(audio)
+
# Calculate duration
duration = len(audio) / sample_rate
@@ -975,6 +979,10 @@ async def stream_speech(
trim_fn=trim_fn,
)
+ if data.normalize:
+ from .utils.audio import normalize_audio
+ audio = normalize_audio(audio)
+
wav_bytes = tts.audio_to_wav_bytes(audio, sample_rate)
async def _wav_stream():
@@ -1588,6 +1596,141 @@ async def event_generator():
)
+@app.get("/models/cache-dir")
+async def get_models_cache_dir():
+ """Get the path to the HuggingFace model cache directory."""
+ from huggingface_hub import constants as hf_constants
+ return {"path": str(Path(hf_constants.HF_HUB_CACHE))}
+
+
+def _get_dir_size(path: Path) -> int:
+ """Get total size of a directory in bytes."""
+ total = 0
+ for f in path.rglob("*"):
+ if f.is_file():
+ total += f.stat().st_size
+ return total
+
+
+def _copy_with_progress(src: Path, dst: Path, progress_manager, copied_so_far: int, total_bytes: int) -> int:
+ """Copy a directory tree with byte-level progress tracking."""
+ import shutil
+ dst.mkdir(parents=True, exist_ok=True)
+ for item in src.iterdir():
+ dest_item = dst / item.name
+ if item.is_dir():
+ copied_so_far = _copy_with_progress(item, dest_item, progress_manager, copied_so_far, total_bytes)
+ else:
+ size = item.stat().st_size
+ shutil.copy2(str(item), str(dest_item))
+ copied_so_far += size
+ progress_manager.update_progress(
+ "migration", copied_so_far, total_bytes,
+ filename=item.name, status="downloading",
+ )
+ return copied_so_far
+
+
+@app.post("/models/migrate")
+async def migrate_models(request: models.ModelMigrateRequest):
+ """Move all downloaded models to a new directory with byte-level progress via SSE."""
+ import shutil
+ from huggingface_hub import constants as hf_constants
+
+ source = Path(hf_constants.HF_HUB_CACHE)
+ destination = Path(request.destination)
+
+ if not source.exists():
+ raise HTTPException(status_code=404, detail="Current model cache directory not found")
+
+ model_dirs = [d for d in source.iterdir() if d.name.startswith("models--") and d.is_dir()]
+ if not model_dirs:
+ return {"moved": 0, "errors": [], "source": str(source), "destination": str(destination)}
+
+ destination.mkdir(parents=True, exist_ok=True)
+
+ progress_manager = get_progress_manager()
+
+ # Check if source and destination are on the same filesystem (rename is instant)
+ same_fs = False
+ try:
+ same_fs = source.stat().st_dev == destination.stat().st_dev
+ except OSError:
+ pass
+
+ async def migrate_background():
+ moved = 0
+ errors = []
+ try:
+ if same_fs:
+ # Same filesystem: rename is instant, just track model count
+ total = len(model_dirs)
+ for i, item in enumerate(model_dirs):
+ dest_item = destination / item.name
+ try:
+ if dest_item.exists():
+ shutil.rmtree(dest_item)
+ shutil.move(str(item), str(dest_item))
+ moved += 1
+ progress_manager.update_progress(
+ "migration", i + 1, total,
+ filename=item.name, status="downloading",
+ )
+ except Exception as e:
+ errors.append(f"{item.name}: {str(e)}")
+ else:
+ # Cross-filesystem: copy with byte-level progress, then delete source
+ total_bytes = sum(_get_dir_size(d) for d in model_dirs)
+ progress_manager.update_progress("migration", 0, total_bytes, filename="Calculating...", status="downloading")
+
+ copied = 0
+ for item in model_dirs:
+ dest_item = destination / item.name
+ try:
+ if dest_item.exists():
+ shutil.rmtree(dest_item)
+ copied = await asyncio.to_thread(
+ _copy_with_progress, item, dest_item, progress_manager, copied, total_bytes
+ )
+ # Remove source after successful copy
+ await asyncio.to_thread(shutil.rmtree, str(item))
+ moved += 1
+ except Exception as e:
+ errors.append(f"{item.name}: {str(e)}")
+
+ progress_manager.update_progress("migration", 1, 1, status="complete")
+ progress_manager.mark_complete("migration")
+ except Exception as e:
+ progress_manager.update_progress("migration", 0, 0, status="error")
+ progress_manager.mark_error("migration", str(e))
+
+ _create_background_task(migrate_background())
+
+ return {"source": str(source), "destination": str(destination)}
+
+
+@app.get("/models/migrate/progress")
+async def get_migration_progress():
+ """Get model migration progress via Server-Sent Events."""
+ from fastapi.responses import StreamingResponse
+
+ progress_manager = get_progress_manager()
+
+ async def event_generator():
+ async for event in progress_manager.subscribe("migration"):
+ yield event
+
+ return StreamingResponse(
+ event_generator(),
+ media_type="text/event-stream",
+ headers={
+ "Cache-Control": "no-cache",
+ "Connection": "keep-alive",
+ "X-Accel-Buffering": "no",
+ },
+ )
+
+
@app.get("/models/status", response_model=models.ModelStatusListResponse)
async def get_model_status():
"""Get status of all available models."""
diff --git a/backend/models.py b/backend/models.py
index b462b67a..8f9dbf10 100644
--- a/backend/models.py
+++ b/backend/models.py
@@ -60,6 +60,7 @@ class GenerationRequest(BaseModel):
engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo)$")
max_chunk_chars: int = Field(default=800, ge=100, le=5000, description="Max characters per chunk for long text splitting")
crossfade_ms: int = Field(default=50, ge=0, le=500, description="Crossfade duration in ms between chunks (0 for hard cut)")
+ normalize: bool = Field(default=True, description="Normalize output audio volume")
class GenerationResponse(BaseModel):
@@ -170,6 +171,11 @@ class ModelDownloadRequest(BaseModel):
model_name: str
+class ModelMigrateRequest(BaseModel):
+ """Request model for migrating models to a new directory."""
+ destination: str
+
+
class ActiveDownloadTask(BaseModel):
"""Response model for active download task."""
model_name: str
diff --git a/bun.lock b/bun.lock
index 9e08a825..d271b5c6 100644
--- a/bun.lock
+++ b/bun.lock
@@ -13,7 +13,7 @@
},
"app": {
"name": "@voicebox/app",
- "version": "0.1.11",
+ "version": "0.1.13",
"dependencies": {
"@dnd-kit/core": "^6.3.1",
"@dnd-kit/sortable": "^10.0.0",
@@ -68,7 +68,7 @@
},
"landing": {
"name": "@voicebox/landing",
- "version": "0.1.11",
+ "version": "0.1.13",
"dependencies": {
"@radix-ui/react-separator": "^1.1.8",
"@radix-ui/react-slot": "^1.2.4",
@@ -93,7 +93,7 @@
},
"tauri": {
"name": "@voicebox/tauri",
- "version": "0.1.11",
+ "version": "0.1.13",
"dependencies": {
"@tauri-apps/api": "^2.0.0",
"@tauri-apps/plugin-dialog": "^2.0.0",
@@ -116,7 +116,7 @@
},
"web": {
"name": "@voicebox/web",
- "version": "0.1.11",
+ "version": "0.1.13",
"dependencies": {
"@tanstack/react-query": "^5.0.0",
"react": "^18.3.0",
@@ -125,6 +125,7 @@
"zustand": "^4.5.0",
},
"devDependencies": {
+ "@tailwindcss/vite": "^4.0.0",
"@types/react": "^18.3.0",
"@types/react-dom": "^18.3.0",
"@typescript-eslint/eslint-plugin": "^7.0.0",
diff --git a/docs/issue-pain-points.md b/docs/issue-pain-points.md
new file mode 100644
index 00000000..54346cfd
--- /dev/null
+++ b/docs/issue-pain-points.md
@@ -0,0 +1,67 @@
+# Voicebox Issue Pain Points (Snapshot)
+
+## Scope
+
+- Dataset: **128 total issues** (**107 open**, **21 closed**)
+- Source: GitHub issues in `jamiepine/voicebox`
+- Classification: keyword/theme clustering
+- Note: counts below are **non-exclusive** (one issue can belong to multiple pain points)
+
+## Most Common Pain Points (Open Issues)
+
+| Rank | Pain Point | Open Issues | What users are reporting |
+|---|---|---:|---|
+| 1 | Model download & offline reliability | **32** | Downloads failing/stalling, cache/offline behavior inconsistent, wrong model size selected, OS-level errno failures (e.g. Errno 22) |
+| 2 | GPU/backend compatibility | **22** | GPU not detected, backend fallback surprises, platform-specific runtime failures (Windows/Mac) |
+| 3 | Export/save/file persistence | **15** | Export fails, "failed to fetch/download audio", samples/profiles not saving |
+| 4 | Language/accent quality & coverage | **14** | Missing language support, accent mismatch, robotic outputs |
+| 5 | Update/restart safety + long-op controls | **4** | Auto-restart without warning, update confusion, lack of cancel/pause controls |
+
+## Representative Issues by Pain Point
+
+### 1) Model download & offline reliability (32)
+
+- [#159](https://github.com/jamiepine/voicebox/issues/159) - Qwen download fails with Errno 22
+- [#151](https://github.com/jamiepine/voicebox/issues/151) - Model loading hangs / server crashes
+- [#150](https://github.com/jamiepine/voicebox/issues/150) - Internet required despite downloaded models
+- [#149](https://github.com/jamiepine/voicebox/issues/149) - Cancel/pause controls for large downloads
+- [#96](https://github.com/jamiepine/voicebox/issues/96) - 0.6B selection still uses/downloads 1.7B
+
+### 2) GPU/backend compatibility (22)
+
+- [#164](https://github.com/jamiepine/voicebox/issues/164) - Windows: no GPU usage + multiple breakages
+- [#141](https://github.com/jamiepine/voicebox/issues/141) - Using CPU only, GPU not used
+- [#131](https://github.com/jamiepine/voicebox/issues/131) - Numpy ABI mismatch in bundled app
+- [#130](https://github.com/jamiepine/voicebox/issues/130) - Intel Mac tensor/padding generation error
+- [#127](https://github.com/jamiepine/voicebox/issues/127) - GPU not found
+
+### 3) Export/save/file persistence (15)
+
+- [#148](https://github.com/jamiepine/voicebox/issues/148) - Japanese export fails on 0.1.12
+- [#143](https://github.com/jamiepine/voicebox/issues/143) - Samples not saving
+- [#134](https://github.com/jamiepine/voicebox/issues/134) - Can't save profile
+- [#105](https://github.com/jamiepine/voicebox/issues/105) - Export audio fails (failed to fetch)
+- [#49](https://github.com/jamiepine/voicebox/issues/49) - Export filename/location ignored on Windows
+
+### 4) Language/accent quality & coverage (14)
+
+- [#162](https://github.com/jamiepine/voicebox/issues/162) - Persian audio request/problem
+- [#117](https://github.com/jamiepine/voicebox/issues/117) - Arabic language support
+- [#113](https://github.com/jamiepine/voicebox/issues/113) - Polish language support
+- [#109](https://github.com/jamiepine/voicebox/issues/109) - Ukrainian support
+- [#100](https://github.com/jamiepine/voicebox/issues/100) - Non-US accent quality issues
+
+### 5) Update/restart safety + controls (4)
+
+- [#164](https://github.com/jamiepine/voicebox/issues/164) - Update behavior + usability failures
+- [#136](https://github.com/jamiepine/voicebox/issues/136) - Auto-restart without warning
+- [#86](https://github.com/jamiepine/voicebox/issues/86) - Unexpected restart with no confirmation
+- [#149](https://github.com/jamiepine/voicebox/issues/149) - Need pause/cancel and pre-download confirmation
+
+## Additional Signal
+
+- There is also a large **feature-request/misc** bucket (**36 open**) competing with stability triage for attention (audiobook, Linux build, additional ASR/TTS models, integrations).
+
+## Takeaway
+
+Most user pain is concentrated in four stability areas: **download/offline path**, **GPU/backend detection**, **save/export reliability**, and **language/accent correctness**. Addressing those first should reduce the majority of current support friction.
diff --git a/docs/plans/PROJECT_STATUS.md b/docs/plans/PROJECT_STATUS.md
index d47dfebf..b8d6ff53 100644
--- a/docs/plans/PROJECT_STATUS.md
+++ b/docs/plans/PROJECT_STATUS.md
@@ -321,7 +321,7 @@ Notable requests:
## New Model Integration — Landscape
-### Models Worth Supporting (2026 SOTA)
+### Models Worth Supporting (2026 SOTA — updated March 13)
| Model | Cloning | Speed | Sample Rate | Languages | VRAM | Integration Ease | Status |
|-------|---------|-------|-------------|-----------|------|-----------------|--------|
@@ -329,10 +329,23 @@ Notable requests:
| **LuxTTS** | 3s zero-shot | 150x RT, CPU ok | 48 kHz | English | <1 GB | **Shipped** | PR #254 |
| **Chatterbox MTL** | 5s zero-shot | Medium | 24 kHz | 23 | Medium | **Shipped** | PR #257 |
| **Chatterbox Turbo** | 5s zero-shot | Fast | 24 kHz | English | Low | **PR #258** | In review |
+| **HumeAI TADA 1B/3B** | Zero-shot | 5× faster than LLM-TTS | — | EN (1B), Multilingual (3B) | Medium | Needs vetting | MIT, 700+ seconds of coherent audio, synced transcript output |
+| **MOSS-TTS Family** | Zero-shot | — | — | Multilingual | Medium | Needs vetting | Apache 2.0, multi-speaker dialogue, text-to-voice design (no ref audio) |
+| **VoxCPM 1.5** | Zero-shot (seconds) | ~0.15 RTF streaming | — | Bilingual (EN/ZH) | Medium | Needs vetting | Apache 2.0, tokenizer-free continuous diffusion, LoRA-friendly |
+| **Pocket TTS** | Zero-shot + streaming | >1× RT on CPU | — | English | ~100M params, CPU-first | Needs vetting | MIT, Kyutai Labs, no GPU required |
+| **Kokoro-82M** | 3s instant | CPU realtime | 24 kHz | English | Tiny (82M) | Ready | Apache 2.0, multi-engine arch in place |
| **XTTS-v2** | 6s zero-shot | Mid-GPU | 24 kHz | 17+ | Medium | Ready | Multi-engine arch in place |
| **Fish Speech** | 10-30s few-shot | Real-time | 24-44 kHz | 50+ | Medium | Ready | Multi-engine arch in place |
| **CosyVoice2-0.5B** | 3-10s zero-shot | Very fast | 24 kHz | Multilingual | Low | Ready | Multi-engine arch in place |
-| **Kokoro-82M** | 3s instant | CPU realtime | 24 kHz | English | Tiny | Ready | Multi-engine arch in place |
+
+#### Notes on New Candidates (March 2026)
+
+- **HumeAI TADA** — Text-Audio Dual Alignment arch. Near-zero hallucinations/drift, free synced transcript. 700+ seconds coherent audio. Best candidate for Stories long-form reliability. [HF: HumeAI/tada-1b](https://huggingface.co/HumeAI/tada-1b) | [GitHub: HumeAI/tada](https://github.com/HumeAI/tada)
+- **MOSS-TTS** — Modular suite: flagship cloning, MOSS-TTSD (multi-speaker dialogue), MOSS-VoiceGenerator (create voices from text descriptions, no ref audio). Unique UX for Stories voice design. [GitHub: OpenMOSS/MOSS-TTS](https://github.com/OpenMOSS/MOSS-TTS)
+- **VoxCPM 1.5** — Tokenizer-free continuous diffusion + autoregressive. No discrete token artifacts. Context-aware prosody/emotion, real-time streaming, LoRA fine-tuning. Trained on 1.8M+ hours. [GitHub: OpenBMB/VoxCPM](https://github.com/OpenBMB/VoxCPM)
+- **Pocket TTS** — 100M param CPU-first model from Kyutai Labs (Moshi team). Runs >1× realtime without GPU. Broadens hardware support significantly. [GitHub: kyutai-labs/pocket-tts](https://github.com/kyutai-labs/pocket-tts)
+- **Watch list:** MioTTS-2.6B (fast LLM-based EN/JP, vLLM compatible), Oolel-Voices (Soynade Research, expressive modular control)
+- **Skipped:** Fish Audio S2 — restrictive research license (commercial use requires approval), despite strong features
### Adding a New Engine (Now Straightforward)
@@ -402,16 +415,21 @@ The generation form now uses a flat model dropdown with engine-based routing. Pe
### Tier 3 — Future (v0.3.0+)
-| Item | Notes |
-|------|-------|
-| XTTS-v2 / Fish Speech / CosyVoice | Multi-engine arch is ready; just needs backend implementation |
-| OpenAI-compatible API (plan doc exists) | Low effort once API is stable |
-| LoRA fine-tuning (PR #195) | Complex, needs rework for multi-engine |
-| External/remote providers | Depends on use case demand |
-| GGUF support (#226) | Depends on model ecosystem maturity |
-| Queue system (#234) | Batch generation |
-| Streaming for non-MLX engines | Currently MLX-only |
-| Kokoro-82M | Tiny model, great for CPU-only machines |
+| Priority | Item | Notes |
+|----------|------|-------|
+| 1 | **HumeAI TADA** | Long-form reliability for Stories, synced transcripts. Addresses #234, #203, #191, #111, #69. Needs API vetting. |
+| 2 | **Pocket TTS** (Kyutai) | CPU-first 100M model, broadens hardware support. Kyutai ships clean code. Needs API vetting. |
+| 3 | **MOSS-TTS** | Text-to-voice design (no ref audio) is unique. Multi-speaker dialogue for Stories. Needs thorough API vetting. |
+| 4 | **Kokoro-82M** | 82M params, CPU realtime, Apache 2.0. Easy win. |
+| 5 | **Model config registry refactor** | Reduce 5-dispatch-point duplication in main.py — do before adding 3+ more engines |
+| 6 | XTTS-v2 / Fish Speech / CosyVoice | Multi-engine arch is ready; just needs backend implementation |
+| 7 | **VoxCPM 1.5** | Tokenizer-free streaming, interesting but uncertain integration surface |
+| 8 | OpenAI-compatible API (plan doc exists) | Low effort once API is stable |
+| 9 | LoRA fine-tuning (PR #195) | Complex, needs rework for multi-engine |
+| 10 | External/remote providers | Depends on use case demand |
+| 11 | GGUF support (#226) | Depends on model ecosystem maturity |
+| 12 | Queue system (#234) | Batch generation |
+| 13 | Streaming for non-MLX engines | Currently MLX-only |
---
diff --git a/tauri/src-tauri/gen/Assets.car b/tauri/src-tauri/gen/Assets.car
index a5f68f95..8065a50c 100644
Binary files a/tauri/src-tauri/gen/Assets.car and b/tauri/src-tauri/gen/Assets.car differ
diff --git a/tauri/src-tauri/gen/voicebox.icns b/tauri/src-tauri/gen/voicebox.icns
index e4492f52..59661d99 100644
Binary files a/tauri/src-tauri/gen/voicebox.icns and b/tauri/src-tauri/gen/voicebox.icns differ
diff --git a/tauri/src-tauri/src/main.rs b/tauri/src-tauri/src/main.rs
index 157fee9e..ae0f1ed9 100644
--- a/tauri/src-tauri/src/main.rs
+++ b/tauri/src-tauri/src/main.rs
@@ -16,6 +16,7 @@ struct ServerState {
child: Mutex>,
server_pid: Mutex >,
keep_running_on_close: Mutex,
+ models_dir: Mutex>,
}
#[command]
@@ -23,7 +24,16 @@ async fn start_server(
app: tauri::AppHandle,
state: State<'_, ServerState>,
remote: Option,
+ models_dir: Option,
) -> Result {
+ // Store models_dir for use on restart (empty string means reset to default)
+ if let Some(ref dir) = models_dir {
+ if dir.is_empty() {
+ *state.models_dir.lock().unwrap() = None;
+ } else {
+ *state.models_dir.lock().unwrap() = Some(dir.clone());
+ }
+ }
// Check if server is already running (managed by this app instance)
if state.child.lock().unwrap().is_some() {
return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
@@ -274,6 +284,12 @@ async fn start_server(
let port_str = SERVER_PORT.to_string();
let is_remote = remote.unwrap_or(false);
+ // Resolve the custom models directory from the parameter or stored state
+ let effective_models_dir = models_dir.or_else(|| state.models_dir.lock().unwrap().clone());
+ if let Some(ref dir) = effective_models_dir {
+ println!("Custom models directory: {}", dir);
+ }
+
// If CUDA binary exists, launch it directly instead of the bundled sidecar
let spawn_result = if let Some(ref cuda_path) = cuda_binary {
println!("Launching CUDA backend: {:?}", cuda_path);
@@ -282,6 +298,9 @@ async fn start_server(
if is_remote {
cmd = cmd.args(["--host", "0.0.0.0"]);
}
+ if let Some(ref dir) = effective_models_dir {
+ cmd = cmd.env("VOICEBOX_MODELS_DIR", dir);
+ }
cmd.spawn()
} else {
// Use the bundled CPU sidecar
@@ -289,6 +308,9 @@ async fn start_server(
if is_remote {
sidecar = sidecar.args(["--host", "0.0.0.0"]);
}
+ if let Some(ref dir) = effective_models_dir {
+ sidecar = sidecar.env("VOICEBOX_MODELS_DIR", dir);
+ }
println!("Spawning server process...");
sidecar.spawn()
};
@@ -613,9 +635,19 @@ async fn stop_server(state: State<'_, ServerState>) -> Result<(), String> {
async fn restart_server(
app: tauri::AppHandle,
state: State<'_, ServerState>,
+ models_dir: Option,
) -> Result {
println!("restart_server: stopping current server...");
+ // Update stored models_dir: empty string means reset to default, non-empty means set
+ if let Some(ref dir) = models_dir {
+ if dir.is_empty() {
+ *state.models_dir.lock().unwrap() = None;
+ } else {
+ *state.models_dir.lock().unwrap() = Some(dir.clone());
+ }
+ }
+
// Stop the current server
stop_server(state.clone()).await?;
@@ -623,9 +655,9 @@ async fn restart_server(
println!("restart_server: waiting for port release...");
tokio::time::sleep(tokio::time::Duration::from_millis(1000)).await;
- // Start server again (will auto-detect CUDA binary)
+ // Start server again (will auto-detect CUDA binary and use stored models_dir)
println!("restart_server: starting server...");
- start_server(app, state, None).await
+ start_server(app, state, None, None).await
}
#[command]
@@ -686,6 +718,7 @@ pub fn run() {
child: Mutex::new(None),
server_pid: Mutex::new(None),
keep_running_on_close: Mutex::new(false),
+ models_dir: Mutex::new(None),
})
.manage(audio_capture::AudioCaptureState::new())
.manage(audio_output::AudioOutputState::new())
diff --git a/tauri/src-tauri/tauri.conf.json b/tauri/src-tauri/tauri.conf.json
index 5cb83ee2..aa8435ea 100644
--- a/tauri/src-tauri/tauri.conf.json
+++ b/tauri/src-tauri/tauri.conf.json
@@ -56,7 +56,7 @@
},
"plugins": {
"shell": {
- "open": true
+ "open": ".*"
},
"updater": {
"pubkey": "dW50cnVzdGVkIGNvbW1lbnQ6IG1pbmlzaWduIHB1YmxpYyBrZXk6IEUxRENBQkRBQjdBNTM1OTIKUldTU05hVzMycXZjNGJGcUxmcVVocll2QjdSaTJNdlFxR2M3VDJsMnVvbDdyZGRPMmRlOW9aWTcK",
diff --git a/tauri/src/platform/lifecycle.ts b/tauri/src/platform/lifecycle.ts
index 60063f3e..d31ddd52 100644
--- a/tauri/src/platform/lifecycle.ts
+++ b/tauri/src/platform/lifecycle.ts
@@ -5,9 +5,12 @@ import type { PlatformLifecycle } from '@/platform/types';
class TauriLifecycle implements PlatformLifecycle {
onServerReady?: () => void;
- async startServer(remote = false): Promise {
+ async startServer(remote = false, modelsDir?: string | null): Promise {
try {
- const result = await invoke('start_server', { remote });
+ const result = await invoke('start_server', {
+ remote,
+ modelsDir: modelsDir ?? undefined,
+ });
console.log('Server started:', result);
this.onServerReady?.();
return result;
@@ -27,9 +30,11 @@ class TauriLifecycle implements PlatformLifecycle {
}
}
- async restartServer(): Promise {
+ async restartServer(modelsDir?: string | null): Promise {
try {
- const result = await invoke('restart_server');
+ const result = await invoke('restart_server', {
+ modelsDir: modelsDir ?? undefined,
+ });
console.log('Server restarted:', result);
this.onServerReady?.();
return result;
diff --git a/web/src/platform/lifecycle.ts b/web/src/platform/lifecycle.ts
index f40f1a90..9a6d825a 100644
--- a/web/src/platform/lifecycle.ts
+++ b/web/src/platform/lifecycle.ts
@@ -3,7 +3,7 @@ import type { PlatformLifecycle } from '@/platform/types';
class WebLifecycle implements PlatformLifecycle {
onServerReady?: () => void;
- async startServer(_remote = false): Promise {
+ async startServer(_remote = false, _modelsDir?: string | null): Promise {
// Web assumes server is running externally
// Return a default URL - this should be configured via env vars
const serverUrl = import.meta.env.VITE_SERVER_URL || 'http://localhost:17493';
@@ -15,7 +15,7 @@ class WebLifecycle implements PlatformLifecycle {
// No-op for web - server is managed externally
}
- async restartServer(): Promise {
+ async restartServer(_modelsDir?: string | null): Promise {
// No-op for web - server is managed externally
return import.meta.env.VITE_SERVER_URL || 'http://localhost:17493';
}