diff --git a/app/src/App.tsx b/app/src/App.tsx index 8a218742..458686c8 100644 --- a/app/src/App.tsx +++ b/app/src/App.tsx @@ -94,10 +94,11 @@ function App() { serverStartingRef.current = true; const isRemote = useServerStore.getState().mode === 'remote'; + const customModelsDir = useServerStore.getState().customModelsDir; console.log(`Production mode: Starting bundled server... (remote: ${isRemote})`); platform.lifecycle - .startServer(isRemote) + .startServer(isRemote, customModelsDir) .then((serverUrl) => { console.log('Server is ready at:', serverUrl); // Update the server URL in the store with the dynamically assigned port diff --git a/app/src/components/AudioTab/AudioTab.tsx b/app/src/components/AudioTab/AudioTab.tsx index 395922c8..44058bc5 100644 --- a/app/src/components/AudioTab/AudioTab.tsx +++ b/app/src/components/AudioTab/AudioTab.tsx @@ -23,8 +23,8 @@ import { import { apiClient } from '@/lib/api/client'; import { BOTTOM_SAFE_AREA_PADDING } from '@/lib/constants/ui'; import { cn } from '@/lib/utils/cn'; -import { usePlayerStore } from '@/stores/playerStore'; import { usePlatform } from '@/platform/PlatformContext'; +import { usePlayerStore } from '@/stores/playerStore'; interface AudioDevice { id: string; @@ -129,7 +129,7 @@ export function AudioTab() { if (await confirm('Delete this channel?')) { deleteChannel.mutate(channelId); } - } + }; const allChannels = channels || []; const allDevices = devices || []; @@ -168,7 +168,7 @@ export function AudioTab() { ) : ( -
+
{allChannels.map((channel) => { const isSelected = selectedChannelId === channel.id; return ( @@ -343,7 +343,9 @@ export function AudioTab() {

- {platform.metadata.isTauri ? 'No audio devices found' : 'Audio device selection requires Tauri'} + {platform.metadata.isTauri + ? 'No audio devices found' + : 'Audio device selection requires Tauri'}

)} diff --git a/app/src/components/ModelsTab/ModelsTab.tsx b/app/src/components/ModelsTab/ModelsTab.tsx index 4ef9f93b..78905406 100644 --- a/app/src/components/ModelsTab/ModelsTab.tsx +++ b/app/src/components/ModelsTab/ModelsTab.tsx @@ -2,7 +2,7 @@ import { ModelManagement } from '@/components/ServerSettings/ModelManagement'; export function ModelsTab() { return ( -
+
); diff --git a/app/src/components/ServerSettings/GenerationSettings.tsx b/app/src/components/ServerSettings/GenerationSettings.tsx index b9b45de3..048bd424 100644 --- a/app/src/components/ServerSettings/GenerationSettings.tsx +++ b/app/src/components/ServerSettings/GenerationSettings.tsx @@ -1,4 +1,5 @@ import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card'; +import { Checkbox } from '@/components/ui/checkbox'; import { Slider } from '@/components/ui/slider'; import { useServerStore } from '@/stores/serverStore'; @@ -7,6 +8,8 @@ export function GenerationSettings() { const setMaxChunkChars = useServerStore((state) => state.setMaxChunkChars); const crossfadeMs = useServerStore((state) => state.crossfadeMs); const setCrossfadeMs = useServerStore((state) => state.setCrossfadeMs); + const normalizeAudio = useServerStore((state) => state.normalizeAudio); + const setNormalizeAudio = useServerStore((state) => state.setNormalizeAudio); return ( @@ -64,6 +67,25 @@ export function GenerationSettings() { Blends audio between chunks to smooth transitions. Set to 0 for a hard cut.

+ +
+ +
+ +

+ Adjusts output volume to a consistent level across generations. +

+
+
diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx index 37e8b229..b9c6c09c 100644 --- a/app/src/components/ServerSettings/ModelManagement.tsx +++ b/app/src/components/ServerSettings/ModelManagement.tsx @@ -7,6 +7,7 @@ import { CircleX, Download, ExternalLink, + FolderOpen, HardDrive, Heart, Loader2, @@ -41,6 +42,8 @@ import { useToast } from '@/components/ui/use-toast'; import { apiClient } from '@/lib/api/client'; import type { ActiveDownloadTask, HuggingFaceModelInfo, ModelStatus } from '@/lib/api/types'; import { useModelDownloadToast } from '@/lib/hooks/useModelDownloadToast'; +import { usePlatform } from '@/platform/PlatformContext'; +import { useServerStore } from '@/stores/serverStore'; async function fetchHuggingFaceModelInfo(repoId: string): Promise { const response = await fetch(`https://huggingface.co/api/models/${repoId}`); @@ -48,6 +51,29 @@ async function fetchHuggingFaceModelInfo(repoId: string): Promise = { + 'qwen-tts-1.7B': + 'High-quality multilingual TTS by Alibaba. Supports 10 languages with natural prosody and voice cloning from short reference audio.', + 'qwen-tts-0.6B': + 'Lightweight version of Qwen TTS. Same language support with faster inference, ideal for lower-end hardware.', + luxtts: + 'Lightweight ZipVoice-based TTS designed for high quality voice cloning and 48kHz speech generation at speeds exceeding 150x realtime.', + 'chatterbox-tts': + 'Production-grade open source TTS by Resemble AI. Supports 23 languages with voice cloning and emotion exaggeration control.', + 'chatterbox-turbo': + 'Streamlined 350M parameter TTS by Resemble AI. High-quality English speech with less compute and VRAM than larger models.', + 'whisper-base': + 'Smallest Whisper model (74M parameters). Fast transcription with moderate accuracy.', + 'whisper-small': + 'Whisper Small (244M parameters). 
Good balance of speed and accuracy for transcription.', + 'whisper-medium': + 'Whisper Medium (769M parameters). Higher accuracy transcription at moderate speed.', + 'whisper-large': + 'Whisper Large (1.5B parameters). Best accuracy for speech-to-text across multiple languages.', + 'whisper-turbo': + 'Whisper Large v3 Turbo. Pruned for significantly faster inference while maintaining near-large accuracy.', +}; + function formatDownloads(n: number): string { if (n >= 1_000_000) return `${(n / 1_000_000).toFixed(1)}M`; if (n >= 1_000) return `${(n / 1_000).toFixed(1)}k`; @@ -85,6 +111,18 @@ function formatBytes(bytes: number): string { export function ModelManagement() { const { toast } = useToast(); const queryClient = useQueryClient(); + const platform = usePlatform(); + const customModelsDir = useServerStore((state) => state.customModelsDir); + const setCustomModelsDir = useServerStore((state) => state.setCustomModelsDir); + const [migrating, setMigrating] = useState(false); + const [migrationProgress, setMigrationProgress] = useState<{ + current: number; + total: number; + progress: number; + filename?: string; + status: string; + } | null>(null); + const [pendingMigrateDir, setPendingMigrateDir] = useState(null); const [downloadingModel, setDownloadingModel] = useState(null); const [downloadingDisplayName, setDownloadingDisplayName] = useState(null); const [consoleOpen, setConsoleOpen] = useState(false); @@ -104,6 +142,12 @@ export function ModelManagement() { refetchInterval: 5000, }); + const { data: cacheDir } = useQuery({ + queryKey: ['modelsCacheDir'], + queryFn: () => apiClient.getModelsCacheDir(), + staleTime: 1000 * 60 * 5, + }); + const { data: activeTasks } = useQuery({ queryKey: ['activeTasks'], queryFn: () => apiClient.getActiveTasks(), @@ -382,6 +426,87 @@ export function ModelManagement() {

+ {/* Model storage location */} + {platform.metadata.isTauri && cacheDir && ( +
+
+
+ Storage location +

+ {cacheDir.path} +

+
+
+ + + {customModelsDir && ( + + )} +
+
+
+ )} + {/* Model list */} {isLoading ? (
@@ -457,9 +582,7 @@ export function ModelManagement() { {formatSize(model.size_mb)} )} - {!model.downloaded && !isDownloading && !hasError && ( - Not downloaded - )} +
@@ -571,13 +694,6 @@ export function ModelManagement() { Error )} - {!freshSelectedModel.downloaded && - !selectedState?.isDownloading && - !selectedState?.hasError && ( - - Not downloaded - - )} {/* HuggingFace model card info */} @@ -588,6 +704,13 @@ export function ModelManagement() { )} + {/* Description */} + {MODEL_DESCRIPTIONS[freshSelectedModel.model_name] && ( +

+ {MODEL_DESCRIPTIONS[freshSelectedModel.model_name]} +

+ )} + {hfModelInfo && (
{/* Pipeline tag + author */} @@ -810,6 +933,126 @@ export function ModelManagement() { + + {/* Migration confirmation dialog */} + !open && setPendingMigrateDir(null)} + > + + + Move models to new location? + + The server will shut down while models are being moved to the new folder. It will + restart automatically once the migration is complete. + + +
+ {pendingMigrateDir} +
+ + Cancel + { + if (!pendingMigrateDir) return; + const newDir = pendingMigrateDir; + setPendingMigrateDir(null); + setMigrating(true); + setMigrationProgress({ + current: 0, + total: 0, + progress: 0, + status: 'downloading', + filename: 'Preparing...', + }); + try { + // Start the migration (background task) + await apiClient.migrateModels(newDir); + + // Connect to SSE for progress + await new Promise((resolve, reject) => { + const es = new EventSource(apiClient.getMigrationProgressUrl()); + es.onmessage = (event) => { + try { + const data = JSON.parse(event.data); + setMigrationProgress(data); + if (data.status === 'complete') { + es.close(); + resolve(); + } else if (data.status === 'error') { + es.close(); + reject(new Error(data.error || 'Migration failed')); + } + } catch { + /* ignore parse errors */ + } + }; + es.onerror = () => { + es.close(); + reject(new Error('Lost connection during migration')); + }; + }); + + setCustomModelsDir(newDir); + setMigrationProgress({ + current: 1, + total: 1, + progress: 100, + status: 'complete', + filename: 'Restarting server...', + }); + await platform.lifecycle.restartServer(newDir); + queryClient.invalidateQueries(); + toast({ title: 'Models moved successfully' }); + } catch (e) { + toast({ + title: 'Migration failed', + description: e instanceof Error ? e.message : 'Failed to migrate models', + variant: 'destructive', + }); + } finally { + setMigrating(false); + setMigrationProgress(null); + } + }} + > + Move Models + + +
+
+ + {/* Migration progress overlay */} + {migrating && migrationProgress && ( +
+
+
+ +

Moving models

+

+ {migrationProgress.status === 'complete' + ? 'Restarting server...' + : 'The server is offline while models are being moved.'} +

+
+ {migrationProgress.total > 0 && ( +
+ +
+ {migrationProgress.filename} + + {formatBytes(migrationProgress.current)} /{' '} + {formatBytes(migrationProgress.total)} + +
+
+ )} +
+
+ )}
); } diff --git a/app/src/components/ServerTab/ServerTab.tsx b/app/src/components/ServerTab/ServerTab.tsx index 000ec5b7..d9954c90 100644 --- a/app/src/components/ServerTab/ServerTab.tsx +++ b/app/src/components/ServerTab/ServerTab.tsx @@ -2,12 +2,18 @@ import { ConnectionForm } from '@/components/ServerSettings/ConnectionForm'; import { GenerationSettings } from '@/components/ServerSettings/GenerationSettings'; import { GpuAcceleration } from '@/components/ServerSettings/GpuAcceleration'; import { UpdateStatus } from '@/components/ServerSettings/UpdateStatus'; +import { BOTTOM_SAFE_AREA_PADDING } from '@/lib/constants/ui'; +import { cn } from '@/lib/utils/cn'; import { usePlatform } from '@/platform/PlatformContext'; +import { usePlayerStore } from '@/stores/playerStore'; export function ServerTab() { const platform = usePlatform(); + const isPlayerVisible = !!usePlayerStore((state) => state.audioUrl); return ( -
+
diff --git a/app/src/lib/api/client.ts b/app/src/lib/api/client.ts index e522ef48..dbc4cbdc 100644 --- a/app/src/lib/api/client.ts +++ b/app/src/lib/api/client.ts @@ -316,6 +316,21 @@ class ApiClient { return this.request('/models/status'); } + async getModelsCacheDir(): Promise<{ path: string }> { + return this.request<{ path: string }>('/models/cache-dir'); + } + + async migrateModels(destination: string): Promise<{ source: string; destination: string }> { + return this.request('/models/migrate', { + method: 'POST', + body: JSON.stringify({ destination }), + }); + } + + getMigrationProgressUrl(): string { + return `${this.getBaseUrl()}/models/migrate/progress`; + } + async triggerModelDownload(modelName: string): Promise<{ message: string }> { console.log( '[API] triggerModelDownload called for:', diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts index af5a6c15..3a17eba3 100644 --- a/app/src/lib/api/types.ts +++ b/app/src/lib/api/types.ts @@ -38,6 +38,7 @@ export interface GenerationRequest { instruct?: string; max_chunk_chars?: number; crossfade_ms?: number; + normalize?: boolean; } export interface GenerationResponse { diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts index 66effd22..f79f7ab2 100644 --- a/app/src/lib/hooks/useGenerationForm.ts +++ b/app/src/lib/hooks/useGenerationForm.ts @@ -34,6 +34,7 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { const setIsGenerating = useGenerationStore((state) => state.setIsGenerating); const maxChunkChars = useServerStore((state) => state.maxChunkChars); const crossfadeMs = useServerStore((state) => state.crossfadeMs); + const normalizeAudio = useServerStore((state) => state.normalizeAudio); const [downloadingModelName, setDownloadingModelName] = useState(null); const [downloadingDisplayName, setDownloadingDisplayName] = useState(null); @@ -115,6 +116,7 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) { 
instruct: isQwen ? data.instruct || undefined : undefined, max_chunk_chars: maxChunkChars, crossfade_ms: crossfadeMs, + normalize: normalizeAudio, }); toast({ diff --git a/app/src/lib/hooks/useRestoreActiveTasks.tsx b/app/src/lib/hooks/useRestoreActiveTasks.tsx index 063e6bcb..191cca94 100644 --- a/app/src/lib/hooks/useRestoreActiveTasks.tsx +++ b/app/src/lib/hooks/useRestoreActiveTasks.tsx @@ -1,23 +1,23 @@ import { useCallback, useEffect, useRef, useState } from 'react'; import { apiClient } from '@/lib/api/client'; -import { useGenerationStore } from '@/stores/generationStore'; import type { ActiveDownloadTask } from '@/lib/api/types'; +import { useGenerationStore } from '@/stores/generationStore'; // Polling interval in milliseconds -const POLL_INTERVAL = 2000; +const POLL_INTERVAL = 30000; /** * Hook to monitor active tasks (downloads and generations). * Polls the server periodically to catch downloads triggered from anywhere * (transcription, generation, explicit download, etc.). - * + * * Returns the active downloads so components can render download toasts. 
*/ export function useRestoreActiveTasks() { const [activeDownloads, setActiveDownloads] = useState([]); const setIsGenerating = useGenerationStore((state) => state.setIsGenerating); const setActiveGenerationId = useGenerationStore((state) => state.setActiveGenerationId); - + // Track which downloads we've seen to detect new ones const seenDownloadsRef = useRef>(new Set()); @@ -41,14 +41,14 @@ export function useRestoreActiveTasks() { // Update active downloads // Keep track of all active downloads (including new ones) const currentDownloadNames = new Set(tasks.downloads.map((d) => d.model_name)); - + // Remove completed downloads from our seen set for (const name of seenDownloadsRef.current) { if (!currentDownloadNames.has(name)) { seenDownloadsRef.current.delete(name); } } - + // Add new downloads to seen set for (const download of tasks.downloads) { seenDownloadsRef.current.add(download.model_name); diff --git a/app/src/platform/types.ts b/app/src/platform/types.ts index 23e99da5..eeba4c99 100644 --- a/app/src/platform/types.ts +++ b/app/src/platform/types.ts @@ -49,9 +49,9 @@ export interface PlatformAudio { } export interface PlatformLifecycle { - startServer(remote?: boolean): Promise; + startServer(remote?: boolean, modelsDir?: string | null): Promise; stopServer(): Promise; - restartServer(): Promise; + restartServer(modelsDir?: string | null): Promise; setKeepServerRunning(keep: boolean): Promise; setupWindowCloseHandler(): Promise; onServerReady?: () => void; diff --git a/app/src/stores/serverStore.ts b/app/src/stores/serverStore.ts index 1795b61c..586e1e8c 100644 --- a/app/src/stores/serverStore.ts +++ b/app/src/stores/serverStore.ts @@ -19,6 +19,12 @@ interface ServerStore { crossfadeMs: number; setCrossfadeMs: (value: number) => void; + + normalizeAudio: boolean; + setNormalizeAudio: (value: boolean) => void; + + customModelsDir: string | null; + setCustomModelsDir: (dir: string | null) => void; } export const useServerStore = create()( @@ -41,6 +47,12 
@@ export const useServerStore = create()( crossfadeMs: 50, setCrossfadeMs: (value) => set({ crossfadeMs: value }), + + normalizeAudio: true, + setNormalizeAudio: (value) => set({ normalizeAudio: value }), + + customModelsDir: null, + setCustomModelsDir: (dir) => set({ customModelsDir: dir }), }), { name: 'voicebox-server', diff --git a/backend/main.py b/backend/main.py index cb9a2bd3..9b3aa334 100644 --- a/backend/main.py +++ b/backend/main.py @@ -844,6 +844,10 @@ async def download_chatterbox_turbo_background(): trim_fn=trim_fn, ) + if data.normalize: + from .utils.audio import normalize_audio + audio = normalize_audio(audio) + # Calculate duration duration = len(audio) / sample_rate @@ -975,6 +979,10 @@ async def stream_speech( trim_fn=trim_fn, ) + if data.normalize: + from .utils.audio import normalize_audio + audio = normalize_audio(audio) + wav_bytes = tts.audio_to_wav_bytes(audio, sample_rate) async def _wav_stream(): @@ -1588,6 +1596,141 @@ async def event_generator(): ) +@app.get("/models/cache-dir") +async def get_models_cache_dir(): + """Get the path to the HuggingFace model cache directory.""" + from huggingface_hub import constants as hf_constants + return {"path": str(Path(hf_constants.HF_HUB_CACHE))} + + +def _get_dir_size(path: Path) -> int: + """Get total size of a directory in bytes.""" + total = 0 + for f in path.rglob("*"): + if f.is_file(): + total += f.stat().st_size + return total + + +def _copy_with_progress(src: Path, dst: Path, progress_manager, copied_so_far: int, total_bytes: int) -> int: + """Copy a directory tree with byte-level progress tracking.""" + import shutil + dst.mkdir(parents=True, exist_ok=True) + for item in src.iterdir(): + dest_item = dst / item.name + if item.is_dir(): + copied_so_far = _copy_with_progress(item, dest_item, progress_manager, copied_so_far, total_bytes) + else: + size = item.stat().st_size + shutil.copy2(str(item), str(dest_item)) + copied_so_far += size + progress_manager.update_progress( + "migration", 
copied_so_far, total_bytes, + filename=item.name, status="downloading", + ) + return copied_so_far + + +@app.post("/models/migrate") +async def migrate_models(request: models.ModelMigrateRequest): + """Move all downloaded models to a new directory with byte-level progress via SSE.""" + import shutil + from huggingface_hub import constants as hf_constants + + source = Path(hf_constants.HF_HUB_CACHE) + destination = Path(request.destination) + + if not source.exists(): + raise HTTPException(status_code=404, detail="Current model cache directory not found") + + model_dirs = [d for d in source.iterdir() if d.name.startswith("models--") and d.is_dir()] + if not model_dirs: + return {"moved": 0, "errors": [], "source": str(source), "destination": str(destination)} + + destination.mkdir(parents=True, exist_ok=True) + + progress_manager = get_progress_manager() + + # Check if source and destination are on the same filesystem (rename is instant) + same_fs = False + try: + same_fs = source.stat().st_dev == destination.stat().st_dev + except OSError: + pass + + async def migrate_background(): + moved = 0 + errors = [] + try: + if same_fs: + # Same filesystem: rename is instant, just track model count + total = len(model_dirs) + for i, item in enumerate(model_dirs): + dest_item = destination / item.name + try: + if dest_item.exists(): + shutil.rmtree(dest_item) + shutil.move(str(item), str(dest_item)) + moved += 1 + progress_manager.update_progress( + "migration", i + 1, total, + filename=item.name, status="downloading", + ) + except Exception as e: + errors.append(f"{item.name}: {str(e)}") + else: + # Cross-filesystem: copy with byte-level progress, then delete source + total_bytes = sum(_get_dir_size(d) for d in model_dirs) + progress_manager.update_progress("migration", 0, total_bytes, filename="Calculating...", status="downloading") + + copied = 0 + for item in model_dirs: + dest_item = destination / item.name + try: + if dest_item.exists(): + shutil.rmtree(dest_item) + 
copied = await asyncio.to_thread( + _copy_with_progress, item, dest_item, progress_manager, copied, total_bytes + ) + # Remove source after successful copy + await asyncio.to_thread(shutil.rmtree, str(item)) + moved += 1 + except Exception as e: + errors.append(f"{item.name}: {str(e)}") + + progress_manager.update_progress("migration", 1, 1, status="complete") + progress_manager.mark_complete("migration") + except Exception as e: + progress_manager.update_progress("migration", 0, 0, status="error") + progress_manager.mark_error("migration", str(e)) + + _create_background_task(migrate_background()) + + return {"source": str(source), "destination": str(destination)} + + +@app.get("/models/migrate/progress") +async def get_migration_progress(): + """Get model migration progress via Server-Sent Events.""" + from fastapi.responses import StreamingResponse + + progress_manager = get_progress_manager() + + async def event_generator(): + async for event in progress_manager.subscribe("migration"): + yield event + + return StreamingResponse( + event_generator(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", + }, + ) + + @app.get("/models/status", response_model=models.ModelStatusListResponse) async def get_model_status(): """Get status of all available models.""" diff --git a/backend/models.py b/backend/models.py index b462b67a..8f9dbf10 100644 --- a/backend/models.py +++ b/backend/models.py @@ -60,6 +60,7 @@ class GenerationRequest(BaseModel): engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo)$") max_chunk_chars: int = Field(default=800, ge=100, le=5000, description="Max characters per chunk for long text splitting") crossfade_ms: int = Field(default=50, ge=0, le=500, description="Crossfade duration in ms between chunks (0 for hard cut)") + normalize: bool = Field(default=True, description="Normalize output audio volume") class 
GenerationResponse(BaseModel): @@ -170,6 +171,11 @@ class ModelDownloadRequest(BaseModel): model_name: str +class ModelMigrateRequest(BaseModel): + """Request model for migrating models to a new directory.""" + destination: str + + class ActiveDownloadTask(BaseModel): """Response model for active download task.""" model_name: str diff --git a/bun.lock b/bun.lock index 9e08a825..d271b5c6 100644 --- a/bun.lock +++ b/bun.lock @@ -13,7 +13,7 @@ }, "app": { "name": "@voicebox/app", - "version": "0.1.11", + "version": "0.1.13", "dependencies": { "@dnd-kit/core": "^6.3.1", "@dnd-kit/sortable": "^10.0.0", @@ -68,7 +68,7 @@ }, "landing": { "name": "@voicebox/landing", - "version": "0.1.11", + "version": "0.1.13", "dependencies": { "@radix-ui/react-separator": "^1.1.8", "@radix-ui/react-slot": "^1.2.4", @@ -93,7 +93,7 @@ }, "tauri": { "name": "@voicebox/tauri", - "version": "0.1.11", + "version": "0.1.13", "dependencies": { "@tauri-apps/api": "^2.0.0", "@tauri-apps/plugin-dialog": "^2.0.0", @@ -116,7 +116,7 @@ }, "web": { "name": "@voicebox/web", - "version": "0.1.11", + "version": "0.1.13", "dependencies": { "@tanstack/react-query": "^5.0.0", "react": "^18.3.0", @@ -125,6 +125,7 @@ "zustand": "^4.5.0", }, "devDependencies": { + "@tailwindcss/vite": "^4.0.0", "@types/react": "^18.3.0", "@types/react-dom": "^18.3.0", "@typescript-eslint/eslint-plugin": "^7.0.0", diff --git a/docs/issue-pain-points.md b/docs/issue-pain-points.md new file mode 100644 index 00000000..54346cfd --- /dev/null +++ b/docs/issue-pain-points.md @@ -0,0 +1,67 @@ +# Voicebox Issue Pain Points (Snapshot) + +## Scope + +- Dataset: **128 total issues** (**107 open**, **21 closed**) +- Source: GitHub issues in `jamiepine/voicebox` +- Classification: keyword/theme clustering +- Note: counts below are **non-exclusive** (one issue can belong to multiple pain points) + +## Most Common Pain Points (Open Issues) + +| Rank | Pain Point | Open Issues | What users are reporting | +|---|---|---:|---| +| 1 | Model 
download & offline reliability | **32** | Downloads failing/stalling, cache/offline behavior inconsistent, wrong model size selected, Errno issues | +| 2 | GPU/backend compatibility | **22** | GPU not detected, backend fallback surprises, platform-specific runtime failures (Windows/Mac) | +| 3 | Export/save/file persistence | **15** | Export fails, "failed to fetch/download audio", samples/profiles not saving | +| 4 | Language/accent quality & coverage | **14** | Missing language support, accent mismatch, robotic outputs | +| 5 | Update/restart safety + long-op controls | **4** | Auto-restart without warning, update confusion, lack of cancel/pause controls | + +## Representative Issues by Pain Point + +### 1) Model download & offline reliability (32) + +- [#159](https://github.com/jamiepine/voicebox/issues/159) - Qwen download fails with Errno 22 +- [#151](https://github.com/jamiepine/voicebox/issues/151) - Model loading hangs / server crashes +- [#150](https://github.com/jamiepine/voicebox/issues/150) - Internet required despite downloaded models +- [#149](https://github.com/jamiepine/voicebox/issues/149) - Cancel/pause controls for large downloads +- [#96](https://github.com/jamiepine/voicebox/issues/96) - 0.6B selection still uses/downloads 1.7B + +### 2) GPU/backend compatibility (22) + +- [#164](https://github.com/jamiepine/voicebox/issues/164) - Windows: no GPU usage + multiple breakages +- [#141](https://github.com/jamiepine/voicebox/issues/141) - Using CPU only, GPU not used +- [#131](https://github.com/jamiepine/voicebox/issues/131) - Numpy ABI mismatch in bundled app +- [#130](https://github.com/jamiepine/voicebox/issues/130) - Intel Mac tensor/padding generation error +- [#127](https://github.com/jamiepine/voicebox/issues/127) - GPU not found + +### 3) Export/save/file persistence (15) + +- [#148](https://github.com/jamiepine/voicebox/issues/148) - Japanese export fails on 0.1.12 +- [#143](https://github.com/jamiepine/voicebox/issues/143) - Samples not 
saving +- [#134](https://github.com/jamiepine/voicebox/issues/134) - Can't save profile +- [#105](https://github.com/jamiepine/voicebox/issues/105) - Export audio fails (failed to fetch) +- [#49](https://github.com/jamiepine/voicebox/issues/49) - Export filename/location ignored on Windows + +### 4) Language/accent quality & coverage (14) + +- [#162](https://github.com/jamiepine/voicebox/issues/162) - Persian audio request/problem +- [#117](https://github.com/jamiepine/voicebox/issues/117) - Arabic language support +- [#113](https://github.com/jamiepine/voicebox/issues/113) - Polish language support +- [#109](https://github.com/jamiepine/voicebox/issues/109) - Ukrainian support +- [#100](https://github.com/jamiepine/voicebox/issues/100) - Non-US accent quality issues + +### 5) Update/restart safety + long-op controls (4) + +- [#164](https://github.com/jamiepine/voicebox/issues/164) - Update behavior + usability failures +- [#136](https://github.com/jamiepine/voicebox/issues/136) - Auto-restart without warning +- [#86](https://github.com/jamiepine/voicebox/issues/86) - Unexpected restart with no confirmation +- [#149](https://github.com/jamiepine/voicebox/issues/149) - Need pause/cancel and pre-download confirmation + +## Additional Signal + +- There is also a large **feature-request/misc** bucket (**36 open**) that is competing with stability triage (audiobook, Linux build, additional ASR/TTS models, integrations). + +## Takeaway + +Most user pain is concentrated in four stability areas: **download/offline path**, **GPU/backend detection**, **save/export reliability**, and **language/accent correctness**. Addressing those first should reduce the majority of current support friction. 
diff --git a/docs/plans/PROJECT_STATUS.md b/docs/plans/PROJECT_STATUS.md index d47dfebf..b8d6ff53 100644 --- a/docs/plans/PROJECT_STATUS.md +++ b/docs/plans/PROJECT_STATUS.md @@ -321,7 +321,7 @@ Notable requests: ## New Model Integration — Landscape -### Models Worth Supporting (2026 SOTA) +### Models Worth Supporting (2026 SOTA — updated March 13) | Model | Cloning | Speed | Sample Rate | Languages | VRAM | Integration Ease | Status | |-------|---------|-------|-------------|-----------|------|-----------------|--------| @@ -329,10 +329,23 @@ Notable requests: | **LuxTTS** | 3s zero-shot | 150x RT, CPU ok | 48 kHz | English | <1 GB | **Shipped** | PR #254 | | **Chatterbox MTL** | 5s zero-shot | Medium | 24 kHz | 23 | Medium | **Shipped** | PR #257 | | **Chatterbox Turbo** | 5s zero-shot | Fast | 24 kHz | English | Low | **PR #258** | In review | +| **HumeAI TADA 1B/3B** | Zero-shot | 5× faster than LLM-TTS | — | EN (1B), Multilingual (3B) | Medium | Needs vetting | MIT, 700s+ coherent, synced transcript output | +| **MOSS-TTS Family** | Zero-shot | — | — | Multilingual | Medium | Needs vetting | Apache 2.0, multi-speaker dialogue, text-to-voice design (no ref audio) | +| **VoxCPM 1.5** | Zero-shot (seconds) | ~0.15 RTF streaming | — | Bilingual (EN/ZH) | Medium | Needs vetting | Apache 2.0, tokenizer-free continuous diffusion, LoRA-friendly | +| **Pocket TTS** | Zero-shot + streaming | >1× RT on CPU | — | English | ~100M params, CPU-first | Needs vetting | MIT, Kyutai Labs, no GPU required | +| **Kokoro-82M** | 3s instant | CPU realtime | 24 kHz | English | Tiny (82M) | Ready | Apache 2.0, multi-engine arch in place | | **XTTS-v2** | 6s zero-shot | Mid-GPU | 24 kHz | 17+ | Medium | Ready | Multi-engine arch in place | | **Fish Speech** | 10-30s few-shot | Real-time | 24-44 kHz | 50+ | Medium | Ready | Multi-engine arch in place | | **CosyVoice2-0.5B** | 3-10s zero-shot | Very fast | 24 kHz | Multilingual | Low | Ready | Multi-engine arch in place | -| 
**Kokoro-82M** | 3s instant | CPU realtime | 24 kHz | English | Tiny | Ready | Multi-engine arch in place | + +#### Notes on New Candidates (March 2026) + +- **HumeAI TADA** — Text-Audio Dual Alignment arch. Near-zero hallucinations/drift, free synced transcript. 700+ seconds coherent audio. Best candidate for Stories long-form reliability. [HF: HumeAI/tada-1b](https://huggingface.co/HumeAI/tada-1b) | [GitHub: HumeAI/tada](https://github.com/HumeAI/tada) +- **MOSS-TTS** — Modular suite: flagship cloning, MOSS-TTSD (multi-speaker dialogue), MOSS-VoiceGenerator (create voices from text descriptions, no ref audio). Unique UX for Stories voice design. [GitHub: OpenMOSS/MOSS-TTS](https://github.com/OpenMOSS/MOSS-TTS) +- **VoxCPM 1.5** — Tokenizer-free continuous diffusion + autoregressive. No discrete token artifacts. Context-aware prosody/emotion, real-time streaming, LoRA fine-tuning. Trained on 1.8M+ hours. [GitHub: OpenBMB/VoxCPM](https://github.com/OpenBMB/VoxCPM) +- **Pocket TTS** — 100M param CPU-first model from Kyutai Labs (Moshi team). Runs >1× realtime without GPU. Broadens hardware support significantly. [GitHub: kyutai-labs/pocket-tts](https://github.com/kyutai-labs/pocket-tts) +- **Watch list:** MioTTS-2.6B (fast LLM-based EN/JP, vLLM compatible), Oolel-Voices (Soynade Research, expressive modular control) +- **Skipped:** Fish Audio S2 — restrictive research license (commercial use requires approval), despite strong features ### Adding a New Engine (Now Straightforward) @@ -402,16 +415,21 @@ The generation form now uses a flat model dropdown with engine-based routing. 
Pe ### Tier 3 — Future (v0.3.0+) -| Item | Notes | -|------|-------| -| XTTS-v2 / Fish Speech / CosyVoice | Multi-engine arch is ready; just needs backend implementation | -| OpenAI-compatible API (plan doc exists) | Low effort once API is stable | -| LoRA fine-tuning (PR #195) | Complex, needs rework for multi-engine | -| External/remote providers | Depends on use case demand | -| GGUF support (#226) | Depends on model ecosystem maturity | -| Queue system (#234) | Batch generation | -| Streaming for non-MLX engines | Currently MLX-only | -| Kokoro-82M | Tiny model, great for CPU-only machines | +| Priority | Item | Notes | +|----------|------|-------| +| 1 | **HumeAI TADA** | Long-form reliability for Stories, synced transcripts. Addresses #234, #203, #191, #111, #69. Needs API vetting. | +| 2 | **Pocket TTS** (Kyutai) | CPU-first 100M model, broadens hardware support. Kyutai ships clean code. Needs API vetting. | +| 3 | **MOSS-TTS** | Text-to-voice design (no ref audio) is unique. Multi-speaker dialogue for Stories. Needs thorough API vetting. | +| 4 | **Kokoro-82M** | 82M params, CPU realtime, Apache 2.0. Easy win. 
| +| 5 | **Model config registry refactor** | Reduce 5-dispatch-point duplication in main.py — do before adding 3+ more engines | +| 6 | XTTS-v2 / Fish Speech / CosyVoice | Multi-engine arch is ready; just needs backend implementation | +| 7 | **VoxCPM 1.5** | Tokenizer-free streaming, interesting but uncertain integration surface | +| 8 | OpenAI-compatible API (plan doc exists) | Low effort once API is stable | +| 9 | LoRA fine-tuning (PR #195) | Complex, needs rework for multi-engine | +| 10 | External/remote providers | Depends on use case demand | +| 11 | GGUF support (#226) | Depends on model ecosystem maturity | +| 12 | Queue system (#234) | Batch generation | +| 13 | Streaming for non-MLX engines | Currently MLX-only | --- diff --git a/tauri/src-tauri/gen/Assets.car b/tauri/src-tauri/gen/Assets.car index a5f68f95..8065a50c 100644 Binary files a/tauri/src-tauri/gen/Assets.car and b/tauri/src-tauri/gen/Assets.car differ diff --git a/tauri/src-tauri/gen/voicebox.icns b/tauri/src-tauri/gen/voicebox.icns index e4492f52..59661d99 100644 Binary files a/tauri/src-tauri/gen/voicebox.icns and b/tauri/src-tauri/gen/voicebox.icns differ diff --git a/tauri/src-tauri/src/main.rs b/tauri/src-tauri/src/main.rs index 157fee9e..ae0f1ed9 100644 --- a/tauri/src-tauri/src/main.rs +++ b/tauri/src-tauri/src/main.rs @@ -16,6 +16,7 @@ struct ServerState { child: Mutex>, server_pid: Mutex>, keep_running_on_close: Mutex, + models_dir: Mutex>, } #[command] @@ -23,7 +24,16 @@ async fn start_server( app: tauri::AppHandle, state: State<'_, ServerState>, remote: Option, + models_dir: Option, ) -> Result { + // Store models_dir for use on restart (empty string means reset to default) + if let Some(ref dir) = models_dir { + if dir.is_empty() { + *state.models_dir.lock().unwrap() = None; + } else { + *state.models_dir.lock().unwrap() = Some(dir.clone()); + } + } // Check if server is already running (managed by this app instance) if state.child.lock().unwrap().is_some() { return 
Ok(format!("http://127.0.0.1:{}", SERVER_PORT)); @@ -274,6 +284,12 @@ async fn start_server( let port_str = SERVER_PORT.to_string(); let is_remote = remote.unwrap_or(false); + // Resolve the custom models directory from the parameter or stored state + let effective_models_dir = models_dir.or_else(|| state.models_dir.lock().unwrap().clone()); + if let Some(ref dir) = effective_models_dir { + println!("Custom models directory: {}", dir); + } + // If CUDA binary exists, launch it directly instead of the bundled sidecar let spawn_result = if let Some(ref cuda_path) = cuda_binary { println!("Launching CUDA backend: {:?}", cuda_path); @@ -282,6 +298,9 @@ async fn start_server( if is_remote { cmd = cmd.args(["--host", "0.0.0.0"]); } + if let Some(ref dir) = effective_models_dir { + cmd = cmd.env("VOICEBOX_MODELS_DIR", dir); + } cmd.spawn() } else { // Use the bundled CPU sidecar @@ -289,6 +308,9 @@ async fn start_server( if is_remote { sidecar = sidecar.args(["--host", "0.0.0.0"]); } + if let Some(ref dir) = effective_models_dir { + sidecar = sidecar.env("VOICEBOX_MODELS_DIR", dir); + } println!("Spawning server process..."); sidecar.spawn() }; @@ -613,9 +635,19 @@ async fn stop_server(state: State<'_, ServerState>) -> Result<(), String> { async fn restart_server( app: tauri::AppHandle, state: State<'_, ServerState>, + models_dir: Option, ) -> Result { println!("restart_server: stopping current server..."); + // Update stored models_dir: empty string means reset to default, non-empty means set + if let Some(ref dir) = models_dir { + if dir.is_empty() { + *state.models_dir.lock().unwrap() = None; + } else { + *state.models_dir.lock().unwrap() = Some(dir.clone()); + } + } + // Stop the current server stop_server(state.clone()).await?; @@ -623,9 +655,9 @@ async fn restart_server( println!("restart_server: waiting for port release..."); tokio::time::sleep(tokio::time::Duration::from_millis(1000)).await; - // Start server again (will auto-detect CUDA binary) + // Start server 
again (will auto-detect CUDA binary and use stored models_dir) println!("restart_server: starting server..."); - start_server(app, state, None).await + start_server(app, state, None, None).await } #[command] @@ -686,6 +718,7 @@ pub fn run() { child: Mutex::new(None), server_pid: Mutex::new(None), keep_running_on_close: Mutex::new(false), + models_dir: Mutex::new(None), }) .manage(audio_capture::AudioCaptureState::new()) .manage(audio_output::AudioOutputState::new()) diff --git a/tauri/src-tauri/tauri.conf.json b/tauri/src-tauri/tauri.conf.json index 5cb83ee2..aa8435ea 100644 --- a/tauri/src-tauri/tauri.conf.json +++ b/tauri/src-tauri/tauri.conf.json @@ -56,7 +56,7 @@ }, "plugins": { "shell": { - "open": true + "open": ".*" }, "updater": { "pubkey": "dW50cnVzdGVkIGNvbW1lbnQ6IG1pbmlzaWduIHB1YmxpYyBrZXk6IEUxRENBQkRBQjdBNTM1OTIKUldTU05hVzMycXZjNGJGcUxmcVVocll2QjdSaTJNdlFxR2M3VDJsMnVvbDdyZGRPMmRlOW9aWTcK", diff --git a/tauri/src/platform/lifecycle.ts b/tauri/src/platform/lifecycle.ts index 60063f3e..d31ddd52 100644 --- a/tauri/src/platform/lifecycle.ts +++ b/tauri/src/platform/lifecycle.ts @@ -5,9 +5,12 @@ import type { PlatformLifecycle } from '@/platform/types'; class TauriLifecycle implements PlatformLifecycle { onServerReady?: () => void; - async startServer(remote = false): Promise { + async startServer(remote = false, modelsDir?: string | null): Promise { try { - const result = await invoke('start_server', { remote }); + const result = await invoke('start_server', { + remote, + modelsDir: modelsDir ?? undefined, + }); console.log('Server started:', result); this.onServerReady?.(); return result; @@ -27,9 +30,11 @@ class TauriLifecycle implements PlatformLifecycle { } } - async restartServer(): Promise { + async restartServer(modelsDir?: string | null): Promise { try { - const result = await invoke('restart_server'); + const result = await invoke('restart_server', { + modelsDir: modelsDir ?? 
undefined, + }); console.log('Server restarted:', result); this.onServerReady?.(); return result; diff --git a/web/src/platform/lifecycle.ts b/web/src/platform/lifecycle.ts index f40f1a90..9a6d825a 100644 --- a/web/src/platform/lifecycle.ts +++ b/web/src/platform/lifecycle.ts @@ -3,7 +3,7 @@ import type { PlatformLifecycle } from '@/platform/types'; class WebLifecycle implements PlatformLifecycle { onServerReady?: () => void; - async startServer(_remote = false): Promise { + async startServer(_remote = false, _modelsDir?: string | null): Promise { // Web assumes server is running externally // Return a default URL - this should be configured via env vars const serverUrl = import.meta.env.VITE_SERVER_URL || 'http://localhost:17493'; @@ -15,7 +15,7 @@ class WebLifecycle implements PlatformLifecycle { // No-op for web - server is managed externally } - async restartServer(): Promise { + async restartServer(_modelsDir?: string | null): Promise { // No-op for web - server is managed externally return import.meta.env.VITE_SERVER_URL || 'http://localhost:17493'; }