-
Notifications
You must be signed in to change notification settings - Fork 1.5k
feat: chunked TTS generation with quality selector #99
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -49,10 +49,16 @@ class Config: | |
| from_attributes = True | ||
|
|
||
|
|
||
class TTSSettingsUpdate(BaseModel):
    """Request model for updating TTS settings.

    Both fields are optional; an omitted field leaves the current
    setting unchanged. Bounds mirror the runtime checks in the
    chunked-TTS module (quality preset names; chunk size 100-5000).
    """

    # Output quality preset: "standard" (native rate) or "high" (upsampled).
    quality: Optional[str] = Field(None, pattern="^(standard|high)$")
    # Maximum characters per generation chunk.
    max_chunk_chars: Optional[int] = Field(None, ge=100, le=5000)
|
|
||
|
|
||
| class GenerationRequest(BaseModel): | ||
| """Request model for voice generation.""" | ||
| profile_id: str | ||
| text: str = Field(..., min_length=1, max_length=5000) | ||
| text: str = Field(..., min_length=1, max_length=50000) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain 🏁 Script executed: #!/bin/bash
# Verify the frontend validation limits
echo "=== Checking frontend text length validation ==="
rg -n 'max.*5000|max_length.*5000' --type=ts --type=tsx -g '!node_modules'
rg -n 'Max 5000' --type=ts --type=tsx -g '!node_modules'Repository: jamiepine/voicebox Length of output: 183 🏁 Script executed: #!/bin/bash
# Find the frontend files mentioned in the review
echo "=== Locating frontend files ==="
find . -name "useGenerationForm.ts" 2>/dev/null
find . -name "GenerationForm.tsx" 2>/dev/null
# Search for 5000 character limits using a broader pattern
echo -e "\n=== Searching for 5000 character validation ==="
rg -n '5000' --max-count=20 2>/dev/null | grep -E '(max|Max|length)'Repository: jamiepine/voicebox Length of output: 1025 🏁 Script executed: #!/bin/bash
echo "=== useGenerationForm.ts around line 14 ==="
sed -n '10,20p' ./app/src/lib/hooks/useGenerationForm.ts
echo -e "\n=== GenerationForm.tsx around lines 85-86 ==="
sed -n '80,90p' ./app/src/components/Generation/GenerationForm.tsx
echo -e "\n=== $GenerationRequest.ts around line 15 ==="
sed -n '10,20p' ./app/src/lib/api/schemas/\$GenerationRequest.ts
echo -e "\n=== backend/utils/validation.py around line 9 ==="
sed -n '1,15p' backend/utils/validation.py
echo -e "\n=== Check if validation.py's validate_text is used for the text field ==="
rg -n 'validate_text' --max-count=10Repository: jamiepine/voicebox Length of output: 1849 🏁 Script executed: #!/bin/bash
echo "=== Check where validate_text is called ==="
rg -n 'validate_text' -A 2 -B 2
echo -e "\n=== Find the API endpoint handler for generation ==="
rg -n 'def.*generation|@.*post|@.*app' backend/ --max-count=20 | head -30
echo -e "\n=== Look for the GenerationRequest handler ==="
rg -n 'GenerationRequest' backend/ -A 3 -B 1 | head -40Repository: jamiepine/voicebox Length of output: 3978 Frontend validation mismatch will block users from using the expanded limit. The backend now accepts up to 50,000 characters, but the frontend still enforces a 5,000 character limit in three places:
Users won't be able to submit texts longer than 5,000 characters from the UI despite the backend supporting it. 🤖 Prompt for AI Agents |
||
| language: str = Field(default="en", pattern="^(zh|en|ja|ko|de|fr|ru|pt|es|it|he)$") | ||
| seed: Optional[int] = Field(None, ge=0) | ||
| model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B)$") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,176 @@ | ||
| """ | ||
| Chunked TTS generation with quality selection. | ||
|
|
||
| Splits long text into sentence-boundary chunks, generates audio per-chunk, | ||
| and concatenates with crossfade. Optionally upsamples to 44.1kHz for | ||
| higher quality output. | ||
|
|
||
| Environment variables: | ||
| TTS_QUALITY: "standard" (24kHz native) or "high" (44.1kHz upsampled) | ||
| TTS_MAX_CHUNK_CHARS: Max characters per chunk (default 800) | ||
| TTS_UPSAMPLE_RATE: Target sample rate for high quality (default 44100) | ||
| """ | ||
|
|
||
| import logging | ||
| import os | ||
| import re | ||
| from typing import List | ||
|
|
||
| import numpy as np | ||
|
|
||
| logger = logging.getLogger("voicebox.chunked-tts") | ||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Runtime-mutable settings | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
| _tts_settings = { | ||
| "quality": os.getenv("TTS_QUALITY", "standard"), | ||
| "max_chunk_chars": int(os.getenv("TTS_MAX_CHUNK_CHARS", "800")), | ||
| "upsample_rate": int(os.getenv("TTS_UPSAMPLE_RATE", "44100")), | ||
| } | ||
|
Comment on lines
+27
to
+31
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Invalid environment variable will crash at import time.
🛡️ Proposed fix: Add validation with defaults+def _parse_int_env(key: str, default: int) -> int:
+ """Parse integer from env var with fallback to default."""
+ val = os.getenv(key)
+ if val is None:
+ return default
+ try:
+ return int(val)
+ except ValueError:
+ logger.warning(f"Invalid {key}='{val}', using default {default}")
+ return default
+
+
_tts_settings = {
"quality": os.getenv("TTS_QUALITY", "standard"),
- "max_chunk_chars": int(os.getenv("TTS_MAX_CHUNK_CHARS", "800")),
- "upsample_rate": int(os.getenv("TTS_UPSAMPLE_RATE", "44100")),
+ "max_chunk_chars": _parse_int_env("TTS_MAX_CHUNK_CHARS", 800),
+ "upsample_rate": _parse_int_env("TTS_UPSAMPLE_RATE", 44100),
}🤖 Prompt for AI Agents |
||
|
|
||
# Maps each user-facing quality preset to its output sample rate in Hz.
QUALITY_RATES = {
    "standard": 24000,  # Qwen3-TTS native sample rate
    "high": 44100,  # CD-quality upsampled via soxr
}
|
|
||
|
|
||
def get_tts_settings() -> dict:
    """Return a snapshot of the current TTS chunking/quality settings."""
    current_quality = _tts_settings["quality"]
    # Unknown preset falls back to the native 24kHz rate.
    sample_rate = QUALITY_RATES.get(current_quality, 24000)
    return {
        "quality": current_quality,
        "sample_rate": sample_rate,
        "max_chunk_chars": _tts_settings["max_chunk_chars"],
        "available_qualities": list(QUALITY_RATES.keys()),
    }
|
|
||
|
|
||
def update_tts_settings(updates: dict) -> dict:
    """Update TTS settings at runtime and return the new settings.

    Validates every supplied field before applying any of them, so a
    rejected value never leaves the settings partially modified (the
    previous version applied "quality" before validating
    "max_chunk_chars").

    Args:
        updates: Mapping that may contain "quality" and/or "max_chunk_chars".

    Returns:
        The full settings dict, as produced by get_tts_settings().

    Raises:
        ValueError: If a supplied value is out of range or malformed.
    """
    validated: dict = {}
    if "quality" in updates:
        quality = updates["quality"]
        if quality not in QUALITY_RATES:
            raise ValueError(
                f"Invalid quality '{quality}'. Must be one of {list(QUALITY_RATES.keys())}"
            )
        validated["quality"] = quality
    if "max_chunk_chars" in updates:
        try:
            max_chars = int(updates["max_chunk_chars"])
        except (TypeError, ValueError) as exc:
            # Surface a consistent ValueError instead of leaking int()'s
            # TypeError for non-numeric inputs such as None.
            raise ValueError("max_chunk_chars must be an integer") from exc
        if not 100 <= max_chars <= 5000:
            raise ValueError("max_chunk_chars must be between 100 and 5000")
        validated["max_chunk_chars"] = max_chars
    # All checks passed: apply atomically.
    _tts_settings.update(validated)
    return get_tts_settings()
|
|
||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Text splitting | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
| def split_text_into_chunks(text: str, max_chars: int = 800) -> List[str]: | ||
| """Split text at sentence boundaries, with clause and word fallbacks. | ||
|
|
||
| Priority: sentence-end (.!?) > clause boundary (;:,) > whitespace > hard cut. | ||
| """ | ||
| text = text.strip() | ||
| if not text: | ||
| return [] | ||
| if len(text) <= max_chars: | ||
| return [text] | ||
|
|
||
| chunks: List[str] = [] | ||
| remaining = text | ||
|
|
||
| while remaining: | ||
| remaining = remaining.strip() | ||
| if not remaining: | ||
| break | ||
| if len(remaining) <= max_chars: | ||
| chunks.append(remaining) | ||
| break | ||
|
|
||
| segment = remaining[:max_chars] | ||
|
|
||
| # Try to find last sentence end | ||
| split_pos = _find_last_sentence_end(segment) | ||
| if split_pos == -1: | ||
| split_pos = _find_last_clause_boundary(segment) | ||
| if split_pos == -1: | ||
| split_pos = segment.rfind(" ") | ||
| if split_pos == -1: | ||
| split_pos = max_chars - 1 | ||
|
|
||
| chunk = remaining[: split_pos + 1].strip() | ||
| if chunk: | ||
| chunks.append(chunk) | ||
| remaining = remaining[split_pos + 1 :] | ||
|
|
||
| return chunks | ||
|
|
||
|
|
||
| def _find_last_sentence_end(text: str) -> int: | ||
| best = -1 | ||
| for m in re.finditer(r"[.!?](?:\s|$)", text): | ||
| best = m.start() | ||
| return best | ||
|
|
||
|
|
||
| def _find_last_clause_boundary(text: str) -> int: | ||
| best = -1 | ||
| for m in re.finditer(r"[;:,\u2014](?:\s|$)", text): | ||
| best = m.start() | ||
| return best | ||
|
|
||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Audio concatenation | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
def concatenate_audio_chunks(
    chunks: List[np.ndarray],
    sr: int,
    crossfade_ms: int = 50,
) -> np.ndarray:
    """Join audio arrays end-to-end, crossfading each seam to avoid clicks.

    Args:
        chunks: Audio arrays assumed to share the sample rate *sr*.
        sr: Sample rate in Hz, used to size the crossfade window.
        crossfade_ms: Crossfade length in milliseconds (0 disables it).

    Returns:
        The concatenated waveform; an empty float32 array for no chunks.
    """
    if not chunks:
        return np.array([], dtype=np.float32)
    if len(chunks) == 1:
        return chunks[0]

    fade_len = int(sr * crossfade_ms / 1000)
    out = chunks[0].copy()

    for piece in chunks[1:]:
        if len(piece) == 0:
            continue
        # Overlap is capped by what both sides can actually provide.
        overlap = min(fade_len, len(out), len(piece))
        if overlap <= 0:
            out = np.concatenate([out, piece])
            continue
        ramp_down = np.linspace(1.0, 0.0, overlap, dtype=np.float32)
        ramp_up = np.linspace(0.0, 1.0, overlap, dtype=np.float32)
        out[-overlap:] = out[-overlap:] * ramp_down + piece[:overlap] * ramp_up
        out = np.concatenate([out, piece[overlap:]])

    return out
|
|
||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Resampling | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
def resample_audio(audio: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
    """Resample *audio* from *src_rate* to *dst_rate* Hz.

    Uses soxr (very-high-quality mode) when installed, otherwise falls
    back to linear interpolation. Returns the input unchanged when the
    rates already match.

    Args:
        audio: 1-D waveform samples — assumed mono; verify at call sites.
        src_rate: Source sample rate in Hz.
        dst_rate: Target sample rate in Hz.

    Returns:
        Resampled audio (float32 on the fallback path; the soxr path
        returns whatever dtype soxr produces).
    """
    if src_rate == dst_rate:
        return audio
    # Empty input would crash the fallback: np.interp raises ValueError on
    # an empty sample-point array. Short-circuit before either path.
    if len(audio) == 0:
        return np.array([], dtype=np.float32)
    try:
        import soxr

        return soxr.resample(audio, src_rate, dst_rate, quality="VHQ")
    except ImportError:
        logger.warning("soxr not installed; falling back to linear interpolation")
        ratio = dst_rate / src_rate
        new_len = int(len(audio) * ratio)
        indices = np.linspace(0, len(audio) - 1, new_len)
        return np.interp(indices, np.arange(len(audio)), audio).astype(np.float32)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Preserve exception chain with
raise ... from e. The static analysis tool flagged line 1483: within an
exceptclause, exceptions should be raised withraise ... from eto preserve the exception chain for debugging.🔧 Proposed fix
📝 Committable suggestion
🧰 Tools
🪛 Ruff (0.15.5)
[warning] 1483-1483: Within an
exceptclause, raise exceptions withraise ... from errorraise ... from Noneto distinguish them from errors in exception handling(B904)
🤖 Prompt for AI Agents