Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ backend/.venv/

# Build output
extension/dist/
web/dist/

# Environment
backend/.env
web/.env.local

# OS
.DS_Store
169 changes: 157 additions & 12 deletions backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from dotenv import load_dotenv
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

from services.speechmatics_client import SpeechmaticsClient
from services.minimax_client import MinimaxClient, get_language_name
Expand Down Expand Up @@ -50,6 +51,108 @@ async def health():
return {"status": "ok", "service": "interpreter-backend"}


class VoiceProfileRequest(BaseModel):
    """Request body accepted by ``POST /api/voice-profile``."""

    # Identifier of the user the voice sample belongs to.
    userId: str
    # The voice sample itself, base64-encoded.
    audio: str
    # Audio container/extension of the sample; "webm" unless the client says otherwise.
    format: str = "webm"


@app.post("/api/voice-profile")
async def create_voice_profile(req: VoiceProfileRequest):
    """
    Create a voice profile by uploading audio to the MiniMax file upload API.

    Decodes the base64 sample in ``req.audio``, posts it as a multipart file
    with ``purpose=voice_clone`` and returns ``{"voiceProfileId": <file_id>}``,
    where ``file_id`` is the identifier later passed to voice clone TTS.

    Every failure is reported as a JSON error response instead of an
    unhandled exception:
      * 400 - ``audio`` is not valid base64, or decodes to zero bytes
      * 500 - ``MINIMAX_API_KEY`` is not configured, or the upload failed
              before a usable response was obtained
      * 502 - MiniMax answered with an error status or without a ``file_id``
    """
    import base64
    import binascii
    import io

    from fastapi.responses import JSONResponse

    minimax_key = os.getenv("MINIMAX_API_KEY", "")
    minimax_group_id = os.getenv("MINIMAX_GROUP_ID", "")

    if not minimax_key:
        # Same status an uncaught exception would produce, but with a
        # structured body like the other error paths of this endpoint.
        logger.error("MINIMAX_API_KEY is required")
        return JSONResponse(
            status_code=500,
            content={"error": "MINIMAX_API_KEY is required"},
        )

    # Decode base64 audio; malformed input is a client error, not a crash.
    try:
        audio_bytes = base64.b64decode(req.audio)
    except (binascii.Error, ValueError) as e:
        logger.warning("Invalid base64 audio from user %s: %s", req.userId, e)
        return JSONResponse(
            status_code=400,
            content={"error": "Invalid base64 audio", "detail": str(e)},
        )
    if not audio_bytes:
        return JSONResponse(
            status_code=400,
            content={"error": "Audio sample is empty"},
        )

    logger.info(
        "Uploading voice sample for user %s (%d bytes, format=%s)",
        req.userId, len(audio_bytes), req.format,
    )

    # Upload to MiniMax file upload API. Unknown formats are sent as "wav".
    ext = req.format if req.format in {"mp3", "m4a", "wav", "webm"} else "wav"
    upload_url = "https://api.minimax.io/v1/files/upload"
    # Only send GroupId when configured; httpx takes care of URL-encoding it.
    params = {"GroupId": minimax_group_id} if minimax_group_id else None
    headers = {"Authorization": f"Bearer {minimax_key}"}

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            files = {"file": (f"voice_sample.{ext}", io.BytesIO(audio_bytes))}
            data = {"purpose": "voice_clone"}
            resp = await client.post(
                upload_url, params=params, headers=headers, data=data, files=files
            )
            logger.info("MiniMax file upload status=%s body=%s", resp.status_code, resp.text)
            resp.raise_for_status()
            result = resp.json()
    except httpx.HTTPStatusError as e:
        logger.error("MiniMax upload HTTP error %s: %s", e.response.status_code, e.response.text)
        return JSONResponse(
            status_code=502,
            content={
                "error": f"MiniMax API error: {e.response.status_code}",
                "detail": e.response.text,
            },
        )
    except Exception as e:
        # Endpoint boundary: transport failures and undecodable bodies land here.
        logger.exception("MiniMax upload failed: %s", e)
        return JSONResponse(status_code=500, content={"error": str(e)})

    # MiniMax may return file_id at top level or nested under "file".
    # Guard against a non-object body or a null/non-object "file" value.
    file_id = None
    if isinstance(result, dict):
        nested = result.get("file")
        file_id = result.get("file_id") or (
            nested.get("file_id") if isinstance(nested, dict) else None
        )
    if not file_id:
        logger.error("MiniMax file upload response missing file_id: %s", result)
        return JSONResponse(
            status_code=502,
            content={"error": "No file_id in MiniMax response", "detail": str(result)},
        )

    file_id = str(file_id)
    logger.info("MiniMax file uploaded for user %s: file_id=%s", req.userId, file_id)
    return {"voiceProfileId": file_id}


async def lookup_voice_profile(user_id: str) -> dict | None:
    """
    Fetch a user's voice profile from the Convex HTTP API.

    Issues ``GET {CONVEX_SITE_URL}/api/voice-profile`` with ``userId`` as a
    query parameter and returns the decoded JSON body on HTTP 200 (the
    ``voiceProfileStatus`` and ``voiceProfileId`` keys are logged).

    Returns ``None`` when ``CONVEX_SITE_URL`` or ``user_id`` is empty, on any
    non-200 status, or when the request or decoding raises.
    """
    base_url = os.getenv("CONVEX_SITE_URL", "").rstrip("/")
    if not (base_url and user_id):
        return None

    endpoint = f"{base_url}/api/voice-profile"
    try:
        async with httpx.AsyncClient(timeout=5.0) as http:
            response = await http.get(endpoint, params={"userId": user_id})
            status = response.status_code

            if status == 200:
                profile = response.json()
                logger.info(
                    "Convex voice profile for %s: status=%s, voiceProfileId=%s",
                    user_id,
                    profile.get("voiceProfileStatus"),
                    profile.get("voiceProfileId"),
                )
                return profile

            if status == 404:
                # An absent profile is an expected outcome, not a failure.
                logger.info("No Convex profile found for user %s", user_id)
            else:
                logger.warning(
                    "Convex profile lookup failed: %d %s",
                    status,
                    response.text,
                )
            return None
    except Exception as e:
        # Best-effort lookup: any error means "no profile".
        logger.error("Convex profile lookup error: %s", e)
        return None


@app.websocket("/ws/translate")
async def websocket_translate(ws: WebSocket):
"""
Expand Down Expand Up @@ -91,6 +194,9 @@ async def websocket_translate(ws: WebSocket):
speechmatics_tts = SpeechmaticsTTSClient(api_key=speechmatics_key)
connection_open = True

# Voice profile for clone TTS (set via Convex lookup on config)
user_voice_id: str | None = None

# Buffer for accumulating transcript before translation
transcript_buffer = ""
translation_lock = asyncio.Lock()
Expand Down Expand Up @@ -200,19 +306,36 @@ async def on_translation(text: str, is_final: bool):
logger.error("Error in on_translation: %s", e)

async def synthesize_tts(translated_text: str) -> bytes | None:
if tts_provider == "speechmatics":
audio_data = await speechmatics_tts.text_to_speech(
text=translated_text,
language=target_lang,
)
if audio_data:
return audio_data
logger.warning(
"Speechmatics TTS returned no audio for target language '%s'; "
"falling back to MiniMax TTS.",
target_lang,
)
# If user has a voice clone profile (MiniMax file_id), try clone TTS first
if user_voice_id:
try:
audio_data = await minimax.voice_clone_tts(
text=translated_text,
file_id=user_voice_id,
)
if audio_data:
logger.info("[TTS] Used MiniMax voice clone (file_id=%s)", user_voice_id)
return audio_data
except Exception as e:
logger.warning(
"MiniMax voice clone TTS failed, falling back to standard: %s", e
)

# Primary: Speechmatics TTS
audio_data = await speechmatics_tts.text_to_speech(
text=translated_text,
language=target_lang,
)
if audio_data:
return audio_data

logger.warning(
"Speechmatics TTS returned no audio for target language '%s'; "
"falling back to MiniMax TTS.",
target_lang,
)

# Fallback: MiniMax standard TTS
return await minimax.text_to_speech(
text=translated_text,
language=target_lang,
Expand Down Expand Up @@ -344,6 +467,28 @@ async def translate_and_speak(text: str):
)
logger.info("TTS provider: %s", tts_provider)

# Look up voice profile from Convex if user_id provided
config_user_id = msg.get("user_id", "")
if config_user_id:
profile = await lookup_voice_profile(config_user_id)
if (
profile
and profile.get("voiceProfileStatus") == "ready"
and profile.get("voiceProfileId")
):
user_voice_id = profile["voiceProfileId"]
logger.info(
"Using voice profile %s for user %s",
user_voice_id,
config_user_id,
)
else:
user_voice_id = None
logger.info(
"No ready voice profile for user %s, using default",
config_user_id,
)

# Initialize Speechmatics with the source language
if speechmatics:
await speechmatics.close()
Expand Down
93 changes: 92 additions & 1 deletion backend/services/minimax_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
# MiniMax API endpoints (current official domain)
CHAT_URL = "https://api.minimax.io/v1/text/chatcompletion_v2"
TTS_WS_URL = "wss://api.minimax.io/ws/v1/t2a_v2"
FILE_UPLOAD_URL = "https://api.minimax.io/v1/files/upload"
VOICE_CLONE_URL = "https://api.minimax.io/v1/voice_clone"

DEFAULT_TEXT_MODEL = "MiniMax-M2"
DEFAULT_TTS_MODEL = "speech-2.8-turbo"
Expand Down Expand Up @@ -273,15 +275,17 @@ async def text_to_speech(
self,
text: str,
language: str = "en",
voice_id: Optional[str] = None,
) -> Optional[bytes]:
"""
Generate speech audio using MiniMax T2A WebSocket API.
Returns MP3 audio bytes assembled from streaming chunks.
If voice_id is provided, it overrides the language-based voice selection.
"""
if not text.strip():
return None

voice_id = self._resolve_voice_id(language)
voice_id = voice_id or self._resolve_voice_id(language)
try:
return await self._text_to_speech_once(text=text, voice_id=voice_id)
except ValueError as e:
Expand Down Expand Up @@ -447,6 +451,93 @@ async def _wait_for_event(self, ws, expected_events: set[str], timeout: float) -
)
raise ValueError(f"MiniMax TTS task_failed: {status_msg}")

async def upload_file(
    self,
    audio_bytes: bytes,
    filename: str = "voice_sample.wav",
    purpose: str = "voice_clone",
) -> str:
    """
    Upload an audio file to the MiniMax File API.

    Sends ``audio_bytes`` as a multipart ``file`` part named ``filename``
    together with a ``purpose`` form field, authenticated with
    ``self.api_key``.

    Returns:
        The ``file_id`` from the response, as a string, for use with the
        voice clone API. It is read from the top level of the response or
        from the nested ``"file"`` object, whichever is present.

    Raises:
        httpx.HTTPStatusError: the API answered with an error status.
        ValueError: the response carried no ``file_id``.
    """
    headers = {"Authorization": f"Bearer {self.api_key}"}
    files = {"file": (filename, audio_bytes)}
    data = {"purpose": purpose}

    resp = await self.client.post(
        FILE_UPLOAD_URL,
        headers=headers,
        data=data,
        files=files,
    )
    resp.raise_for_status()
    result = resp.json()

    # Accept file_id at the top level or under "file", and tolerate a
    # null / non-object "file" value instead of raising AttributeError.
    file_id = None
    if isinstance(result, dict):
        nested = result.get("file")
        file_id = result.get("file_id") or (
            nested.get("file_id") if isinstance(nested, dict) else None
        )
    if not file_id:
        raise ValueError(f"MiniMax file upload returned no file_id: {result}")

    # Normalise to str so the declared return type holds for numeric ids.
    file_id = str(file_id)
    logger.info("MiniMax file uploaded: %s (purpose=%s)", file_id, purpose)
    return file_id

async def voice_clone_tts(
    self,
    text: str,
    file_id: str,
    voice_id: str = "cloned_voice",
    model: str = "speech-2.6-hd",
) -> Optional[bytes]:
    """
    Synthesize speech using the MiniMax voice clone API.

    POSTs a JSON payload with ``file_id``, ``voice_id``, ``text`` and
    ``model`` to ``VOICE_CLONE_URL`` and base64-decodes the ``"audio"``
    field of the JSON response.

    Returns:
        The decoded audio bytes, or ``None`` when ``text`` is blank or the
        response has no ``"audio"`` value.

    Raises:
        httpx.HTTPStatusError: the API answered with an error status.
        httpx.RequestError: the request failed at the transport level.
        ValueError: the response body or audio payload could not be decoded.
        All of these are logged before being re-raised.
    """
    import base64

    if not text.strip():
        return None

    payload = {
        "file_id": file_id,
        "voice_id": voice_id,
        "text": text,
        "model": model,
    }

    try:
        resp = await self.client.post(
            VOICE_CLONE_URL,
            headers=self._headers,
            json=payload,
            timeout=30.0,
        )
        resp.raise_for_status()
        result = resp.json()

        # NOTE(review): assumes the response carries base64-encoded audio
        # under a top-level "audio" key - confirm against the MiniMax
        # voice clone API reference.
        audio_b64 = result.get("audio", "") if isinstance(result, dict) else ""
        if not audio_b64:
            logger.warning("MiniMax voice clone returned no audio: %s", result)
            return None

        audio_bytes = base64.b64decode(audio_b64)
        logger.info(
            "MiniMax voice clone TTS generated %d bytes (voice_id=%s)",
            len(audio_bytes),
            voice_id,
        )
        return audio_bytes

    except httpx.HTTPStatusError as e:
        logger.error(
            "MiniMax voice clone HTTP error: %s - %s",
            e.response.status_code,
            e.response.text,
        )
        raise
    except (httpx.RequestError, ConnectionError, ValueError) as e:
        # httpx reports timeouts and connection failures as RequestError
        # subclasses, not the builtin ConnectionError; catch both so that
        # transport failures are logged here before propagating.
        logger.error("MiniMax voice clone error: %s", e)
        raise

async def close(self):
    """Shut down the HTTP client held in ``self.client``."""
    http_client = self.client
    await http_client.aclose()
2 changes: 2 additions & 0 deletions web/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

.env.local
Loading