shlawgathon · xiejosh · Feb 22, 2026
diff --git a/.gitignore b/.gitignore
@@ -6,9 +6,11 @@ backend/.venv/
 
 # Build output
 extension/dist/
+web/dist/
 
 # Environment
 backend/.env
+web/.env.local
 
 # OS
 .DS_Store
diff --git a/backend/main.py b/backend/main.py
@@ -14,6 +14,7 @@
 from dotenv import load_dotenv
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
 
 from services.speechmatics_client import SpeechmaticsClient
 from services.minimax_client import MinimaxClient, get_language_name
@@ -50,6 +51,108 @@ async def health():
     return {"status": "ok", "service": "interpreter-backend"}
 
 
+class VoiceProfileRequest(BaseModel):  # JSON body of POST /api/voice-profile
+    userId: str  # caller-supplied user identifier; the handler only logs it
+    audio: str  # base64-encoded audio
+    format: str = "webm"  # extension hint; the handler falls back to "wav" for unrecognized values
+
+
+@app.post("/api/voice-profile")
+async def create_voice_profile(req: VoiceProfileRequest):
+    """
+    Create a voice profile by uploading audio to MiniMax file upload API.
+
+    Returns {"voiceProfileId": file_id} on success. Failures are JSON bodies with
+    an "error" key: 400 for empty/undecodable audio, 500 for a missing API key or
+    a transport/parse failure, 502 for a MiniMax error status or missing file_id.
+    """
+    import base64
+    from fastapi.responses import JSONResponse
+
+    minimax_key = os.getenv("MINIMAX_API_KEY", "")
+    minimax_group_id = os.getenv("MINIMAX_GROUP_ID", "")
+    if not minimax_key:  # answered as JSON, like every other failure path
+        logger.error("MINIMAX_API_KEY is not configured")
+        return JSONResponse(status_code=500, content={"error": "MINIMAX_API_KEY is required"})
+    try:
+        audio_bytes = base64.b64decode(req.audio)
+    except ValueError:  # binascii.Error (bad padding) subclasses ValueError
+        audio_bytes = b""
+    if not audio_bytes:
+        return JSONResponse(status_code=400, content={"error": "audio must be non-empty, valid base64"})
+    logger.info(
+        "Uploading voice sample for user %s (%d bytes, format=%s)",
+        req.userId, len(audio_bytes), req.format,
+    )
+
+    ext = req.format if req.format in {"mp3", "m4a", "wav", "webm"} else "wav"
+    upload_url = "https://api.minimax.io/v1/files/upload"
+    params = {"GroupId": minimax_group_id} if minimax_group_id else None  # httpx encodes it
+    headers = {"Authorization": f"Bearer {minimax_key}"}
+    files = {"file": (f"voice_sample.{ext}", audio_bytes)}
+    data = {"purpose": "voice_clone"}
+    try:
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            resp = await client.post(upload_url, params=params, headers=headers, data=data, files=files)
+            logger.info("MiniMax file upload status=%s body=%s", resp.status_code, resp.text)
+            resp.raise_for_status()
+            result = resp.json()
+            # MiniMax may return file_id at top level or nested under "file"
+            file_id = result.get("file_id") or (result.get("file") or {}).get("file_id")
+    except httpx.HTTPStatusError as e:
+        logger.error("MiniMax upload HTTP error %s: %s", e.response.status_code, e.response.text)
+        return JSONResponse(status_code=502, content={"error": f"MiniMax API error: {e.response.status_code}", "detail": e.response.text})
+    except Exception as e:
+        logger.error("MiniMax upload failed: %s", e)
+        return JSONResponse(status_code=500, content={"error": str(e)})
+
+    if not file_id:
+        logger.error("MiniMax file upload response missing file_id: %s", result)
+        return JSONResponse(status_code=502, content={"error": "No file_id in MiniMax response", "detail": str(result)})
+    logger.info("MiniMax file uploaded for user %s: file_id=%s", req.userId, file_id)
+    return {"voiceProfileId": str(file_id)}
+
+
+async def lookup_voice_profile(user_id: str) -> dict | None:
+    """
+    Fetch a user's voice profile from the Convex HTTP API.
+
+    Sends GET {CONVEX_SITE_URL}/api/voice-profile with userId as a query
+    parameter and returns the decoded JSON body of a 200 response. Returns None
+    when CONVEX_SITE_URL or user_id is empty, for any other status, and when
+    anything in the lookup raises (the error is logged, never propagated).
+    """
+    base_url = os.getenv("CONVEX_SITE_URL", "").rstrip("/")
+    if not (base_url and user_id):
+        return None
+
+    try:
+        async with httpx.AsyncClient(timeout=5.0) as http:
+            response = await http.get(
+                f"{base_url}/api/voice-profile",
+                params={"userId": user_id},
+            )
+            status = response.status_code
+            if status == 404:
+                logger.info("No Convex profile found for user %s", user_id)
+                return None
+            if status != 200:
+                logger.warning("Convex profile lookup failed: %d %s", status, response.text)
+                return None
+            # 200: log the two fields the caller keys on, then hand back the body.
+            found = response.json()
+            logger.info(
+                "Convex voice profile for %s: status=%s, voiceProfileId=%s",
+                user_id,
+                found.get("voiceProfileStatus"),
+                found.get("voiceProfileId"),
+            )
+            return found
+    except Exception as exc:
+        logger.error("Convex profile lookup error: %s", exc)
+        return None
+
+
 @app.websocket("/ws/translate")
 async def websocket_translate(ws: WebSocket):
     """
@@ -91,6 +194,9 @@ async def websocket_translate(ws: WebSocket):
     speechmatics_tts = SpeechmaticsTTSClient(api_key=speechmatics_key)
     connection_open = True
 
+    # Voice profile for clone TTS (set via Convex lookup on config)
+    user_voice_id: str | None = None
+
     # Buffer for accumulating transcript before translation
     transcript_buffer = ""
     translation_lock = asyncio.Lock()
@@ -200,19 +306,36 @@ async def on_translation(text: str, is_final: bool):
             logger.error("Error in on_translation: %s", e)
 
     async def synthesize_tts(translated_text: str) -> bytes | None:
-        if tts_provider == "speechmatics":
-            audio_data = await speechmatics_tts.text_to_speech(
-                text=translated_text,
-                language=target_lang,
-            )
-            if audio_data:
-                return audio_data
-            logger.warning(
-                "Speechmatics TTS returned no audio for target language '%s'; "
-                "falling back to MiniMax TTS.",
-                target_lang,
-            )
+        # If user has a voice clone profile (MiniMax file_id), try clone TTS first
+        if user_voice_id:
+            try:
+                audio_data = await minimax.voice_clone_tts(
+                    text=translated_text,
+                    file_id=user_voice_id,
+                )
+                if audio_data:
+                    logger.info("[TTS] Used MiniMax voice clone (file_id=%s)", user_voice_id)
+                    return audio_data
+            except Exception as e:
+                logger.warning(
+                    "MiniMax voice clone TTS failed, falling back to standard: %s", e
+                )
+
+        # Primary: Speechmatics TTS
+        audio_data = await speechmatics_tts.text_to_speech(
+            text=translated_text,
+            language=target_lang,
+        )
+        if audio_data:
+            return audio_data
+
+        logger.warning(
+            "Speechmatics TTS returned no audio for target language '%s'; "
+            "falling back to MiniMax TTS.",
+            target_lang,
+        )
 
+        # Fallback: MiniMax standard TTS
         return await minimax.text_to_speech(
             text=translated_text,
             language=target_lang,
@@ -344,6 +467,28 @@ async def translate_and_speak(text: str):
                     )
                     logger.info("TTS provider: %s", tts_provider)
 
+                    # Look up voice profile from Convex if user_id provided
+                    config_user_id = msg.get("user_id", "")
+                    if config_user_id:
+                        profile = await lookup_voice_profile(config_user_id)
+                        if (
+                            profile
+                            and profile.get("voiceProfileStatus") == "ready"
+                            and profile.get("voiceProfileId")
+                        ):
+                            user_voice_id = profile["voiceProfileId"]
+                            logger.info(
+                                "Using voice profile %s for user %s",
+                                user_voice_id,
+                                config_user_id,
+                            )
+                        else:
+                            user_voice_id = None
+                            logger.info(
+                                "No ready voice profile for user %s, using default",
+                                config_user_id,
+                            )
+
                     # Initialize Speechmatics with the source language
                     if speechmatics:
                         await speechmatics.close()

diff --git a/backend/services/minimax_client.py b/backend/services/minimax_client.py
@@ -19,6 +19,8 @@
 # MiniMax API endpoints (current official domain)
 CHAT_URL = "https://api.minimax.io/v1/text/chatcompletion_v2"
 TTS_WS_URL = "wss://api.minimax.io/ws/v1/t2a_v2"
+FILE_UPLOAD_URL = "https://api.minimax.io/v1/files/upload"
+VOICE_CLONE_URL = "https://api.minimax.io/v1/voice_clone"
 
 DEFAULT_TEXT_MODEL = "MiniMax-M2"
 DEFAULT_TTS_MODEL = "speech-2.8-turbo"
@@ -273,15 +275,17 @@ async def text_to_speech(
         self,
         text: str,
         language: str = "en",
+        voice_id: Optional[str] = None,
     ) -> Optional[bytes]:
         """
         Generate speech audio using MiniMax T2A WebSocket API.
         Returns MP3 audio bytes assembled from streaming chunks.
+        If voice_id is provided, it overrides the language-based voice selection.
         """
         if not text.strip():
             return None
 
-        voice_id = self._resolve_voice_id(language)
+        voice_id = voice_id or self._resolve_voice_id(language)
         try:
             return await self._text_to_speech_once(text=text, voice_id=voice_id)
         except ValueError as e:
@@ -447,6 +451,93 @@ async def _wait_for_event(self, ws, expected_events: set[str], timeout: float) -
                 )
                 raise ValueError(f"MiniMax TTS task_failed: {status_msg}")
 
+    async def upload_file(
+        self,
+        audio_bytes: bytes,
+        filename: str = "voice_sample.wav",
+        purpose: str = "voice_clone",
+    ) -> str:
+        """
+        Upload an audio file to MiniMax File API.
+
+        Returns the file_id as a string, for use with the voice clone API.
+        Raises ValueError when the reply carries no file_id; httpx errors
+        (including HTTPStatusError from an error status) propagate unchanged.
+        """
+        resp = await self.client.post(
+            FILE_UPLOAD_URL,
+            # Authorization only, so httpx can set the multipart Content-Type itself.
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            data={"purpose": purpose},
+            files={"file": (filename, audio_bytes)},
+        )
+        resp.raise_for_status()
+        result = resp.json()
+
+        # Accept file_id at the top level or under "file"; tolerate "file": null.
+        file_id = result.get("file_id") or (result.get("file") or {}).get("file_id")
+        if not file_id:
+            raise ValueError(f"MiniMax file upload returned no file_id: {result}")
+        logger.info("MiniMax file uploaded: %s (purpose=%s)", file_id, purpose)
+        return str(file_id)  # honor the declared return type even for numeric ids
+
+    async def voice_clone_tts(
+        self,
+        text: str,
+        file_id: str,
+        voice_id: str = "cloned_voice",
+        model: str = "speech-2.6-hd",
+    ) -> Optional[bytes]:
+        """
+        Synthesize speech using MiniMax voice clone API.
+
+        Posts file_id, voice_id, text and model to VOICE_CLONE_URL and decodes the
+        base64 "audio" field of the JSON reply. Returns the audio bytes, or None
+        for blank text or a reply without "audio". HTTP, transport and decode
+        errors are logged and re-raised.
+        """
+        import base64
+
+        if not text.strip():
+            return None
+
+        # NOTE(review): the default voice_id is shared by every caller that omits it — confirm intended.
+        payload = {"file_id": file_id, "voice_id": voice_id, "text": text, "model": model}
+
+        try:
+            resp = await self.client.post(
+                VOICE_CLONE_URL, headers=self._headers, json=payload, timeout=30.0
+            )
+            resp.raise_for_status()
+            result = resp.json()
+
+            # NOTE(review): assumes the reply carries base64 audio under "audio";
+            # confirm against the MiniMax voice clone response schema.
+            audio_b64 = result.get("audio", "")
+            if not audio_b64:
+                logger.warning("MiniMax voice clone returned no audio: %s", result)
+                return None
+
+            audio_bytes = base64.b64decode(audio_b64)
+            logger.info(
+                "MiniMax voice clone TTS generated %d bytes (voice_id=%s)",
+                len(audio_bytes),
+                voice_id,
+            )
+            return audio_bytes
+
+        except httpx.HTTPStatusError as e:
+            logger.error(
+                "MiniMax voice clone HTTP error: %s - %s",
+                e.response.status_code,
+                e.response.text,
+            )
+            raise
+        except (httpx.RequestError, ConnectionError, ValueError) as e:
+            # httpx.RequestError: timeouts/connect failures (not builtin ConnectionError).
+            logger.error("MiniMax voice clone error: %s", e)
+            raise
+
     async def close(self):
         """Close the HTTP client."""
         await self.client.aclose()
diff --git a/web/.gitignore b/web/.gitignore
@@ -0,0 +1,2 @@
+
+.env.local