# Process-wide cache for the Silero VAD model and its helper tuple.
_vad_model = None
_vad_utils = None


def _get_silero_vad():
    """Return the cached ``(model, utils)`` pair for Silero VAD.

    The pair is fetched with ``torch.hub.load`` on the first call and kept in
    module globals, so later calls return the same objects without reloading.
    ``torch`` is imported only when a load is actually needed.
    """
    global _vad_model, _vad_utils

    # Fast path: the model was already loaded by an earlier call.
    if _vad_model is not None:
        return _vad_model, _vad_utils

    import torch

    loaded = torch.hub.load(
        repo_or_dir=SILERO_VAD_REPO,
        model=SILERO_VAD_MODEL,
        force_reload=False,
        trust_repo=True,
    )
    _vad_model, _vad_utils = loaded
    return _vad_model, _vad_utils
def _load_wav_fallback(path: str, target_sr: int = SILERO_SAMPLING_RATE):
    """Load a WAV file with SciPy as a 1-D float tensor sampled at ``target_sr``.

    Fallback for when Silero's ``read_audio`` (torchaudio) cannot open the
    file, e.g. because an audio backend is missing and it raises
    OSError/RuntimeError. Samples are scaled to [-1, 1], multi-channel audio
    is averaged to mono, and the signal is resampled when the file's rate
    differs from ``target_sr``, so the result has the form the VAD call in
    ``validate_uploaded_audio`` is given.

    Args:
        path: Path to the WAV file.
        target_sr: Sampling rate, in Hz, of the returned tensor.

    Returns:
        A 1-D ``torch.float32`` tensor, or ``None`` when the file cannot be
        parsed, holds no samples, uses an unsupported sample dtype, or is too
        short to yield a single sample at ``target_sr``.
    """
    import torch

    try:
        sr, data = wavfile.read(path)
    except Exception:
        # Best-effort fallback: any parse failure just means "not loadable".
        return None
    if data is None or data.size == 0:
        return None

    # Scale integer PCM to float32 in [-1, 1].
    if data.dtype == np.uint8:
        # 8-bit PCM WAV is unsigned with its zero level at 128.
        data = (data.astype(np.float32) - 128.0) / 128.0
    elif data.dtype == np.int16:
        data = data.astype(np.float32) / 32768.0
    elif data.dtype == np.int32:
        data = data.astype(np.float32) / 2147483648.0
    elif data.dtype != np.float32 and data.dtype != np.float64:
        return None

    # Down-mix (samples, channels) to mono.
    if data.ndim > 1:
        data = data.mean(axis=1)

    if sr != target_sr:
        num_samples = int(len(data) * target_sr / sr)
        if num_samples <= 0:
            # Clip is too short to produce even one sample at target_sr.
            return None
        data = resample(data, num_samples).astype(np.float32)

    return torch.from_numpy(data).float()
def validate_uploaded_audio(temp_file_path: str, content: bytes) -> None:
    """Reject uploads that are empty, unreadable, too short, or contain no speech.

    Speech is detected with Silero VAD using ``SILERO_VAD_THRESHOLD``; the
    summed length of the detected speech segments must reach
    ``MIN_SPEECH_DURATION_SEC`` seconds.

    Args:
        temp_file_path: Path of the temporary file holding the upload.
        content: Raw bytes of the upload; only its size is checked here.

    Raises:
        HTTPException: status 400 when the upload itself is unacceptable
            (too small, unreadable, too short, no or too little speech);
            status 500 when the validation step fails unexpectedly.
    """
    if not content or len(content) < MIN_AUDIO_SIZE_BYTES:
        raise HTTPException(
            status_code=400,
            detail="Empty or invalid audio file. Please upload a non-empty audio file.",
        )

    try:
        import torch

        vad_model, vad_utils = _get_silero_vad()
        # Positions in the utils tuple returned alongside the model.
        get_speech_timestamps_fn = vad_utils[0]
        read_audio_fn = vad_utils[2]

        try:
            wav = read_audio_fn(temp_file_path, sampling_rate=SILERO_SAMPLING_RATE)
        except (OSError, RuntimeError) as e:
            logger.info("Silero read_audio failed (%s), trying scipy WAV fallback", e)
            wav = _load_wav_fallback(temp_file_path)

        if wav is None or (hasattr(wav, "numel") and wav.numel() == 0):
            raise HTTPException(
                status_code=400,
                detail="Audio file could not be read or is empty.",
            )

        # The VAD call is given a 1-D tensor; drop singleton dimensions.
        if torch.is_tensor(wav):
            if wav.dim() > 1:
                wav = wav.squeeze()
            # NOTE(review): 512 is presumably one VAD window at 16 kHz - confirm.
            min_samples = 512
            if wav.numel() < min_samples:
                raise HTTPException(
                    status_code=400,
                    detail="Audio is too short to analyze. Please upload a longer recording.",
                )

        speech_timestamps = get_speech_timestamps_fn(
            wav,
            vad_model,
            threshold=SILERO_VAD_THRESHOLD,
            sampling_rate=SILERO_SAMPLING_RATE,
            return_seconds=False,
        )

        if not speech_timestamps:
            raise HTTPException(
                status_code=400,
                detail="No speech detected. Audio may be silent or contain only noise. Please record again with clear speech.",
            )

        # Timestamps are in samples (return_seconds=False), so compare in samples.
        total_speech_samples = sum(ts["end"] - ts["start"] for ts in speech_timestamps)
        min_speech_samples = int(MIN_SPEECH_DURATION_SEC * SILERO_SAMPLING_RATE)
        if total_speech_samples < min_speech_samples:
            raise HTTPException(
                status_code=400,
                detail="No meaningful speech detected. Audio may be silent or contain only noise. Please record again with clear speech.",
            )
    except HTTPException:
        # Validation verdicts raised above pass through unchanged.
        raise
    except Exception as e:
        # Anything else (model load, decoding, VAD internals) is unexpected.
        logger.warning("Silero VAD validation failed: %s", e)
        raise HTTPException(
            status_code=500,
            detail="Audio could not be validated. Please ensure the file is a valid audio recording.",
        ) from e
def _zaban_lang_to_code(lang: str, mapping=None) -> str:
    """Normalize a Zaban language tag to a short language code.

    Resolution order:
      1. empty, ``None`` or whitespace-only input -> ``"en"``;
      2. exact match in the table (``hin_Deva`` -> ``hi``);
      3. match on the language part alone, ignoring case and whatever follows
         the first ``_`` or ``-`` (``ben_Latn``, ``KAN`` -> ``bn``, ``kn``);
      4. tags of at most three characters are returned lower-cased;
      5. otherwise the first two letters of the language part, or ``"en"``
         when that part is shorter than two characters.

    Args:
        lang: Language tag as reported by the STT service.
        mapping: Optional tag -> short-code table; defaults to
            ``ZABAN_LANG_TO_CODE``.

    Returns:
        The short language code.
    """
    if not lang:
        return "en"
    key = lang.strip()
    if not key:
        return "en"

    table = ZABAN_LANG_TO_CODE if mapping is None else mapping
    if key in table:
        return table[key]

    # Language part of the tag, e.g. "ben" for "ben_Latn" or "en" for "en-IN".
    subtag = key.replace("-", "_").split("_", 1)[0].lower()

    # Prefer the table's own code for a known language over blind truncation,
    # which would turn "ben" into "be" or "kan" into "ka".
    for tag, code in table.items():
        if tag.split("_", 1)[0].lower() == subtag:
            return code

    if len(key) <= 3:
        return key.lower()
    return subtag[:2] if len(subtag) >= 2 else "en"
Do not translitarate, translate to English words, do not mix other language words") - return result''' - if temp_file_path: - with open(temp_file_path, "rb") as audio_file: - response = client.speech_to_text.translate( - file=audio_file, - model="saaras:v2.5", - prompt="Voice Banking" - ) - else: - repsonse = "Unclear command" - return response + validate_uploaded_audio(temp_file_path, content) + + if not temp_file_path: + return (None, [None, "Unclear command"], [None, "en"], None) + + url = f"{zaban_base_url.rstrip('/')}{ZABAN_API_PATH_STT}" + headers = {} + if zaban_api_key: + headers["X-API-Key"] = zaban_api_key + with open(temp_file_path, "rb") as audio_file: + files = {"audio": (upload_file.filename or "audio.wav", audio_file, "audio/wav")} + data = {"model": ZABAN_STT_MODEL} + r = requests.post(url, files=files, data=data, headers=headers, timeout=60) + r.raise_for_status() + result = r.json() + text = result.get("text", "").strip() or "Unclear command" + raw_lang = result.get("language", "en") + lang_code = _zaban_lang_to_code(raw_lang) + # main.py expects: id, response, lang, dia = ...; response[1] = text; lang[1] = language + return (None, [None, text], [None, lang_code], None) + except HTTPException: + raise # Let validation errors (empty/silence) propagate with original status_code and detail + except requests.RequestException as e: + logger.error(f"Zaban STT request failed: {str(e)}") + raise HTTPException( + status_code=502, + detail=f"Speech-to-text failed: {str(e)}" + ) except Exception as e: logger.error(f"Translation from upload failed: {str(e)}") raise HTTPException( diff --git a/service/config.py b/service/config.py index 0e6dcda..c7d4c01 100644 --- a/service/config.py +++ b/service/config.py @@ -18,7 +18,10 @@ db_port = os.getenv("DB_PORT") db_name = os.getenv("DB_NAME") -sarvam_api_key = os.getenv("SARVAM_API_KEY","") +# Zaban STT/TTS API (replaces Sarvam for speech). Use https to avoid redirect (POST→GET causes 405). 
# Silero VAD: minimum speech probability passed as `threshold` to the detector.
# NOTE(review): the original note said "-20 interpreted as 0.2" - confirm the
# intended source of this value.
SILERO_VAD_THRESHOLD = 0.2
MIN_AUDIO_SIZE_BYTES = 100  # bytes
MIN_SPEECH_DURATION_SEC = 2  # seconds
SILERO_SAMPLING_RATE = 16000  # Hz (samples per second)
SILERO_VAD_REPO = "snakers4/silero-vad"
SILERO_VAD_MODEL = "silero_vad"

# Zaban API path segments (base URL comes from config) and the STT model name.
ZABAN_API_PATH_STT = "/api/v1/stt"
ZABAN_API_PATH_TRANSLATE = "/api/v1/translate"
ZABAN_STT_MODEL = "whisper"

# Single source of truth: Zaban language tag -> short code. Consumers that
# need the reverse direction (short code -> Zaban tag) should derive it by
# inverting this dict, so keep the short codes unique.
ZABAN_LANG_TO_CODE = {
    "hin_Deva": "hi", "eng_Latn": "en", "ben_Beng": "bn", "tam_Taml": "ta",
    "tel_Telu": "te", "mar_Deva": "mr", "mal_Mlym": "ml", "kan_Knda": "kn",
    "guj_Gujr": "gu", "pan_Guru": "pa", "ory_Orya": "or", "urd_Arab": "ur",
    "san_Deva": "sa", "asm_Beng": "as",
}

# Short language code -> display name. Covers every code ZABAN_LANG_TO_CODE
# can produce, plus other languages.
LANG_MAP = {
    "en": "English", "hi": "Hindi", "bn": "Bengali", "ta": "Tamil", "te": "Telugu",
    "mr": "Marathi", "ml": "Malayalam", "kn": "Kannada", "gu": "Gujarati", "pa": "Punjabi",
    "or": "Odia", "ur": "Urdu", "sa": "Sanskrit", "as": "Assamese", "ar": "Arabic",
    "fr": "French", "de": "German", "es": "Spanish", "it": "Italian", "pt": "Portuguese",
    "zh": "Chinese", "ja": "Japanese", "ko": "Korean", "ru": "Russian", "sv": "Swedish",
    "pl": "Polish", "tr": "Turkish", "cs": "Czech", "fi": "Finnish", "he": "Hebrew",
}
def translate(message: str, lang_code: str = "en") -> str:
    """Translate ``message`` from English into ``lang_code`` via the Zaban Translation API.

    ``lang_code`` may be a short code (``hi``), a locale-style code
    (``hi-IN`` or ``hi_IN``) or a full Zaban tag (``hin_Deva``).

    Args:
        message: English text to translate.
        lang_code: Target language; ``None`` or empty means English.

    Returns:
        The translated text. The original ``message`` is returned unchanged
        when the target is English, the target is not supported, no API key
        is configured, the request fails, or the reply carries no translated
        text.
    """
    raw = (lang_code or "en").strip()
    # Accept both "-" and "_" as the locale separator.
    short = raw.replace("_", "-").split("-")[0].strip().lower()
    if short == "en":
        return message

    # A full Zaban tag (e.g. "hin_Deva") maps straight to its short code.
    if raw in ZABAN_LANG_TO_CODE:
        short = ZABAN_LANG_TO_CODE[raw]
        if short == "en":
            return message

    target_bcp = LANG_CODE_TO_BCP47.get(short)
    if not target_bcp:
        logger.warning(f"Unsupported translation target: {lang_code}, returning original")
        return message
    if not zaban_api_key:
        logger.warning("ZABAN_API_KEY not set; translation requires it. Returning original.")
        return message

    try:
        url = f"{zaban_base_url.rstrip('/')}{ZABAN_API_PATH_TRANSLATE}"
        headers = {
            "Content-Type": "application/json",
            "X-API-Key": zaban_api_key,
        }
        payload = {
            "text": message,
            "source_lang": "eng_Latn",
            "target_lang": target_bcp,
        }
        r = requests.post(url, json=payload, headers=headers, timeout=30)
        r.raise_for_status()
        result = r.json()
        # Guard against a reply that is valid JSON but not an object.
        translated = result.get("translated_text") if isinstance(result, dict) else None
        out = (translated or "").strip()
        return out if out else message
    except requests.RequestException as e:
        logger.error("Zaban translation failed: %s", e)
        return message
    except Exception as e:
        # Translation is best-effort: never let it break the caller's reply.
        logger.error("Error during translation: %s", e)
        return message
"""Example: translate an English message to Hindi with detect_intent.translate.

detect_intent.translate calls the Zaban Translation API and returns the input
text unchanged when ZABAN_API_KEY is not set or the request fails. No Sarvam.
"""
from detect_intent import translate

# Sample message sent to the Zaban Translation API through translate().
message = (
    "Please confirm the transaction ₹10.00 to Suresh Patil by entering the OTP "
    "you have received on your registered mobile number"
)

# Run the network call only when executed as a script, not on import.
if __name__ == "__main__":
    response = translate(message, "hi")
    print(response)

# For TTS (text-to-speech), call Zaban TTS API:
# POST {ZABAN_BASE_URL}/api/v1/tts with X-API-Key, JSON body: {"text": "...", "language": "hi"}
# Returns WAV bytes.