Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions service/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,7 @@ DB_PASSWORD=password
DB_HOST=localhost
DB_PORT=5432
DB_NAME=lingo

# Zaban STT/TTS API (optional; used for upload STT and TTS)
ZABAN_BASE_URL=http://localhost:8000
ZABAN_API_KEY=
2 changes: 1 addition & 1 deletion service/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM python:3.9-slim
WORKDIR /lingo/service

RUN apt-get update && \
apt-get install -y --no-install-recommends ffmpeg gcc build-essential && \
apt-get install -y --no-install-recommends ffmpeg sox gcc build-essential && \
rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
Expand Down
200 changes: 182 additions & 18 deletions service/audio_service.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import numpy as np
from scipy import misc
from scipy.io import wavfile
from scipy.signal import resample


# @misc{lintoai2023whispertimestamped,
Expand Down Expand Up @@ -33,19 +36,150 @@
from fastapi import HTTPException, UploadFile
import openai
from dotenv import load_dotenv
from config import openai_api_key, model_id, model_path
from config import openai_api_key, model_id, model_path, zaban_base_url, zaban_api_key
from constants import (
ZABAN_LANG_TO_CODE,
ZABAN_API_PATH_STT,
ZABAN_STT_MODEL,
SILERO_VAD_THRESHOLD,
SILERO_VAD_REPO,
SILERO_VAD_MODEL,
MIN_AUDIO_SIZE_BYTES,
MIN_SPEECH_DURATION_SEC,
SILERO_SAMPLING_RATE,
)
from load_model import load_model
import logging
import whisper_timestamped as whisper_ts
import requests
from urllib.parse import urlparse
import tempfile
import os
from detect_intent import client

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

_vad_model = None
_vad_utils = None


def _get_silero_vad():
    """Return the cached Silero VAD ``(model, utils)`` pair, loading it on first use.

    The first call fetches the model through ``torch.hub.load`` using
    ``SILERO_VAD_REPO`` / ``SILERO_VAD_MODEL`` and stores the result in the
    module-level ``_vad_model`` / ``_vad_utils`` globals. Subsequent calls
    return those cached objects without touching the hub again.
    ``torch`` is imported lazily, only when a load is actually needed.
    """
    global _vad_model, _vad_utils
    if _vad_model is not None:
        # Already loaded by an earlier call.
        return _vad_model, _vad_utils

    import torch

    loaded_model, loaded_utils = torch.hub.load(
        repo_or_dir=SILERO_VAD_REPO,
        model=SILERO_VAD_MODEL,
        force_reload=False,
        trust_repo=True,
    )
    _vad_model, _vad_utils = loaded_model, loaded_utils
    return _vad_model, _vad_utils


def _load_wav_fallback(path: str, target_sr: int = SILERO_SAMPLING_RATE):
    """Load a WAV file with scipy as a fallback when Silero's ``read_audio`` fails.

    The caller uses this when ``read_audio`` raises ``OSError``/``RuntimeError``
    (presumably because the torchaudio backend cannot open the file), so that
    the empty/silence validation can still run for plain WAV uploads.

    The samples are normalized to floats in [-1, 1], down-mixed to mono and
    resampled to ``target_sr`` so the result matches the input the VAD call
    is given elsewhere in this module (1-D float tensor at ``target_sr``).

    Args:
        path: Filesystem path of the WAV file.
        target_sr: Sample rate (Hz) of the returned audio.

    Returns:
        A 1-D ``torch.float32`` tensor, or ``None`` when the file cannot be
        read, is empty, has an unsupported sample format or is too short to
        yield a single sample at ``target_sr``.
    """
    import torch

    try:
        sr, data = wavfile.read(path)
    except Exception:
        # Best-effort fallback: any parse failure simply means "not loadable".
        return None
    if data is None or data.size == 0:
        return None
    if sr <= 0:
        # Malformed header; the resampling ratio below would be undefined.
        return None

    if data.dtype == np.uint8:
        # 8-bit PCM is unsigned with its midpoint (silence) at 128.
        data = (data.astype(np.float32) - 128.0) / 128.0
    elif data.dtype == np.int16:
        data = data.astype(np.float32) / 32768.0
    elif data.dtype == np.int32:
        data = data.astype(np.float32) / 2147483648.0
    elif data.dtype != np.float32 and data.dtype != np.float64:
        return None

    if data.ndim > 1:
        # scipy returns (samples, channels); average the channels to mono.
        data = data.mean(axis=1)

    if sr != target_sr:
        num_samples = int(len(data) * target_sr / sr)
        if num_samples <= 0:
            # Clip is too short to produce any sample at the target rate.
            return None
        data = resample(data, num_samples).astype(np.float32)

    return torch.from_numpy(data).float()


def validate_uploaded_audio(temp_file_path: str, content: bytes) -> None:
    """Reject uploads that are empty or contain no usable speech.

    The raw ``content`` must be at least ``MIN_AUDIO_SIZE_BYTES`` long. The
    file at ``temp_file_path`` is then decoded (Silero's ``read_audio`` first,
    the scipy WAV fallback if that raises ``OSError``/``RuntimeError``) and run
    through Silero VAD with ``SILERO_VAD_THRESHOLD``. The detected speech must
    total at least ``MIN_SPEECH_DURATION_SEC`` seconds.

    Args:
        temp_file_path: Path of the uploaded audio saved on disk.
        content: Raw bytes of the upload (used only for the size check).

    Raises:
        HTTPException: With status 500 for every validation failure, and
            also when the VAD machinery itself fails unexpectedly.
    """

    def _rejection(detail: str) -> HTTPException:
        # All failures raised by this function share the same status code.
        return HTTPException(status_code=500, detail=detail)

    if not content or len(content) < MIN_AUDIO_SIZE_BYTES:
        raise _rejection(
            "Empty or invalid audio file. Please upload a non-empty audio file."
        )

    try:
        import torch

        model, utils = _get_silero_vad()
        # Positions in the utils tuple as used by this module:
        # [0] speech-timestamp detector, [2] audio reader.
        detect_speech = utils[0]
        load_audio = utils[2]

        try:
            waveform = load_audio(temp_file_path, sampling_rate=SILERO_SAMPLING_RATE)
        except (OSError, RuntimeError) as read_err:
            logger.info("Silero read_audio failed (%s), trying scipy WAV fallback", read_err)
            waveform = _load_wav_fallback(temp_file_path)

        has_no_samples = hasattr(waveform, "numel") and waveform.numel() == 0
        if waveform is None or has_no_samples:
            raise _rejection("Audio file could not be read or is empty.")

        # The VAD call below is given a 1-D tensor with at least 512 samples.
        if torch.is_tensor(waveform):
            if waveform.dim() > 1:
                waveform = waveform.squeeze()
            if waveform.numel() < 512:
                raise _rejection(
                    "Audio is too short to analyze. Please upload a longer recording."
                )

        segments = detect_speech(
            waveform,
            model,
            threshold=SILERO_VAD_THRESHOLD,
            sampling_rate=SILERO_SAMPLING_RATE,
            return_seconds=False,
        )
        if not segments:
            raise _rejection(
                "No speech detected. Audio may be silent or contain only noise. Please record again with clear speech."
            )

        # Segment boundaries are sample indices because return_seconds=False.
        spoken_samples = 0
        for segment in segments:
            spoken_samples += segment["end"] - segment["start"]

        required_samples = int(MIN_SPEECH_DURATION_SEC * SILERO_SAMPLING_RATE)
        if spoken_samples < required_samples:
            raise _rejection(
                "No meaningful speech detected. Audio may be silent or contain only noise. Please record again with clear speech."
            )
    except HTTPException:
        # Validation failures raised above pass through untouched.
        raise
    except Exception as e:
        logger.warning(f"Silero VAD validation failed: {e}")
        raise _rejection(
            "Audio could not be validated. Please ensure the file is a valid audio recording."
        )

# Load environment variables
load_dotenv()
openai.api_key = openai_api_key
Expand Down Expand Up @@ -144,9 +278,25 @@ def translate_with_whisper_timestamped(audioPath):
detail=f"Translation failed: {str(e)}"
)

def _zaban_lang_to_code(lang: str) -> str:
    """Normalize a Zaban language tag to a short code (e.g. ``hin_Deva`` -> ``hi``).

    Resolution order:
      1. Empty, ``None`` or whitespace-only input -> ``"en"``.
      2. Exact match in ``ZABAN_LANG_TO_CODE``.
      3. Match on the language part alone (text before ``_``), so a bare
         ``hin`` or an unlisted script variant such as ``hin_Latn`` still
         resolves to the code registered for that language.
      4. Tags of at most three characters are returned unchanged
         (assumed to already be short codes).
      5. Otherwise the first two characters of the language part, falling
         back to ``"en"`` when that part is shorter than two characters.

    Args:
        lang: Language tag as returned by the STT service.

    Returns:
        The short language code; never an empty string.
    """
    key = (lang or "").strip()
    if not key:
        # Strip before the emptiness check so "   " cannot fall through as "".
        return "en"
    if key in ZABAN_LANG_TO_CODE:
        return ZABAN_LANG_TO_CODE[key]

    # Language part of a "<language>_<script>" tag (e.g. hin_Deva -> hin).
    language_part = key.split("_")[0]
    for zaban_tag, short_code in ZABAN_LANG_TO_CODE.items():
        if zaban_tag.split("_")[0] == language_part:
            return short_code

    if len(key) <= 3:
        return key
    # Unknown language: heuristically use the first two characters of the
    # language part (e.g. fra_Latn -> fr). This is a guess and does not
    # produce a correct code for every language.
    if len(language_part) >= 2:
        return language_part[:2]
    return "en"


def translate_with_whisper_from_upload(upload_file: UploadFile):
"""Translate uploaded audio file to English language using whisper model (without timestamps)."""
logger.info("Translation from upload started")
"""Transcribe uploaded audio via Zaban STT. Returns (id, [_, text], [_, lang_code], _) for main.py compatibility."""
logger.info("STT from upload started (Zaban)")
temp_file_path = None
try:
# Create a temporary file with the original file extension
Expand All @@ -160,20 +310,34 @@ def translate_with_whisper_from_upload(upload_file: UploadFile):
temp_file.write(content)
temp_file.flush()

'''options = dict(beam_size=5, best_of=5)
translate_options = dict(task="translate", **options)
result = model.transcribe(temp_file_path, **translate_options,prompt="Only Indian langues,like, hindi, marthi,tamil,gujarti,telegu,bengali,panjabi,bengali,malayalam,kannada or Indian english voice is used as voice banking service. voice will be like, check balance, pay money to some Indian names, list of beneficiaries, transactions list or ask for transaction insights. Do not translitarate, translate to English words, do not mix other language words")
return result'''
if temp_file_path:
with open(temp_file_path, "rb") as audio_file:
response = client.speech_to_text.translate(
file=audio_file,
model="saaras:v2.5",
prompt="Voice Banking"
)
else:
repsonse = "Unclear command"
return response
validate_uploaded_audio(temp_file_path, content)

if not temp_file_path:
return (None, [None, "Unclear command"], [None, "en"], None)

url = f"{zaban_base_url.rstrip('/')}{ZABAN_API_PATH_STT}"
headers = {}
if zaban_api_key:
headers["X-API-Key"] = zaban_api_key
with open(temp_file_path, "rb") as audio_file:
files = {"audio": (upload_file.filename or "audio.wav", audio_file, "audio/wav")}
data = {"model": ZABAN_STT_MODEL}
r = requests.post(url, files=files, data=data, headers=headers, timeout=60)
r.raise_for_status()
result = r.json()
text = result.get("text", "").strip() or "Unclear command"
raw_lang = result.get("language", "en")
lang_code = _zaban_lang_to_code(raw_lang)
# main.py expects: id, response, lang, dia = ...; response[1] = text; lang[1] = language
return (None, [None, text], [None, lang_code], None)
except HTTPException:
raise # Let validation errors (empty/silence) propagate with original status_code and detail
except requests.RequestException as e:
logger.error(f"Zaban STT request failed: {str(e)}")
raise HTTPException(
status_code=502,
detail=f"Speech-to-text failed: {str(e)}"
)
except Exception as e:
logger.error(f"Translation from upload failed: {str(e)}")
raise HTTPException(
Expand Down
5 changes: 4 additions & 1 deletion service/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@
db_port = os.getenv("DB_PORT")
db_name = os.getenv("DB_NAME")

sarvam_api_key = os.getenv("SARVAM_API_KEY","")
# Zaban STT/TTS API (replaces Sarvam for speech). Use https to avoid redirect (POST→GET causes 405).
zaban_base_url = os.getenv("ZABAN_BASE_URL", "")
zaban_api_key = os.getenv("ZABAN_API_KEY", "")

# Redis configuration
redis_host = os.getenv("REDIS_HOST", "localhost")
redis_port = int(os.getenv("REDIS_PORT", 6379))
Expand Down
30 changes: 30 additions & 0 deletions service/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Silero VAD speech-probability threshold passed to get_speech_timestamps.
# NOTE(review): the requirement was apparently stated as "-20" and interpreted
# here as 0.2 — confirm this is the intended value.
SILERO_VAD_THRESHOLD = 0.2
MIN_AUDIO_SIZE_BYTES = 100  # bytes; smaller uploads are rejected as empty
MIN_SPEECH_DURATION_SEC = 2  # seconds of detected speech required
SILERO_SAMPLING_RATE = 16000  # Hz (samples per second)
SILERO_VAD_REPO = "snakers4/silero-vad"
SILERO_VAD_MODEL = "silero_vad"

# Zaban API path segments (appended to the base URL from config) and the
# model name sent with STT requests.
ZABAN_API_PATH_STT = "/api/v1/stt"
ZABAN_API_PATH_TRANSLATE = "/api/v1/translate"
ZABAN_STT_MODEL = "whisper"

# Single source of truth: Zaban language tag ("<language>_<script>") -> short
# code. The reverse mapping (short code -> Zaban tag) is derived from this
# dict by the code that needs it, so the values must stay unique.
ZABAN_LANG_TO_CODE = {
    "hin_Deva": "hi", "eng_Latn": "en", "ben_Beng": "bn", "tam_Taml": "ta",
    "tel_Telu": "te", "mar_Deva": "mr", "mal_Mlym": "ml", "kan_Knda": "kn",
    "guj_Gujr": "gu", "pan_Guru": "pa", "ory_Orya": "or", "urd_Arab": "ur",
    "san_Deva": "sa", "asm_Beng": "as",
}

# Short language code -> display name. Every short code produced by
# ZABAN_LANG_TO_CODE has an entry here.
LANG_MAP = {
    "en": "English", "hi": "Hindi", "bn": "Bengali", "ta": "Tamil", "te": "Telugu",
    "mr": "Marathi", "ml": "Malayalam", "kn": "Kannada", "gu": "Gujarati", "pa": "Punjabi",
    "or": "Odia", "ur": "Urdu", "sa": "Sanskrit", "as": "Assamese", "ar": "Arabic",
    "fr": "French", "de": "German", "es": "Spanish", "it": "Italian", "pt": "Portuguese",
    "zh": "Chinese", "ja": "Japanese", "ko": "Korean", "ru": "Russian", "sv": "Swedish",
    "pl": "Polish", "tr": "Turkish", "cs": "Czech", "fi": "Finnish", "he": "Hebrew",
}
59 changes: 35 additions & 24 deletions service/detect_intent.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,18 @@
import re
import logging
import ollama
from config import ollama_host, ollama_model_name, ollama_translation_model_name, sarvam_api_key
from config import ollama_host, ollama_model_name, zaban_base_url, zaban_api_key
from typing import Dict, Any
from time_utils import normalize_timeframe
import requests
import os
from sarvamai import SarvamAI

from constants import ZABAN_LANG_TO_CODE, ZABAN_API_PATH_TRANSLATE

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
lang_map = {
"en": "English", "hi": "Hindi", "bn": "Bengali", "ta": "Tamil", "te": "Telugu",
"mr": "Marathi", "ml": "Malayalam", "kn": "Kannada", "gu": "Gujarati", "pa": "Punjabi",
"or": "Odia", "ur": "Urdu", "sa": "Sanskrit", "ar": "Arabic", "fr": "French",
"de": "German", "es": "Spanish", "it": "Italian", "pt": "Portuguese", "zh": "Chinese",
"ja": "Japanese", "ko": "Korean", "ru": "Russian", "sv": "Swedish", "pl": "Polish",
"tr": "Turkish", "cs": "Czech", "fi": "Finnish", "he": "Hebrew"
}
# Derived from single constant in constants to avoid mismatches.
LANG_CODE_TO_BCP47 = {v: k for k, v in ZABAN_LANG_TO_CODE.items()}
ALLOWED_INTENTS = ["check_balance", "recent_txn", "transfer_money", "txn_insights", "list_beneficiaries", "unknown"]

SYSTEM = """
Expand Down Expand Up @@ -99,9 +94,7 @@

"""

client = SarvamAI(
api_subscription_key=sarvam_api_key,
)

def safe_json_parse(s: str) -> Dict[str, Any]:
# Try direct parse
try:
Expand Down Expand Up @@ -174,19 +167,37 @@ def validate_schema(result: dict) -> dict:
"language": language,
"confidence": confidence,
}
def translate(message:str, lang_code: str = "en"):
#lang_code = lang_map.get(lang_code,"English")
logger.info(f"Model: {ollama_translation_model_name}, language: {lang_code}")
if lang_code == "en-IN":
def translate(message: str, lang_code: str = "en") -> str:
"""Translate message from English to target language using Zaban Translation API. No-op for English."""
short = (lang_code or "en").split("-")[0].strip().lower()
if short == "en":
return message
target_bcp = LANG_CODE_TO_BCP47.get(short)
if not target_bcp:
logger.warning(f"Unsupported translation target: {lang_code}, returning original")
return message
if not zaban_api_key:
logger.warning("ZABAN_API_KEY not set; translation requires it. Returning original.")
return message
try:
id,response,lang = client.text.translate(
input=message,
source_language_code="en-IN",
target_language_code=f"{lang_code}",
speaker_gender="Female"
)
return response[1]
url = f"{zaban_base_url.rstrip('/')}{ZABAN_API_PATH_TRANSLATE}"
headers = {
"Content-Type": "application/json",
"X-API-Key": zaban_api_key,
}
payload = {
"text": message,
"source_lang": "eng_Latn",
"target_lang": target_bcp,
}
r = requests.post(url, json=payload, headers=headers, timeout=30)
r.raise_for_status()
result = r.json()
out = (result.get("translated_text") or "").strip()
return out if out else message
except requests.RequestException as e:
logger.error(f"Zaban translation failed: {str(e)}")
return message
except Exception as e:
logger.error(f"Error during translation: {str(e)}")
return message
Expand Down
3 changes: 1 addition & 2 deletions service/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,4 @@ sentencepiece
ninja
sqlalchemy
psycopg2-binary
redis
sarvamai
redis
Loading