diff --git a/zaban_backend/app/routes/v1.py b/zaban_backend/app/routes/v1.py
index e5fe6de..ba93d5c 100644
--- a/zaban_backend/app/routes/v1.py
+++ b/zaban_backend/app/routes/v1.py
@@ -9,7 +9,7 @@
 from ..services.ai4bharat import Ai4BharatClient
 from ..services.language_detection import get_language_detector
-from ..services.constants import WHISPER_TO_BCP47
+from ..services.constants import WHISPER_TO_BCP47, WHISPER_BEAM_SIZE, WHISPER_BEST_OF
 
 router = APIRouter()
 
 
@@ -360,15 +360,22 @@ async def stt(
         temp_file_path = temp_file.name
 
     try:
-        # Transcribe
-        result = whisper_model.transcribe(
-            temp_file_path,
-            language=lang[:2] if lang and len(lang) >= 2 else None,
-            task="transcribe"
-        )
+        # Transcribe with improved auto language detection: condition_on_previous_text=False,
+        # temperature=0, and beam_size for stable, deterministic language decode.
+        lang_arg = lang[:2] if lang and len(lang) >= 2 else None
+        transcribe_kw = {
+            "language": lang_arg,
+            "task": "transcribe",
+        }
+        if lang_arg is None:
+            transcribe_kw["condition_on_previous_text"] = False
+            transcribe_kw["temperature"] = 0
+            transcribe_kw["beam_size"] = WHISPER_BEAM_SIZE
+            transcribe_kw["best_of"] = WHISPER_BEST_OF
+        result = whisper_model.transcribe(temp_file_path, **transcribe_kw)
 
-        # Get detected language
-        detected_lang = result.get("language", lang or "unknown")
+        # Get detected language from decoding result (same as reference: getattr(info, "language", None) or "en")
+        detected_lang = result.get("language") or lang or "en"
         detected_prob = None
 
         # Map to BCP-47
diff --git a/zaban_backend/app/services/faster_whisper_stt.py b/zaban_backend/app/services/faster_whisper_stt.py
index 8d4599c..b82eabc 100644
--- a/zaban_backend/app/services/faster_whisper_stt.py
+++ b/zaban_backend/app/services/faster_whisper_stt.py
@@ -191,6 +191,15 @@ async def transcribe(
             "task": task,
             "verbose": False,
         }
+        # Improve auto language detection accuracy (same approach as reference + best practices):
+        # - Do not condition on previous text so detection is not biased by prior context.
+        # - Use temperature=0 for deterministic decoding so language detection is stable.
+        # - Use beam search when auto-detecting for more consistent language decode.
+        if lang_arg is None:
+            transcribe_kw["condition_on_previous_text"] = False
+            transcribe_kw["temperature"] = 0
+            transcribe_kw["beam_size"] = WHISPER_BEAM_SIZE
+            transcribe_kw["best_of"] = WHISPER_BEST_OF
         # Add beam search options for translation to improve quality
         if translate_to_english:
             transcribe_kw["beam_size"] = WHISPER_BEAM_SIZE
@@ -219,6 +228,7 @@
             for s in raw_segments
         ]
         full_text = (result.get("text") or "").strip()
+        # Language from decoding result (same as reference: getattr(info, "language", None) or "en")
         detected_lang = result.get("language") or lang_arg or "en"
         bcp47_lang = self.WHISPER_TO_BCP47.get(detected_lang, f"{detected_lang}_Latn")
 