Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 16 additions & 9 deletions zaban_backend/app/routes/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from ..services.ai4bharat import Ai4BharatClient
from ..services.language_detection import get_language_detector
from ..services.constants import WHISPER_TO_BCP47
from ..services.constants import WHISPER_TO_BCP47, WHISPER_BEAM_SIZE, WHISPER_BEST_OF


router = APIRouter()
Expand Down Expand Up @@ -360,15 +360,22 @@ async def stt(
temp_file_path = temp_file.name

try:
# Transcribe
result = whisper_model.transcribe(
temp_file_path,
language=lang[:2] if lang and len(lang) >= 2 else None,
task="transcribe"
)
# Transcribe. When no language is supplied (auto-detection), pass
# condition_on_previous_text=False, temperature=0, beam_size and best_of
# for stable, deterministic language decoding.
lang_arg = lang[:2] if lang and len(lang) >= 2 else None
transcribe_kw = {
"language": lang_arg,
"task": "transcribe",
}
if lang_arg is None:
transcribe_kw["condition_on_previous_text"] = False
transcribe_kw["temperature"] = 0
transcribe_kw["beam_size"] = WHISPER_BEAM_SIZE
transcribe_kw["best_of"] = WHISPER_BEST_OF
result = whisper_model.transcribe(temp_file_path, **transcribe_kw)

# Get detected language
detected_lang = result.get("language", lang or "unknown")
# Get detected language from decoding result (same as reference: getattr(info, "language", None) or "en")
detected_lang = result.get("language") or lang or "en"
detected_prob = None

# Map to BCP-47
Expand Down
10 changes: 10 additions & 0 deletions zaban_backend/app/services/faster_whisper_stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,15 @@ async def transcribe(
"task": task,
"verbose": False,
}
# Improve auto language detection accuracy (same approach as reference + best practices):
# - Do not condition on previous text so detection is not biased by prior context.
# - Use temperature=0 for deterministic decoding so language detection is stable.
# - Use beam search when auto-detecting for more consistent language decoding.
if lang_arg is None:
transcribe_kw["condition_on_previous_text"] = False
transcribe_kw["temperature"] = 0
transcribe_kw["beam_size"] = WHISPER_BEAM_SIZE
transcribe_kw["best_of"] = WHISPER_BEST_OF
# Add beam search options for translation to improve quality
if translate_to_english:
transcribe_kw["beam_size"] = WHISPER_BEAM_SIZE
Expand Down Expand Up @@ -219,6 +228,7 @@ async def transcribe(
for s in raw_segments
]
full_text = (result.get("text") or "").strip()
# Language from decoding result (same as reference: getattr(info, "language", None) or "en")
detected_lang = result.get("language") or lang_arg or "en"
bcp47_lang = self.WHISPER_TO_BCP47.get(detected_lang, f"{detected_lang}_Latn")

Expand Down