50 changes: 50 additions & 0 deletions daras_ai_v2/asr.py
@@ -110,6 +110,15 @@
"uz": "uzbek", "vi": "vietnamese", "cy": "welsh", "yi": "yiddish", "yo": "yoruba",
} # fmt: skip

GEMINI_SUPPORTED = {
"af", "sq", "am", "ar", "hy", "as", "az", "eu", "be", "bn", "bs", "bg", "ca", "ceb", "zh", "co", "hr", "cs", "da",
"dv", "nl", "en", "eo", "et", "fil", "fi", "fr", "fy", "gl", "ka", "de", "el", "gu", "ht", "ha", "haw", "iw", "hi",
"hmn", "hu", "is", "ig", "id", "ga", "it", "ja", "jv", "kn", "kk", "km", "ko", "kri", "ku", "ky", "lo", "la", "lv",
"lt", "lb", "mk", "mg", "ms", "ml", "mt", "mi", "mr", "mni-Mtei", "mn", "my", "ne", "no", "ny", "or", "ps", "fa",
"pl", "pt", "pa", "ro", "ru", "sm", "gd", "sr", "st", "sn", "sd", "si", "sk", "sl", "so", "es", "su", "sw", "sv",
"tg", "ta", "te", "th", "tr", "uk", "ur", "ug", "uz", "vi", "cy", "xh", "yi", "yo", "zu"
} # fmt: skip

# https://huggingface.co/facebook/seamless-m4t-v2-large#supported-languages
# For now, below are listed the languages that support ASR. Note that Seamless only accepts ISO 639-3 codes.
SEAMLESS_v2_ASR_SUPPORTED = {
@@ -272,6 +281,9 @@ class AsrModels(Enum):
whisper_large_v3 = "Whisper Large v3 (openai)"
gpt_4_o_audio = "GPT-4o (openai)"
gpt_4_o_mini_audio = "GPT-4o mini (openai)"
gemini_2_5_flash_lite = "Gemini 2.5 Flash Lite (Google)"
gemini_2_5_flash = "Gemini 2.5 Flash (Google)"
gemini_2_5_pro = "Gemini 2.5 Pro (Google)"
gcp_v1 = "Google Cloud V1"
usm = "Chirp / USM (Google V2)"
deepgram = "Deepgram"
@@ -336,6 +348,9 @@ def supports_input_prompt(self) -> bool:


asr_model_ids = {
AsrModels.gemini_2_5_flash_lite: "gemini-2.5-flash-lite",
AsrModels.gemini_2_5_flash: "gemini-2.5-flash",
AsrModels.gemini_2_5_pro: "gemini-2.5-pro",
AsrModels.gpt_4_o_audio: "gpt-4o-transcribe",
AsrModels.gpt_4_o_mini_audio: "gpt-4o-mini-transcribe",
AsrModels.whisper_large_v3: "vaibhavs10/incredibly-fast-whisper:3ab86df6c8f54c11309d4d1f930ac292bad43ace52d10c80d87eb258b3c9f79c",
@@ -365,6 +380,9 @@ def supports_input_prompt(self) -> bool:
}

asr_supported_languages = {
AsrModels.gemini_2_5_flash_lite: GEMINI_SUPPORTED,
AsrModels.gemini_2_5_flash: GEMINI_SUPPORTED,
AsrModels.gemini_2_5_pro: GEMINI_SUPPORTED,
AsrModels.whisper_large_v3: WHISPER_LARGE_V3_SUPPORTED,
AsrModels.gpt_4_o_audio: WHISPER_LARGE_V2_SUPPORTED, # https://platform.openai.com/docs/guides/speech-to-text#supported-languages
AsrModels.gpt_4_o_mini_audio: WHISPER_LARGE_V2_SUPPORTED,
@@ -1284,6 +1302,38 @@ def run_asr(
prompt=input_prompt,
response_format="text",
)
    elif selected_model in {
        AsrModels.gemini_2_5_flash_lite,
        AsrModels.gemini_2_5_flash,
        AsrModels.gemini_2_5_pro,
    }:
        from daras_ai_v2.language_model import CHATML_ROLE_USER, call_gemini_api

        if language:
            lobj = langcodes.Language.get(language.strip())
            prompt = f"Transcribe this audio without translation. The spoken language is {lobj.display_name()}."
        else:
            prompt = "Transcribe this audio."

        return call_gemini_api(
            model_id=asr_model_ids[selected_model],
            contents=[
                {
                    "role": CHATML_ROLE_USER,
                    "parts": [
                        {
                            "fileData": {
                                "fileUri": audio_url,
                                "mimeType": "audio/wav",
                            }
                        },
                        {"text": prompt},
                    ],
                }
            ],
            max_output_tokens=16384,
            temperature=0.0,
        )
Comment on lines +1305 to +1336

🛠️ Refactor suggestion

⚠️ Potential issue

Vertex fileUri likely needs a GCS URI; also add translation support.

  • Vertex fileData.fileUri usually expects a GCS URI. Use gs_url_to_uri(audio_url) for consistency with image handling.
  • Honor speech_translation_target by adjusting the prompt.

Apply:

@@
-    elif selected_model in {
+    elif selected_model in {
         AsrModels.gemini_2_5_flash_lite,
         AsrModels.gemini_2_5_flash,
         AsrModels.gemini_2_5_pro,
     }:
         from daras_ai_v2.language_model import CHATML_ROLE_USER, call_gemini_api
 
-        if language:
-            lobj = langcodes.Language.get(language.strip())
-            prompt = f"Transcribe this audio without translation. The spoken language is {lobj.display_name()}."
-        else:
-            prompt = "Transcribe this audio."
+        if language:
+            lobj = langcodes.Language.get(language.strip())
+            prompt = f"Transcribe this audio without translation. The spoken language is {lobj.display_name()}."
+        else:
+            prompt = "Transcribe this audio."
+        if speech_translation_target:
+            tgt = langcodes.Language.get(speech_translation_target.strip())
+            prompt = f"Transcribe this audio and then translate the transcript into {tgt.display_name()}. Return only the translated text."
 
         return call_gemini_api(
             model_id=asr_model_ids[selected_model],
             contents=[
                 {
                     "role": CHATML_ROLE_USER,
                     "parts": [
                         {
                             "fileData": {
-                                "fileUri": audio_url,
+                                "fileUri": gs_url_to_uri(audio_url),
                                 "mimeType": "audio/wav",
                             }
                         },
                         {"text": prompt},
                     ],
                 }
             ],
             max_output_tokens=16384,
             temperature=0.0,
         )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
    elif selected_model in {
        AsrModels.gemini_2_5_flash_lite,
        AsrModels.gemini_2_5_flash,
        AsrModels.gemini_2_5_pro,
    }:
        from daras_ai_v2.language_model import CHATML_ROLE_USER, call_gemini_api

        if language:
            lobj = langcodes.Language.get(language.strip())
            prompt = f"Transcribe this audio without translation. The spoken language is {lobj.display_name()}."
        else:
            prompt = "Transcribe this audio."
        if speech_translation_target:
            tgt = langcodes.Language.get(speech_translation_target.strip())
            prompt = f"Transcribe this audio and then translate the transcript into {tgt.display_name()}. Return only the translated text."

        return call_gemini_api(
            model_id=asr_model_ids[selected_model],
            contents=[
                {
                    "role": CHATML_ROLE_USER,
                    "parts": [
                        {
                            "fileData": {
                                "fileUri": gs_url_to_uri(audio_url),
                                "mimeType": "audio/wav",
                            }
                        },
                        {"text": prompt},
                    ],
                }
            ],
            max_output_tokens=16384,
            temperature=0.0,
        )
🤖 Prompt for AI Agents
daras_ai_v2/asr.py around lines 1305 to 1336: Vertex's fileData.fileUri should
be a GCS URI and the prompt must honor speech_translation_target; replace
audio_url with gs_url_to_uri(audio_url) when building fileUri (import
gs_url_to_uri if not already imported) and adjust the prompt construction to
include translation when speech_translation_target is set (e.g., if
speech_translation_target: prompt should instruct to translate the audio to that
language, otherwise keep "Transcribe this audio" or "Transcribe this audio
without translation" depending on current behavior); ensure the call_gemini_api
contents use the converted GCS URI and that speech_translation_target is
referenced safely (strip/validate) when composing the prompt.
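
For context, gs_url_to_uri is referenced by the review but its implementation is not shown in this diff. A minimal sketch of what such a conversion could look like, assuming the helper only rewrites public storage.googleapis.com URLs into the gs:// form that Vertex AI fileData.fileUri parts accept (the function below is an illustration, not the codebase's actual helper):

    from urllib.parse import urlparse

    def gs_url_to_uri_sketch(url: str) -> str:
        # Hypothetical stand-in for gs_url_to_uri: rewrite a public GCS HTTPS URL
        # as a gs:// URI; anything else (already gs://, non-GCS hosts) is returned unchanged.
        parsed = urlparse(url)
        if parsed.scheme == "gs":
            return url
        if parsed.netloc == "storage.googleapis.com":
            # https://storage.googleapis.com/<bucket>/<object> -> gs://<bucket>/<object>
            return "gs://" + parsed.path.lstrip("/")
        return url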

# call one of the self-hosted models
else:
kwargs = {}
6 changes: 3 additions & 3 deletions daras_ai_v2/language_model.py
@@ -2118,7 +2118,7 @@ def _run_gemini_pro(
"parts": [{"text": "OK"}],
},
)
- msg = _call_gemini_api(
+ msg = call_gemini_api(
model_id=model_id,
contents=contents,
max_output_tokens=max_output_tokens,
@@ -2155,7 +2155,7 @@ def _run_gemini_pro_vision(
}
]
return [
- _call_gemini_api(
+ call_gemini_api(
model_id=model_id,
contents=contents,
max_output_tokens=max_output_tokens,
@@ -2166,7 +2166,7 @@


@retry_if(vertex_ai_should_retry)
- def _call_gemini_api(
+ def call_gemini_api(
*,
model_id: str,
contents: list[dict],
28 changes: 28 additions & 0 deletions scripts/init_llm_pricing.py
@@ -623,6 +623,34 @@ def run():

# Gemini

llm_pricing_create(
model_id="gemini-2.5-flash-lite",
model_name=LargeLanguageModels.gemini_2_5_flash_lite.name,
unit_cost_input=0.1,
unit_cost_output=0.4,
unit_quantity=10**6,
provider=ModelProvider.google,
pricing_url="https://ai.google.dev/gemini-api/docs/pricing#gemini-2.5-flash-lite",
)
llm_pricing_create(
model_id="gemini-2.5-pro",
model_name=LargeLanguageModels.gemini_2_5_pro.name,
unit_cost_input=1.25,
unit_cost_output=10,
unit_quantity=10**6,
provider=ModelProvider.google,
pricing_url="https://ai.google.dev/gemini-api/docs/pricing#gemini-2.5-pro",
)
llm_pricing_create(
model_id="gemini-2.5-flash",
model_name=LargeLanguageModels.gemini_2_5_flash.name,
unit_cost_input=0.30,
unit_cost_output=2.5,
unit_quantity=10**6,
provider=ModelProvider.google,
pricing_url="https://ai.google.dev/gemini-api/docs/pricing#gemini-2.5-flash",
)
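
For reference, each entry above prices tokens per unit_quantity (10**6 here). A minimal sketch of the arithmetic these rows imply, using the gemini-2.5-flash numbers from the block just above (the helper is illustrative and not part of this script):

    def estimate_cost_usd(
        input_tokens: int,
        output_tokens: int,
        unit_cost_input: float,
        unit_cost_output: float,
        unit_quantity: int = 10**6,
    ) -> float:
        # Price each side at its per-unit_quantity rate, then sum.
        return (
            input_tokens * unit_cost_input + output_tokens * unit_cost_output
        ) / unit_quantity

    # e.g. 12_000 input and 800 output tokens on gemini-2.5-flash (0.30 in / 2.5 out
    # per million): 12_000 * 0.30 / 1e6 + 800 * 2.5 / 1e6 = 0.0056
    print(estimate_cost_usd(12_000, 800, unit_cost_input=0.30, unit_cost_output=2.5))
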
# duplicate: because model_id is prefixed with "google/" when using the OpenAI-compatible API
llm_pricing_create(
model_id="google/gemini-2.5-flash-lite",
model_name=LargeLanguageModels.gemini_2_5_flash_lite.name,
Expand Down