@@ -901,16 +901,16 @@ def load_qwen2_audio_model():
     return processor, model


-def process_qwen2_audio(processor, model, audio_file, prompt):
+def process_qwen2_audio(processor, model, input_audio_mm, prompt):
     conversation = [
         {'role': 'system', 'content': 'You are a helpful assistant.'},
         {"role": "user", "content": [
-            {"type": "audio", "audio_url": audio_file},
+            {"type": "audio", "audio_url": input_audio_mm},
             {"type": "text", "text": prompt},
         ]},
     ]
     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-    audio, _ = librosa.load(audio_file, sr=processor.feature_extractor.sampling_rate)
+    audio, _ = librosa.load(input_audio_mm, sr=processor.feature_extractor.sampling_rate)
     inputs = processor(text=text, audios=[audio], return_tensors="pt", padding=True)
     inputs.input_ids = inputs.input_ids.to("cuda")
     generate_ids = model.generate(**inputs, max_length=256)
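
(For reference, a minimal way to exercise the renamed helper outside the UI might look like the sketch below; the audio path and prompt are placeholders, and load_qwen2_audio_model is the loader defined just above this hunk.)

    # Hypothetical usage sketch, not part of this change
    processor, model = load_qwen2_audio_model()
    # input_audio_mm is a path to a local audio file, matching the renamed parameter
    response = process_qwen2_audio(processor, model, "sample.wav", "Describe this audio clip.")
    print(response)
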
@@ -1024,7 +1024,7 @@ def get_languages():
     }


-def generate_text_and_speech(input_text, system_prompt, input_audio, llm_model_type, llm_model_name, llm_lora_model_name, enable_web_search, enable_libretranslate, target_lang, enable_openparse, pdf_file, enable_multimodal, input_image, input_video, enable_tts,
+def generate_text_and_speech(input_text, system_prompt, input_audio, llm_model_type, llm_model_name, llm_lora_model_name, enable_web_search, enable_libretranslate, target_lang, enable_openparse, pdf_file, enable_multimodal, input_image, input_video, input_audio_mm, enable_tts,
                              llm_settings_html, max_new_tokens, max_length, min_length, n_ctx, n_batch, temperature, top_p, min_p, typical_p, top_k,
                              do_sample, early_stopping, stopping, repetition_penalty, frequency_penalty, presence_penalty, length_penalty, no_repeat_ngram_size, num_beams, num_return_sequences, chat_history_format, tts_settings_html, speaker_wav, language, tts_temperature, tts_top_p, tts_top_k, tts_speed, tts_repetition_penalty, tts_length_penalty, output_format):
     global chat_history, chat_dir, tts_model, whisper_model
@@ -1217,7 +1217,7 @@ def image_to_base64_data_uri(image_path):
                 return None, None, "Qwen2-Audio is not supported with llama model type."
             else:
                 try:
-                    response = process_qwen2_audio(processor, model, input_audio, prompt)
+                    response = process_qwen2_audio(processor, model, input_audio_mm, prompt)
                     if not chat_history or chat_history[-1][1] is not None:
                         chat_history.append([prompt, ""])
                     chat_history[-1][1] = response
@@ -8863,6 +8863,7 @@ def reload_interface():
         gr.Checkbox(label=_("Enable Multimodal", lang), value=False),
         gr.Image(label=_("Upload your image (for Multimodal)", lang), type="filepath"),
         gr.Video(label=_("Upload your video (for Multimodal)", lang)),
+        gr.Audio(label=_("Upload your audio (for Multimodal)", lang), type="filepath"),
         gr.Checkbox(label=_("Enable TTS", lang), value=False),
         gr.HTML(_("<h3>LLM Settings</h3>", lang)),
         gr.Slider(minimum=256, maximum=32768, value=512, step=1, label=_("Max tokens", lang)),
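
(The new gr.Audio component is what feeds the input_audio_mm parameter added to generate_text_and_speech above. A stripped-down sketch of that wiring, assuming Gradio's standard Interface API; the handler body is purely illustrative.)

    import gradio as gr

    # Hypothetical handler; the real one is generate_text_and_speech with many more inputs
    def handler(enable_multimodal, input_image, input_video, input_audio_mm):
        # With type="filepath", Gradio passes the uploaded audio as a path string (or None)
        return f"multimodal={enable_multimodal}, audio={input_audio_mm}"

    demo = gr.Interface(
        fn=handler,
        inputs=[
            gr.Checkbox(label="Enable Multimodal", value=False),
            gr.Image(label="Upload your image (for Multimodal)", type="filepath"),
            gr.Video(label="Upload your video (for Multimodal)"),
            gr.Audio(label="Upload your audio (for Multimodal)", type="filepath"),
        ],
        outputs="text",
    )
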
@@ -11490,7 +11491,7 @@ def reload_interface():
     dropdowns_to_update = [
         chat_interface.input_components[4],
         chat_interface.input_components[5],
-        chat_interface.input_components[38],
+        chat_interface.input_components[40],
         tts_stt_interface.input_components[3],
         txt2img_interface.input_components[2],
         txt2img_interface.input_components[4],