
Commit a46111a
Merge pull request #244 from Dartvauder/dev
LLM-FIX
2 parents: 1366d43 + 2530c5b

File tree: 3 files changed, +9 -6 lines
LaunchFile/app.py (+7 -6)
@@ -901,16 +901,16 @@ def load_qwen2_audio_model():
     return processor, model
 
 
-def process_qwen2_audio(processor, model, audio_file, prompt):
+def process_qwen2_audio(processor, model, input_audio_mm, prompt):
     conversation = [
         {'role': 'system', 'content': 'You are a helpful assistant.'},
         {"role": "user", "content": [
-            {"type": "audio", "audio_url": audio_file},
+            {"type": "audio", "audio_url": input_audio_mm},
             {"type": "text", "text": prompt},
         ]},
     ]
     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-    audio, _ = librosa.load(audio_file, sr=processor.feature_extractor.sampling_rate)
+    audio, _ = librosa.load(input_audio_mm, sr=processor.feature_extractor.sampling_rate)
     inputs = processor(text=text, audios=[audio], return_tensors="pt", padding=True)
     inputs.input_ids = inputs.input_ids.to("cuda")
     generate_ids = model.generate(**inputs, max_length=256)
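
For readers following the rename: process_qwen2_audio now takes the Gradio-supplied path as input_audio_mm, but its behavior is unchanged. Below is a minimal sketch of how the loader and a call to the renamed function could fit together with the transformers API; the actual load_qwen2_audio_model body is not part of this diff, so the model id and device placement here are assumptions.

# Sketch only: loading Qwen2-Audio and calling the renamed function.
# Model id and device handling are assumptions, not the app's code.
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

def load_qwen2_audio_model():
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-Audio-7B-Instruct", device_map="auto"
    )
    return processor, model

processor, model = load_qwen2_audio_model()
# input_audio_mm is a filesystem path, e.g. the value Gradio returns for an upload
response = process_qwen2_audio(processor, model, "sample.wav", "Describe this audio.")
print(response)
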
@@ -1024,7 +1024,7 @@ def get_languages():
     }
 
 
-def generate_text_and_speech(input_text, system_prompt, input_audio, llm_model_type, llm_model_name, llm_lora_model_name, enable_web_search, enable_libretranslate, target_lang, enable_openparse, pdf_file, enable_multimodal, input_image, input_video, enable_tts,
+def generate_text_and_speech(input_text, system_prompt, input_audio, llm_model_type, llm_model_name, llm_lora_model_name, enable_web_search, enable_libretranslate, target_lang, enable_openparse, pdf_file, enable_multimodal, input_image, input_video, input_audio_mm, enable_tts,
                              llm_settings_html, max_new_tokens, max_length, min_length, n_ctx, n_batch, temperature, top_p, min_p, typical_p, top_k,
                              do_sample, early_stopping, stopping, repetition_penalty, frequency_penalty, presence_penalty, length_penalty, no_repeat_ngram_size, num_beams, num_return_sequences, chat_history_format, tts_settings_html, speaker_wav, language, tts_temperature, tts_top_p, tts_top_k, tts_speed, tts_repetition_penalty, tts_length_penalty, output_format):
     global chat_history, chat_dir, tts_model, whisper_model
@@ -1217,7 +1217,7 @@ def image_to_base64_data_uri(image_path):
             return None, None, "Qwen2-Audio is not supported with llama model type."
         else:
             try:
-                response = process_qwen2_audio(processor, model, input_audio, prompt)
+                response = process_qwen2_audio(processor, model, input_audio_mm, prompt)
                 if not chat_history or chat_history[-1][1] is not None:
                     chat_history.append([prompt, ""])
                 chat_history[-1][1] = response
@@ -8863,6 +8863,7 @@ def reload_interface():
88638863
gr.Checkbox(label=_("Enable Multimodal", lang), value=False),
88648864
gr.Image(label=_("Upload your image (for Multimodal)", lang), type="filepath"),
88658865
gr.Video(label=_("Upload your video (for Multimodal)", lang)),
8866+
gr.Audio(label=_("Upload your audio (for Multimodal)", lang), type="filepath"),
88668867
gr.Checkbox(label=_("Enable TTS", lang), value=False),
88678868
gr.HTML(_("<h3>LLM Settings</h3>", lang)),
88688869
gr.Slider(minimum=256, maximum=32768, value=512, step=1, label=_("Max tokens", lang)),
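
Because the new gr.Audio component uses type="filepath", Gradio hands the callback a path string, which is what librosa.load in process_qwen2_audio expects. A standalone sketch of that wiring follows, with illustrative component names rather than the app's actual interface definition.

# Sketch only: a gr.Audio(type="filepath") input delivers a path string to the callback.
import gradio as gr

def describe_audio(input_audio_mm, prompt):
    if input_audio_mm is None:
        return "No audio uploaded."
    # In the real app this path would be passed on to process_qwen2_audio(...)
    return f"Would run Qwen2-Audio on {input_audio_mm!r} with prompt: {prompt!r}"

demo = gr.Interface(
    fn=describe_audio,
    inputs=[
        gr.Audio(label="Upload your audio (for Multimodal)", type="filepath"),
        gr.Textbox(label="Prompt"),
    ],
    outputs=gr.Textbox(label="Response"),
)

if __name__ == "__main__":
    demo.launch()
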
@@ -11490,7 +11491,7 @@ def reload_interface():
1149011491
dropdowns_to_update = [
1149111492
chat_interface.input_components[4],
1149211493
chat_interface.input_components[5],
11493-
chat_interface.input_components[38],
11494+
chat_interface.input_components[40],
1149411495
tts_stt_interface.input_components[3],
1149511496
txt2img_interface.input_components[2],
1149611497
txt2img_interface.input_components[4],
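
The change from input_components[38] to input_components[40] keeps the dropdown-refresh logic pointing at the same component after new inputs were inserted ahead of it; only the gr.Audio row is visible in this diff, so the full accounting for the shift of two is not shown here. As a generic illustration (not the project's code), a hard-coded positional index moves whenever an input is inserted earlier in the list:

# Illustration only: inserting an input shifts every later positional index.
inputs = ["prompt", "system_prompt", "image", "video", "tts_enable", "max_tokens"]
before = inputs.index("max_tokens")      # 5

inputs.insert(4, "audio_mm")             # new multimodal audio input
after = inputs.index("max_tokens")       # 6

assert after == before + 1
# Resolving components by a stable key (or label) instead of a hard-coded index
# avoids this kind of off-by-N update when the layout changes.
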

translations/ru.json (+1)

@@ -7,6 +7,7 @@
   "LLM and TTS Settings": "LLM и TTS настройки",
   "TTS and STT Settings": "TTS и STT настройки",
   "Upload your video (for Multimodal)": "Загрузите ваше видео (для мультимодального режима)",
+  "Upload your audio (for Multimodal)": "Загрузите свое аудио (для мультимодального режима)",
   "Max tokens": "Максимум токенов",
   "Min length": "Минимальная длина",
   "Context size (N_CTX) for llama type models": "Размер контекста (N_CTX) для моделей типа llama",

translations/zh.json (+1)

@@ -7,6 +7,7 @@
   "LLM and TTS Settings": "LLM和TTS设置",
   "TTS and STT Settings": "TTS和STT设置",
   "Upload your video (for Multimodal)": "上传您的视频(用于多模态)",
+  "Upload your audio (for Multimodal)": "上传您的音频(用于多模态)",
   "Max tokens": "最大令牌数",
   "Min length": "最小长度",
   "Context size (N_CTX) for llama type models": "llama类型模型的上下文大小 (N_CTX)",
