Commit 1366d43

Merge pull request #243 from Dartvauder/dev
Dev
2 parents 2b364ef + ca9ea12 commit 1366d43

File tree: 6 files changed, +186 -15 lines changed

LaunchFile/app.py (+163, -9)
@@ -30,6 +30,7 @@
 import base64
 import gc
 import cv2
+import av
 import subprocess
 import json
 import torch
@@ -85,6 +86,9 @@ def wrapper():
 AutoTokenizer = lazy_import('transformers', 'AutoTokenizer')
 AutoProcessor = lazy_import('transformers', 'AutoProcessor')
 TextIteratorStreamer = lazy_import('transformers', 'TextIteratorStreamer')
+LlavaNextVideoProcessor = lazy_import('transformers', 'LlavaNextVideoProcessor')
+LlavaNextVideoForConditionalGeneration = lazy_import('transformers', 'LlavaNextVideoForConditionalGeneration')
+Qwen2AudioForConditionalGeneration = lazy_import('transformers', 'Qwen2AudioForConditionalGeneration')
 BarkModel = lazy_import('transformers', 'BarkModel')
 pipeline = lazy_import('transformers', 'pipeline')
 T5EncoderModel = lazy_import('transformers', 'T5EncoderModel')
@@ -103,7 +107,6 @@ def wrapper():
 GPT2Tokenizer = lazy_import('transformers', 'GPT2Tokenizer')
 GPT2LMHeadModel = lazy_import('transformers', 'GPT2LMHeadModel')
 GenerationConfig = lazy_import('transformers', 'GenerationConfig')
-PeftModel = lazy_import('peft', 'PeftModel')

 # Diffusers import
 diffusers = lazy_import('diffusers', '')
@@ -225,6 +228,7 @@ def wrapper():
 pixelize = lazy_import('pixeloe.pixelize', 'pixelize')
 RVCInference = lazy_import('rvc_python.infer', 'RVCInference')
 remove = lazy_import('rembg', 'remove')
+PeftModel = lazy_import('peft', 'PeftModel')


 XFORMERS_AVAILABLE = False
@@ -825,12 +829,12 @@ def load_lora_model(base_model_name, lora_model_name, model_type):


 def load_moondream2_model(model_id, revision):
-    moondream2_model_path = os.path.join("inputs", "text", "llm_models", model_id)
+    moondream2_model_path = os.path.join("inputs", "text", model_id)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
         if not os.path.exists(moondream2_model_path):
             print(f"Downloading MoonDream2 model...")
             os.makedirs(moondream2_model_path, exist_ok=True)
-            device = "cuda" if torch.cuda.is_available() else "cpu"
             model = AutoModelForCausalLM().AutoModelForCausalLM.from_pretrained(
                 model_id, trust_remote_code=True, revision=revision
             ).to(device)
@@ -839,7 +843,6 @@ def load_moondream2_model(model_id, revision):
             tokenizer.save_pretrained(moondream2_model_path)
             print("MoonDream2 model downloaded")
         else:
-            device = "cuda" if torch.cuda.is_available() else "cpu"
             model = AutoModelForCausalLM().AutoModelForCausalLM.from_pretrained(moondream2_model_path, trust_remote_code=True).to(device)
             tokenizer = AutoTokenizer().AutoTokenizer.from_pretrained(moondream2_model_path)
         return model, tokenizer, None
@@ -852,6 +855,70 @@ def load_moondream2_model(model_id, revision):
         flush()


+def load_llava_next_video_model():
+    model_path = os.path.join("inputs", "text", "LLaVA-NeXT-Video")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    try:
+        if not os.path.exists(model_path):
+            print("Downloading LLaVA-NeXT-Video model...")
+            os.makedirs(model_path, exist_ok=True)
+            model = LlavaNextVideoForConditionalGeneration().LlavaNextVideoForConditionalGeneration.from_pretrained(
+                "llava-hf/LLaVA-NeXT-Video-7B-hf",
+                torch_dtype=torch.float16,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True
+            ).to(device)
+            processor = LlavaNextVideoProcessor().LlavaNextVideoProcessor.from_pretrained(
+                "llava-hf/LLaVA-NeXT-Video-7B-hf")
+            model.save_pretrained(model_path)
+            processor.save_pretrained(model_path)
+            print("LLaVA-NeXT-Video model downloaded")
+        else:
+            model = LlavaNextVideoForConditionalGeneration().LlavaNextVideoForConditionalGeneration.from_pretrained(
+                model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, trust_remote_code=True).to(device)
+            processor = LlavaNextVideoProcessor().LlavaNextVideoProcessor.from_pretrained(model_path)
+        return model, processor, None
+
+    except Exception as e:
+        return None, None, str(e)
+    finally:
+        del processor
+        del model
+        flush()
+
+
+def load_qwen2_audio_model():
+    qwen2_audio_path = os.path.join("inputs", "text", "Qwen2-Audio")
+    if not os.path.exists(qwen2_audio_path):
+        print("Downloading Qwen2-Audio model...")
+        os.makedirs(qwen2_audio_path, exist_ok=True)
+        Repo.clone_from("https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct", qwen2_audio_path)
+        print("Qwen2-Audio model downloaded")
+
+    processor = AutoProcessor().AutoProcessor.from_pretrained(qwen2_audio_path)
+    model = Qwen2AudioForConditionalGeneration().Qwen2AudioForConditionalGeneration.from_pretrained(qwen2_audio_path,
+                                                                                                    device_map="auto")
+    return processor, model
+
+
+def process_qwen2_audio(processor, model, audio_file, prompt):
+    conversation = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {"role": "user", "content": [
+            {"type": "audio", "audio_url": audio_file},
+            {"type": "text", "text": prompt},
+        ]},
+    ]
+    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+    audio, _ = librosa.load(audio_file, sr=processor.feature_extractor.sampling_rate)
+    inputs = processor(text=text, audios=[audio], return_tensors="pt", padding=True)
+    inputs.input_ids = inputs.input_ids.to("cuda")
+    generate_ids = model.generate(**inputs, max_length=256)
+    generate_ids = generate_ids[:, inputs.input_ids.size(1):]
+    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    return response
+
+
 def transcribe_audio(audio_file_path):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     whisper_model_path = "inputs/text/whisper-medium"
@@ -935,6 +1002,19 @@ def load_multiband_diffusion_model():
     return multiband_diffusion_path


+def read_video_pyav(container, indices):
+    frames = []
+    container.seek(0)
+    start_index = indices[0]
+    end_index = indices[-1]
+    for i, frame in enumerate(container.decode(video=0)):
+        if i > end_index:
+            break
+        if i >= start_index and i in indices:
+            frames.append(frame)
+    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
 def get_languages():
     return {
         "Arabic": "ara", "Chinese": "cmn", "English": "eng", "French": "fra",
@@ -944,7 +1024,7 @@ def get_languages():
     }


-def generate_text_and_speech(input_text, system_prompt, input_audio, llm_model_type, llm_model_name, llm_lora_model_name, enable_web_search, enable_libretranslate, target_lang, enable_openparse, pdf_file, enable_multimodal, input_image, enable_tts,
+def generate_text_and_speech(input_text, system_prompt, input_audio, llm_model_type, llm_model_name, llm_lora_model_name, enable_web_search, enable_libretranslate, target_lang, enable_openparse, pdf_file, enable_multimodal, input_image, input_video, enable_tts,
                              llm_settings_html, max_new_tokens, max_length, min_length, n_ctx, n_batch, temperature, top_p, min_p, typical_p, top_k,
                              do_sample, early_stopping, stopping, repetition_penalty, frequency_penalty, presence_penalty, length_penalty, no_repeat_ngram_size, num_beams, num_return_sequences, chat_history_format, tts_settings_html, speaker_wav, language, tts_temperature, tts_top_p, tts_top_k, tts_speed, tts_repetition_penalty, tts_length_penalty, output_format):
     global chat_history, chat_dir, tts_model, whisper_model
@@ -974,9 +1054,9 @@ def generate_text_and_speech(input_text, system_prompt, input_audio, llm_model_t
     else:
         openparse_context = ""

-    if enable_multimodal and llm_model_name == "moondream2":
+    if enable_multimodal and llm_model_name == "Moondream2-Image":
         if llm_model_type == "llama":
-            moondream2_path = os.path.join("inputs", "text", "llm_models", "moondream2")
+            moondream2_path = os.path.join("inputs", "text", "moondream2-cpp")

             if not os.path.exists(moondream2_path):
                 print("Downloading Moondream2 model...")
@@ -1076,6 +1156,79 @@ def image_to_base64_data_uri(image_path):
             del tokenizer
             flush()

+    elif enable_multimodal and llm_model_name == "LLaVA-NeXT-Video":
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        if llm_model_type == "llama":
+            return None, None, "LLaVA-NeXT-Video is not supported with llama model type."
+        else:
+
+            model, processor = load_llava_next_video_model()
+
+            try:
+                if input_video:
+                    container = av.open(input_video)
+                    total_frames = container.streams.video[0].frames
+                    indices = np.arange(0, total_frames, total_frames / 8).astype(int)
+                    clip = read_video_pyav(container, indices)
+                else:
+                    return None, None, "Please upload a video for LLaVA-NeXT-Video input."
+
+                context = ""
+                for human_text, ai_text in chat_history[-5:]:
+                    if human_text:
+                        context += f"Human: {human_text}\n"
+                    if ai_text:
+                        context += f"AI: {ai_text}\n"
+
+                conversation = [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": f"{context}Human: {prompt}"},
+                            {"type": "video"},
+                        ],
+                    },
+                ]
+
+                if not chat_history or chat_history[-1][1] is not None:
+                    chat_history.append([prompt, ""])
+
+                prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+                inputs_video = processor(text=prompt, videos=clip, padding=True, return_tensors="pt").to(device)
+
+                output = model.generate(**inputs_video, max_new_tokens=max_new_tokens, do_sample=do_sample,
+                                        temperature=temperature, top_p=top_p, top_k=top_k)
+
+                response = processor.decode(output[0][2:], skip_special_tokens=True)
+                chat_history[-1][1] = response
+                yield chat_history, None, None
+
+            except Exception as e:
+                return None, None, str(e)
+
+            finally:
+                del model
+                del processor
+                flush()
+
+    elif enable_multimodal and llm_model_name == "Qwen2-Audio":
+        processor, model = load_qwen2_audio_model()
+        if llm_model_type == "llama":
+            return None, None, "Qwen2-Audio is not supported with llama model type."
+        else:
+            try:
+                response = process_qwen2_audio(processor, model, input_audio, prompt)
+                if not chat_history or chat_history[-1][1] is not None:
+                    chat_history.append([prompt, ""])
+                chat_history[-1][1] = response
+                yield chat_history, None, None
+            except Exception as e:
+                return None, None, str(e)
+            finally:
+                del processor
+                del model
+                flush()
+
     else:
         if llm_model_type == "llama":
             tokenizer, llm_model, error_message = load_model(llm_model_name, llm_model_type, n_ctx, n_batch)
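For reference, a hedged sketch of driving the new LLaVA-NeXT-Video path outside the Gradio callback, assuming `LaunchFile.app` imports cleanly and that `sample.mp4` is a short local clip (hypothetical name). The loader defined earlier in this diff returns a `(model, processor, error)` triple, so the sketch unpacks three values.

```python
# Hedged sketch (not part of this commit): standalone LLaVA-NeXT-Video inference.
import av
import numpy as np
from LaunchFile.app import load_llava_next_video_model, read_video_pyav

model, processor, error = load_llava_next_video_model()  # returns (model, processor, error)
if error:
    raise RuntimeError(error)

container = av.open("sample.mp4")  # hypothetical clip
total_frames = container.streams.video[0].frames
indices = np.arange(0, total_frames, total_frames / 8).astype(int)
clip = read_video_pyav(container, indices)

conversation = [
    {"role": "user", "content": [
        {"type": "text", "text": "Human: Describe this video."},
        {"type": "video"},
    ]},
]
chat_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(text=chat_prompt, videos=clip, padding=True, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=128, do_sample=False)
print(processor.decode(output[0][2:], skip_special_tokens=True))
```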
@@ -8609,7 +8762,7 @@ def open_outputs_folder():
         os.system(f'open "{outputs_folder}"' if os.name == "darwin" else f'xdg-open "{outputs_folder}"')


-llm_models_list = [None, "moondream2"] + [model for model in os.listdir("inputs/text/llm_models") if not model.endswith(".txt") and model != "vikhyatk" and model != "lora"]
+llm_models_list = [None, "Moondream2-Image", "LLaVA-NeXT-Video", "Qwen2-Audio"] + [model for model in os.listdir("inputs/text/llm_models") if not model.endswith(".txt") and model != "vikhyatk" and model != "lora"]
 llm_lora_models_list = [None] + [model for model in os.listdir("inputs/text/llm_models/lora") if not model.endswith(".txt")]
 speaker_wavs_list = [None] + [wav for wav in os.listdir("inputs/audio/voices") if not wav.endswith(".txt")]
 stable_diffusion_models_list = [None] + [model for model in os.listdir("inputs/image/sd_models")
@@ -8641,7 +8794,7 @@ def open_outputs_folder():
 def reload_model_lists():
     global llm_models_list, llm_lora_models_list, speaker_wavs_list, stable_diffusion_models_list, vae_models_list, lora_models_list, quantized_flux_models_list, flux_lora_models_list, auraflow_lora_models_list, kolors_lora_models_list, textual_inversion_models_list, inpaint_models_list, rvc_models_list

-    llm_models_list = [None, "moondream2"] + [model for model in os.listdir("inputs/text/llm_models") if
+    llm_models_list = [None, "Moondream2-Image", "LLaVA-NeXT-Video", "Qwen2-Audio"] + [model for model in os.listdir("inputs/text/llm_models") if
                                               not model.endswith(".txt") and model != "vikhyatk" and model != "lora"]
     llm_lora_models_list = [None] + [model for model in os.listdir("inputs/text/llm_models/lora") if
                                      not model.endswith(".txt")]
@@ -8709,6 +8862,7 @@ def reload_interface():
         gr.File(label=_("Upload PDF file (for OpenParse)", lang), type="filepath"),
         gr.Checkbox(label=_("Enable Multimodal", lang), value=False),
         gr.Image(label=_("Upload your image (for Multimodal)", lang), type="filepath"),
+        gr.Video(label=_("Upload your video (for Multimodal)", lang)),
         gr.Checkbox(label=_("Enable TTS", lang), value=False),
         gr.HTML(_("<h3>LLM Settings</h3>", lang)),
         gr.Slider(minimum=256, maximum=32768, value=512, step=1, label=_("Max tokens", lang)),
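For reference, the new `gr.Video` component sits directly after the image input, which matches the position of the new `input_video` parameter in the updated `generate_text_and_speech` signature; Gradio passes these inputs positionally. A minimal sketch of that correspondence, with the surrounding interface wiring assumed:

```python
# Hedged sketch (not part of this commit): the multimodal inputs and the parameters they feed.
import gradio as gr

multimodal_inputs = [
    gr.Checkbox(label="Enable Multimodal", value=False),                    # -> enable_multimodal
    gr.Image(label="Upload your image (for Multimodal)", type="filepath"),  # -> input_image
    gr.Video(label="Upload your video (for Multimodal)"),                   # -> input_video (new)
]
```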

README.md (+7, -2)
@@ -6,7 +6,7 @@
 
 ## Description:
 
-A simple and convenient interface for using various neural network models. You can communicate with LLM and Moondream2 using text, voice and image input; use StableDiffusion, Kandinsky, Flux, HunyuanDiT, Lumina-T2X, Kolors, AuraFlow, Würstchen, DeepFloydIF, PixArt and PlaygroundV2.5, to generate images; ModelScope, ZeroScope 2, CogVideoX and Latte to generate videos; StableFast3D, Shap-E, SV34D and Zero123Plus to generate 3D objects; StableAudioOpen, AudioCraft and AudioLDM 2 to generate music and audio; CoquiTTS, MMS and SunoBark for text-to-speech; OpenAI-Whisper and MMS for speech-to-text; Wav2Lip for lip-sync; LivePortrait for animate an image; Roop to faceswap; Rembg to remove background; CodeFormer for face restore; PixelOE for image pixelization; DDColor for image colorization; LibreTranslate and SeamlessM4Tv2 for text translation; Demucs and UVR for audio file separation; RVC for voice conversion. You can also view files from the outputs directory in gallery, download the LLM and StableDiffusion models, change the application settings inside the interface and check system sensors
+A simple and convenient interface for using various neural network models. You can communicate with LLM using text, voice and image input; use StableDiffusion, Kandinsky, Flux, HunyuanDiT, Lumina-T2X, Kolors, AuraFlow, Würstchen, DeepFloydIF, PixArt and PlaygroundV2.5, to generate images; ModelScope, ZeroScope 2, CogVideoX and Latte to generate videos; StableFast3D, Shap-E, SV34D and Zero123Plus to generate 3D objects; StableAudioOpen, AudioCraft and AudioLDM 2 to generate music and audio; CoquiTTS, MMS and SunoBark for text-to-speech; OpenAI-Whisper and MMS for speech-to-text; Wav2Lip for lip-sync; LivePortrait for animate an image; Roop to faceswap; Rembg to remove background; CodeFormer for face restore; PixelOE for image pixelization; DDColor for image colorization; LibreTranslate and SeamlessM4Tv2 for text translation; Demucs and UVR for audio file separation; RVC for voice conversion. You can also view files from the outputs directory in gallery, download the LLM and StableDiffusion models, change the application settings inside the interface and check system sensors
 
 The goal of the project - to create the easiest possible application to use neural network models
 
@@ -51,7 +51,7 @@ The goal of the project - to create the easiest possible application to use neur
 * Support StableFast3D, Shap-E, SV34D and Zero123Plus for 3D generation
 * Support Wav2Lip
 * Support LivePortrait for animate an image
-* Support Multimodal (Moondream 2), PDF-Parsing (OpenParse), TTS (CoquiTTS), STT (Whisper), LORA and WebSearch (with DuckDuckGo) for LLM
+* Support Multimodal (Moondream 2, LLaVA-NeXT-Video, Qwen2-Audio), PDF-Parsing (OpenParse), TTS (CoquiTTS), STT (Whisper), LORA and WebSearch (with DuckDuckGo) for LLM
 * MetaData-Info viewer for generating image, video and audio
 * Model settings inside the interface
 * Online and offline Wiki
@@ -152,6 +152,9 @@ First of all, I want to thank the developers of [PyCharm](https://www.jetbrains.
 #### Many models have their own license for use. Before using it, I advise you to familiarize yourself with them:
 
 * [Transformers](https://github.com/huggingface/transformers/blob/main/LICENSE)
+* [AutoGPTQ](https://github.com/AutoGPTQ/AutoGPTQ/blob/main/LICENSE)
+* [AutoAWQ](https://github.com/casper-hansen/AutoAWQ/blob/main/LICENSE)
+* [exllamav2](https://github.com/turboderp/exllamav2/blob/master/LICENSE)
 * [llama.cpp](https://github.com/ggerganov/llama.cpp/blob/master/LICENSE)
 * [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp/blob/master/LICENSE)
 * [CoquiTTS](https://coqui.ai/cpml)
@@ -174,6 +177,8 @@ First of all, I want to thank the developers of [PyCharm](https://www.jetbrains.
 * [Demucs](https://github.com/facebookresearch/demucs/blob/main/LICENSE)
 * [SunoBark](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/mit.md)
 * [Moondream2](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md)
+* [LLaVA-NeXT-Video](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/LICENSE.txt)
+* [Qwen2-Audio](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md)
 * [ZeroScope2](https://spdx.org/licenses/CC-BY-NC-4.0)
 * [GLIGEN](https://huggingface.co/spaces/CompVis/stable-diffusion-license)
 * [Wav2Lip](https://github.com/Rudrabha/Wav2Lip)
