Commit 1366d43

Merge pull request #243 from Dartvauder/dev
Dev
2 parents 2b364ef + ca9ea12 commit 1366d43

File tree: 6 files changed, +186 -15 lines changed

LaunchFile/app.py (+163, -9)
@@ -30,6 +30,7 @@
 import base64
 import gc
 import cv2
+import av
 import subprocess
 import json
 import torch
@@ -85,6 +86,9 @@ def wrapper():
 AutoTokenizer = lazy_import('transformers', 'AutoTokenizer')
 AutoProcessor = lazy_import('transformers', 'AutoProcessor')
 TextIteratorStreamer = lazy_import('transformers', 'TextIteratorStreamer')
+LlavaNextVideoProcessor = lazy_import('transformers', 'LlavaNextVideoProcessor')
+LlavaNextVideoForConditionalGeneration = lazy_import('transformers', 'LlavaNextVideoForConditionalGeneration')
+Qwen2AudioForConditionalGeneration = lazy_import('transformers', 'Qwen2AudioForConditionalGeneration')
 BarkModel = lazy_import('transformers', 'BarkModel')
 pipeline = lazy_import('transformers', 'pipeline')
 T5EncoderModel = lazy_import('transformers', 'T5EncoderModel')
@@ -103,7 +107,6 @@ def wrapper():
 GPT2Tokenizer = lazy_import('transformers', 'GPT2Tokenizer')
 GPT2LMHeadModel = lazy_import('transformers', 'GPT2LMHeadModel')
 GenerationConfig = lazy_import('transformers', 'GenerationConfig')
-PeftModel = lazy_import('peft', 'PeftModel')

 # Diffusers import
 diffusers = lazy_import('diffusers', '')
@@ -225,6 +228,7 @@ def wrapper():
 pixelize = lazy_import('pixeloe.pixelize', 'pixelize')
 RVCInference = lazy_import('rvc_python.infer', 'RVCInference')
 remove = lazy_import('rembg', 'remove')
+PeftModel = lazy_import('peft', 'PeftModel')


 XFORMERS_AVAILABLE = False
@@ -825,12 +829,12 @@ def load_lora_model(base_model_name, lora_model_name, model_type):


 def load_moondream2_model(model_id, revision):
-    moondream2_model_path = os.path.join("inputs", "text", "llm_models", model_id)
+    moondream2_model_path = os.path.join("inputs", "text", model_id)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
         if not os.path.exists(moondream2_model_path):
             print(f"Downloading MoonDream2 model...")
             os.makedirs(moondream2_model_path, exist_ok=True)
-            device = "cuda" if torch.cuda.is_available() else "cpu"
             model = AutoModelForCausalLM().AutoModelForCausalLM.from_pretrained(
                 model_id, trust_remote_code=True, revision=revision
             ).to(device)
@@ -839,7 +843,6 @@ def load_moondream2_model(model_id, revision):
             tokenizer.save_pretrained(moondream2_model_path)
             print("MoonDream2 model downloaded")
         else:
-            device = "cuda" if torch.cuda.is_available() else "cpu"
             model = AutoModelForCausalLM().AutoModelForCausalLM.from_pretrained(moondream2_model_path, trust_remote_code=True).to(device)
             tokenizer = AutoTokenizer().AutoTokenizer.from_pretrained(moondream2_model_path)
         return model, tokenizer, None
@@ -852,6 +855,70 @@ def load_moondream2_model(model_id, revision):
         flush()


+def load_llava_next_video_model():
+    model_path = os.path.join("inputs", "text", "LLaVA-NeXT-Video")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    try:
+        if not os.path.exists(model_path):
+            print("Downloading LLaVA-NeXT-Video model...")
+            os.makedirs(model_path, exist_ok=True)
+            model = LlavaNextVideoForConditionalGeneration().LlavaNextVideoForConditionalGeneration.from_pretrained(
+                "llava-hf/LLaVA-NeXT-Video-7B-hf",
+                torch_dtype=torch.float16,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True
+            ).to(device)
+            processor = LlavaNextVideoProcessor().LlavaNextVideoProcessor.from_pretrained(
+                "llava-hf/LLaVA-NeXT-Video-7B-hf")
+            model.save_pretrained(model_path)
+            processor.save_pretrained(model_path)
+            print("LLaVA-NeXT-Video model downloaded")
+        else:
+            model = LlavaNextVideoForConditionalGeneration().LlavaNextVideoForConditionalGeneration.from_pretrained(
+                model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, trust_remote_code=True).to(device)
+            processor = LlavaNextVideoProcessor().LlavaNextVideoProcessor.from_pretrained(model_path)
+        return model, processor, None
+
+    except Exception as e:
+        return None, None, str(e)
+    finally:
+        del processor
+        del model
+        flush()
+
+
+def load_qwen2_audio_model():
+    qwen2_audio_path = os.path.join("inputs", "text", "Qwen2-Audio")
+    if not os.path.exists(qwen2_audio_path):
+        print("Downloading Qwen2-Audio model...")
+        os.makedirs(qwen2_audio_path, exist_ok=True)
+        Repo.clone_from("https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct", qwen2_audio_path)
+        print("Qwen2-Audio model downloaded")
+
+    processor = AutoProcessor().AutoProcessor.from_pretrained(qwen2_audio_path)
+    model = Qwen2AudioForConditionalGeneration().Qwen2AudioForConditionalGeneration.from_pretrained(qwen2_audio_path,
+                                                                                                    device_map="auto")
+    return processor, model
+
+
+def process_qwen2_audio(processor, model, audio_file, prompt):
+    conversation = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {"role": "user", "content": [
+            {"type": "audio", "audio_url": audio_file},
+            {"type": "text", "text": prompt},
+        ]},
+    ]
+    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+    audio, _ = librosa.load(audio_file, sr=processor.feature_extractor.sampling_rate)
+    inputs = processor(text=text, audios=[audio], return_tensors="pt", padding=True)
+    inputs.input_ids = inputs.input_ids.to("cuda")
+    generate_ids = model.generate(**inputs, max_length=256)
+    generate_ids = generate_ids[:, inputs.input_ids.size(1):]
+    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    return response
+
+
 def transcribe_audio(audio_file_path):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     whisper_model_path = "inputs/text/whisper-medium"
@@ -935,6 +1002,19 @@ def load_multiband_diffusion_model():
     return multiband_diffusion_path


+def read_video_pyav(container, indices):
+    frames = []
+    container.seek(0)
+    start_index = indices[0]
+    end_index = indices[-1]
+    for i, frame in enumerate(container.decode(video=0)):
+        if i > end_index:
+            break
+        if i >= start_index and i in indices:
+            frames.append(frame)
+    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
 def get_languages():
     return {
         "Arabic": "ara", "Chinese": "cmn", "English": "eng", "French": "fra",
@@ -944,7 +1024,7 @@ def get_languages():
     }


-def generate_text_and_speech(input_text, system_prompt, input_audio, llm_model_type, llm_model_name, llm_lora_model_name, enable_web_search, enable_libretranslate, target_lang, enable_openparse, pdf_file, enable_multimodal, input_image, enable_tts,
+def generate_text_and_speech(input_text, system_prompt, input_audio, llm_model_type, llm_model_name, llm_lora_model_name, enable_web_search, enable_libretranslate, target_lang, enable_openparse, pdf_file, enable_multimodal, input_image, input_video, enable_tts,
                              llm_settings_html, max_new_tokens, max_length, min_length, n_ctx, n_batch, temperature, top_p, min_p, typical_p, top_k,
                              do_sample, early_stopping, stopping, repetition_penalty, frequency_penalty, presence_penalty, length_penalty, no_repeat_ngram_size, num_beams, num_return_sequences, chat_history_format, tts_settings_html, speaker_wav, language, tts_temperature, tts_top_p, tts_top_k, tts_speed, tts_repetition_penalty, tts_length_penalty, output_format):
     global chat_history, chat_dir, tts_model, whisper_model
@@ -974,9 +1054,9 @@ def generate_text_and_speech(input_text, system_prompt, input_audio, llm_model_t
     else:
         openparse_context = ""

-    if enable_multimodal and llm_model_name == "moondream2":
+    if enable_multimodal and llm_model_name == "Moondream2-Image":
         if llm_model_type == "llama":
-            moondream2_path = os.path.join("inputs", "text", "llm_models", "moondream2")
+            moondream2_path = os.path.join("inputs", "text", "moondream2-cpp")

             if not os.path.exists(moondream2_path):
                 print("Downloading Moondream2 model...")
@@ -1076,6 +1156,79 @@ def image_to_base64_data_uri(image_path):
             del tokenizer
             flush()

+    elif enable_multimodal and llm_model_name == "LLaVA-NeXT-Video":
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        if llm_model_type == "llama":
+            return None, None, "LLaVA-NeXT-Video is not supported with llama model type."
+        else:
+
+            model, processor = load_llava_next_video_model()
+
+            try:
+                if input_video:
+                    container = av.open(input_video)
+                    total_frames = container.streams.video[0].frames
+                    indices = np.arange(0, total_frames, total_frames / 8).astype(int)
+                    clip = read_video_pyav(container, indices)
+                else:
+                    return None, None, "Please upload a video for LLaVA-NeXT-Video input."
+
+                context = ""
+                for human_text, ai_text in chat_history[-5:]:
+                    if human_text:
+                        context += f"Human: {human_text}\n"
+                    if ai_text:
+                        context += f"AI: {ai_text}\n"
+
+                conversation = [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": f"{context}Human: {prompt}"},
+                            {"type": "video"},
+                        ],
+                    },
+                ]
+
+                if not chat_history or chat_history[-1][1] is not None:
+                    chat_history.append([prompt, ""])
+
+                prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+                inputs_video = processor(text=prompt, videos=clip, padding=True, return_tensors="pt").to(device)
+
+                output = model.generate(**inputs_video, max_new_tokens=max_new_tokens, do_sample=do_sample,
+                                        temperature=temperature, top_p=top_p, top_k=top_k)
+
+                response = processor.decode(output[0][2:], skip_special_tokens=True)
+                chat_history[-1][1] = response
+                yield chat_history, None, None
+
+            except Exception as e:
+                return None, None, str(e)
+
+            finally:
+                del model
+                del processor
+                flush()
+
+    elif enable_multimodal and llm_model_name == "Qwen2-Audio":
+        processor, model = load_qwen2_audio_model()
+        if llm_model_type == "llama":
+            return None, None, "Qwen2-Audio is not supported with llama model type."
+        else:
+            try:
+                response = process_qwen2_audio(processor, model, input_audio, prompt)
+                if not chat_history or chat_history[-1][1] is not None:
+                    chat_history.append([prompt, ""])
+                chat_history[-1][1] = response
+                yield chat_history, None, None
+            except Exception as e:
+                return None, None, str(e)
+            finally:
+                del processor
+                del model
+                flush()
+
     else:
         if llm_model_type == "llama":
             tokenizer, llm_model, error_message = load_model(llm_model_name, llm_model_type, n_ctx, n_batch)
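For reference, a hedged sketch of driving the new LLaVA-NeXT-Video path outside the Gradio callback, assuming `LaunchFile.app` imports cleanly and that `sample.mp4` is a short local clip (hypothetical name). The loader defined earlier in this diff returns a `(model, processor, error)` triple, so the sketch unpacks three values.

```python
# Hedged sketch (not part of this commit): standalone LLaVA-NeXT-Video inference.
import av
import numpy as np
from LaunchFile.app import load_llava_next_video_model, read_video_pyav

model, processor, error = load_llava_next_video_model()  # returns (model, processor, error)
if error:
    raise RuntimeError(error)

container = av.open("sample.mp4")  # hypothetical clip
total_frames = container.streams.video[0].frames
indices = np.arange(0, total_frames, total_frames / 8).astype(int)
clip = read_video_pyav(container, indices)

conversation = [
    {"role": "user", "content": [
        {"type": "text", "text": "Human: Describe this video."},
        {"type": "video"},
    ]},
]
chat_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(text=chat_prompt, videos=clip, padding=True, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=128, do_sample=False)
print(processor.decode(output[0][2:], skip_special_tokens=True))
```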
@@ -8609,7 +8762,7 @@ def open_outputs_folder():
         os.system(f'open "{outputs_folder}"' if os.name == "darwin" else f'xdg-open "{outputs_folder}"')


-llm_models_list = [None, "moondream2"] + [model for model in os.listdir("inputs/text/llm_models") if not model.endswith(".txt") and model != "vikhyatk" and model != "lora"]
+llm_models_list = [None, "Moondream2-Image", "LLaVA-NeXT-Video", "Qwen2-Audio"] + [model for model in os.listdir("inputs/text/llm_models") if not model.endswith(".txt") and model != "vikhyatk" and model != "lora"]
 llm_lora_models_list = [None] + [model for model in os.listdir("inputs/text/llm_models/lora") if not model.endswith(".txt")]
 speaker_wavs_list = [None] + [wav for wav in os.listdir("inputs/audio/voices") if not wav.endswith(".txt")]
 stable_diffusion_models_list = [None] + [model for model in os.listdir("inputs/image/sd_models")
@@ -8641,7 +8794,7 @@ def open_outputs_folder():
 def reload_model_lists():
     global llm_models_list, llm_lora_models_list, speaker_wavs_list, stable_diffusion_models_list, vae_models_list, lora_models_list, quantized_flux_models_list, flux_lora_models_list, auraflow_lora_models_list, kolors_lora_models_list, textual_inversion_models_list, inpaint_models_list, rvc_models_list

-    llm_models_list = [None, "moondream2"] + [model for model in os.listdir("inputs/text/llm_models") if
+    llm_models_list = [None, "Moondream2-Image", "LLaVA-NeXT-Video", "Qwen2-Audio"] + [model for model in os.listdir("inputs/text/llm_models") if
                                               not model.endswith(".txt") and model != "vikhyatk" and model != "lora"]
     llm_lora_models_list = [None] + [model for model in os.listdir("inputs/text/llm_models/lora") if
                                      not model.endswith(".txt")]
@@ -8709,6 +8862,7 @@ def reload_interface():
         gr.File(label=_("Upload PDF file (for OpenParse)", lang), type="filepath"),
         gr.Checkbox(label=_("Enable Multimodal", lang), value=False),
         gr.Image(label=_("Upload your image (for Multimodal)", lang), type="filepath"),
+        gr.Video(label=_("Upload your video (for Multimodal)", lang)),
         gr.Checkbox(label=_("Enable TTS", lang), value=False),
         gr.HTML(_("<h3>LLM Settings</h3>", lang)),
         gr.Slider(minimum=256, maximum=32768, value=512, step=1, label=_("Max tokens", lang)),
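For reference, the new `gr.Video` component sits directly after the image input, which matches the position of the new `input_video` parameter in the updated `generate_text_and_speech` signature; Gradio passes these inputs positionally. A minimal sketch of that correspondence, with the surrounding interface wiring assumed:

```python
# Hedged sketch (not part of this commit): the multimodal inputs and the parameters they feed.
import gradio as gr

multimodal_inputs = [
    gr.Checkbox(label="Enable Multimodal", value=False),                    # -> enable_multimodal
    gr.Image(label="Upload your image (for Multimodal)", type="filepath"),  # -> input_image
    gr.Video(label="Upload your video (for Multimodal)"),                   # -> input_video (new)
]
```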

README.md (+7, -2)
@@ -6,7 +6,7 @@
 
 ## Description:
 
-A simple and convenient interface for using various neural network models. You can communicate with LLM and Moondream2 using text, voice and image input; use StableDiffusion, Kandinsky, Flux, HunyuanDiT, Lumina-T2X, Kolors, AuraFlow, Würstchen, DeepFloydIF, PixArt and PlaygroundV2.5, to generate images; ModelScope, ZeroScope 2, CogVideoX and Latte to generate videos; StableFast3D, Shap-E, SV34D and Zero123Plus to generate 3D objects; StableAudioOpen, AudioCraft and AudioLDM 2 to generate music and audio; CoquiTTS, MMS and SunoBark for text-to-speech; OpenAI-Whisper and MMS for speech-to-text; Wav2Lip for lip-sync; LivePortrait for animate an image; Roop to faceswap; Rembg to remove background; CodeFormer for face restore; PixelOE for image pixelization; DDColor for image colorization; LibreTranslate and SeamlessM4Tv2 for text translation; Demucs and UVR for audio file separation; RVC for voice conversion. You can also view files from the outputs directory in gallery, download the LLM and StableDiffusion models, change the application settings inside the interface and check system sensors
+A simple and convenient interface for using various neural network models. You can communicate with LLM using text, voice and image input; use StableDiffusion, Kandinsky, Flux, HunyuanDiT, Lumina-T2X, Kolors, AuraFlow, Würstchen, DeepFloydIF, PixArt and PlaygroundV2.5, to generate images; ModelScope, ZeroScope 2, CogVideoX and Latte to generate videos; StableFast3D, Shap-E, SV34D and Zero123Plus to generate 3D objects; StableAudioOpen, AudioCraft and AudioLDM 2 to generate music and audio; CoquiTTS, MMS and SunoBark for text-to-speech; OpenAI-Whisper and MMS for speech-to-text; Wav2Lip for lip-sync; LivePortrait for animate an image; Roop to faceswap; Rembg to remove background; CodeFormer for face restore; PixelOE for image pixelization; DDColor for image colorization; LibreTranslate and SeamlessM4Tv2 for text translation; Demucs and UVR for audio file separation; RVC for voice conversion. You can also view files from the outputs directory in gallery, download the LLM and StableDiffusion models, change the application settings inside the interface and check system sensors
 
 The goal of the project - to create the easiest possible application to use neural network models
 
@@ -51,7 +51,7 @@ The goal of the project - to create the easiest possible application to use neur
 * Support StableFast3D, Shap-E, SV34D and Zero123Plus for 3D generation
 * Support Wav2Lip
 * Support LivePortrait for animate an image
-* Support Multimodal (Moondream 2), PDF-Parsing (OpenParse), TTS (CoquiTTS), STT (Whisper), LORA and WebSearch (with DuckDuckGo) for LLM
+* Support Multimodal (Moondream 2, LLaVA-NeXT-Video, Qwen2-Audio), PDF-Parsing (OpenParse), TTS (CoquiTTS), STT (Whisper), LORA and WebSearch (with DuckDuckGo) for LLM
 * MetaData-Info viewer for generating image, video and audio
 * Model settings inside the interface
 * Online and offline Wiki
@@ -152,6 +152,9 @@ First of all, I want to thank the developers of [PyCharm](https://www.jetbrains.
 #### Many models have their own license for use. Before using it, I advise you to familiarize yourself with them:
 
 * [Transformers](https://github.com/huggingface/transformers/blob/main/LICENSE)
+* [AutoGPTQ](https://github.com/AutoGPTQ/AutoGPTQ/blob/main/LICENSE)
+* [AutoAWQ](https://github.com/casper-hansen/AutoAWQ/blob/main/LICENSE)
+* [exllamav2](https://github.com/turboderp/exllamav2/blob/master/LICENSE)
 * [llama.cpp](https://github.com/ggerganov/llama.cpp/blob/master/LICENSE)
 * [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp/blob/master/LICENSE)
 * [CoquiTTS](https://coqui.ai/cpml)
@@ -174,6 +177,8 @@ First of all, I want to thank the developers of [PyCharm](https://www.jetbrains.
 * [Demucs](https://github.com/facebookresearch/demucs/blob/main/LICENSE)
 * [SunoBark](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/mit.md)
 * [Moondream2](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md)
+* [LLaVA-NeXT-Video](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/LICENSE.txt)
+* [Qwen2-Audio](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md)
 * [ZeroScope2](https://spdx.org/licenses/CC-BY-NC-4.0)
 * [GLIGEN](https://huggingface.co/spaces/CompVis/stable-diffusion-license)
 * [Wav2Lip](https://github.com/Rudrabha/Wav2Lip)
