deepbeepmeep · Gunther-Schulz · Jun 20, 2025 · Jul 9, 2025 · Jul 11, 2025 · Aug 30, 2025
diff --git a/.gitignore b/.gitignore
@@ -41,3 +41,7 @@ gradio_outputs/
 ckpts/
 loras/
 loras_i2v/
+
+settings/
+
+wgp_config.json
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,92 @@
+FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
+
+# Build arg for GPU architectures: a ';'-separated list of CUDA compute capabilities to
+# compile for (exported further down as TORCH_CUDA_ARCH_LIST). Common values:
+#   7.0  - Tesla V100 (NOTE(review): the upstream detection loop skipped major < 8, so 7.x is presumably unsupported - confirm)
+#   7.5  - RTX 2060, 2070, 2080, Titan RTX
+#   8.0  - A100, A800 (Ampere data center)
+#   8.6  - RTX 3060, 3070, 3080, 3090 (Ampere consumer)
+#   8.9  - RTX 4070, 4080, 4090 (Ada Lovelace)
+#   9.0  - H100, H800 (Hopper data center)
+#   12.0 - RTX 5070, 5080, 5090 (Blackwell, sm_120) - NOTE(review): likely needs a newer CUDA toolkit than the 12.4.1 base image; confirm
+#
+# Examples:
+#   RTX 3060: --build-arg CUDA_ARCHITECTURES="8.6"
+#   RTX 4090: --build-arg CUDA_ARCHITECTURES="8.9"
+#   Multiple: --build-arg CUDA_ARCHITECTURES="8.0;8.6;8.9"
+#
+# Note: Including 8.9 or 9.0 may cause compilation issues on some setups
+# Default includes 8.0 and 8.6 for broad Ampere compatibility
+ARG CUDA_ARCHITECTURES="8.0;8.6"
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install system dependencies; DEBIAN_FRONTEND is an ARG so it stays build-time only, and the apt lists are removed in the same layer
+RUN apt-get update && \
+    apt-get install -y \
+    python3 python3-pip git wget curl cmake ninja-build \
+    libgl1 libglib2.0-0 ffmpeg && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+COPY requirements.txt .
+
+# Upgrade pip first (--no-cache-dir keeps pip's download cache out of the image layers)
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel
+
+# Install the project requirements (the COPY above fails the build if the file is missing)
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Install PyTorch with CUDA support, pinned to the cu124 builds matching the CUDA 12.4 base image
+RUN pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cu124 \
+    torch==2.6.0+cu124 torchvision==0.21.0+cu124
+
+# Environment for the SageAttention source build below (its GPU detection is patched to use TORCH_CUDA_ARCH_LIST)
+ENV TORCH_CUDA_ARCH_LIST="${CUDA_ARCHITECTURES}" \
+    FORCE_CUDA="1" \
+    MAX_JOBS="1"
+
+COPY <<EOF /tmp/patch_setup.py
+import os
+import sys
+
+# Get architectures from environment variable, ignoring empty entries such as a trailing ';'
+arch_list = [a.strip() for a in os.environ.get('TORCH_CUDA_ARCH_LIST', '').split(';') if a.strip()]
+if not arch_list:
+    sys.exit('patch_setup.py: TORCH_CUDA_ARCH_LIST is empty or unset')
+arch_set = '{' + ', '.join([f'"{arch}"' for arch in arch_list]) + '}'
+# Replace the GPU detection section (which probes torch.cuda devices) with the fixed set above
+old_section = '''compute_capabilities = set()
+device_count = torch.cuda.device_count()
+for i in range(device_count):
+    major, minor = torch.cuda.get_device_capability(i)
+    if major < 8:
+        warnings.warn(f"skipping GPU {i} with compute capability {major}.{minor}")
+        continue
+    compute_capabilities.add(f"{major}.{minor}")'''
+new_section = 'compute_capabilities = ' + arch_set + '''
+print(f"Manually set compute capabilities: {compute_capabilities}")'''
+with open('setup.py', 'r') as f:
+    content = f.read()
+if old_section not in content:
+    print('patch_setup.py: WARNING: GPU detection block not found; setup.py left unchanged', file=sys.stderr)
+with open('setup.py', 'w') as f:
+    f.write(content.replace(old_section, new_section))
+EOF
+
+# Shallow-clone, patch and install SageAttention, then delete the build tree in the same layer so it is not kept in the image
+RUN git clone --depth 1 https://github.com/thu-ml/SageAttention.git /tmp/sageattention && \
+    cd /tmp/sageattention && python3 /tmp/patch_setup.py && \
+    pip install --no-cache-dir --no-build-isolation . && cd / && rm -rf /tmp/sageattention
+
+# Create the unprivileged account (UID 1000) and give it the workspace and its cache directory in one layer.
+# NOTE(review): there is no USER instruction, so the container still starts as root unless
+# entrypoint.sh or `docker run --user` switches accounts - confirm this is intended.
+RUN useradd -u 1000 -ms /bin/bash user && \
+    chown -R user:user /workspace && \
+    mkdir /home/user/.cache && \
+    chown -R user:user /home/user/.cache
+
+COPY --chmod=0755 entrypoint.sh /workspace/entrypoint.sh
+ENTRYPOINT ["/workspace/entrypoint.sh"]
diff --git a/README.md b/README.md
diff --git a/configs/animate.json b/configs/animate.json
@@ -0,0 +1,15 @@
+{
+  "_class_name": "WanModel",
+  "_diffusers_version": "0.30.0",
+  "dim": 5120,
+  "eps": 1e-06,
+  "ffn_dim": 13824,
+  "freq_dim": 256,
+  "in_dim": 36,
+  "model_type": "i2v",
+  "num_heads": 40,
+  "num_layers": 40,
+  "out_dim": 16,
+  "text_len": 512,
+  "motion_encoder_dim": 512
+}
diff --git a/configs/lucy_edit.json b/configs/lucy_edit.json
@@ -0,0 +1,14 @@
+{
+  "_class_name": "WanModel",
+  "_diffusers_version": "0.33.0",
+  "dim": 3072,
+  "eps": 1e-06,
+  "ffn_dim": 14336,
+  "freq_dim": 256,
+  "in_dim": 96,
+  "model_type": "ti2v2_2",
+  "num_heads": 24,
+  "num_layers": 30,
+  "out_dim": 48,
+  "text_len": 512
+}
diff --git a/configs/lynx.json b/configs/lynx.json
@@ -0,0 +1,15 @@
+{
+  "_class_name": "WanModel",
+  "_diffusers_version": "0.30.0",
+  "dim": 5120,
+  "eps": 1e-06,
+  "ffn_dim": 13824,
+  "freq_dim": 256,
+  "in_dim": 16,
+  "model_type": "t2v",
+  "num_heads": 40,
+  "num_layers": 40,
+  "out_dim": 16,
+  "text_len": 512,
+  "lynx": "full"
+}
diff --git a/configs/vace_lynx_14B.json b/configs/vace_lynx_14B.json
@@ -0,0 +1,17 @@
+{
+  "_class_name": "WanModel",
+  "_diffusers_version": "0.30.0",
+  "dim": 5120,
+  "eps": 1e-06,
+  "ffn_dim": 13824,
+  "freq_dim": 256,
+  "in_dim": 16,
+  "model_type": "t2v",
+  "num_heads": 40,
+  "num_layers": 40,
+  "out_dim": 16,
+  "text_len": 512,
+  "vace_layers": [0, 5, 10, 15, 20, 25, 30, 35],
+  "vace_in_dim": 96,  
+  "lynx": "full"
+}
diff --git a/defaults/animate.json b/defaults/animate.json
@@ -0,0 +1,17 @@
+{
+    "model": {
+        "name": "Wan2.2 Animate",
+        "architecture": "animate",
+        "description": "Wan-Animate takes a video and a character image as input, and generates a video in either 'Animation' or 'Replacement' mode. A Sliding Window of at least 81 frames is recommended to obtain the best Style continuity.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_animate_14B_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_animate_14B_quanto_fp16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_animate_14B_quanto_bf16_int8.safetensors"
+        ],
+		"preload_URLs" :
+		[
+			"https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_animate_relighting_lora.safetensors"
+		],
+        "group": "wan2_2"
+    }
+}
diff --git a/defaults/flux_dev_kontext.json b/defaults/flux_dev_kontext.json
@@ -7,8 +7,6 @@
             "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1_kontext_dev_bf16.safetensors",
             "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1_kontext_dev_quanto_bf16_int8.safetensors"
         ],
-		"image_outputs": true,		
-		"reference_image": true,		
 		"flux-model": "flux-dev-kontext"		
     },
 	"prompt": "add a hat",

diff --git a/defaults/flux_dev_umo.json b/defaults/flux_dev_umo.json
@@ -0,0 +1,24 @@
+{
+    "model": {
+        "name": "Flux 1 Dev UMO 12B",
+        "architecture": "flux",
+        "description": "FLUX.1 Dev UMO is a model that can Edit Images with a specialization in combining multiple image references (resized internally at 512x512 max) to produce an Image output. Best Image preservation at 768x768 Resolution Output.",
+        "URLs": "flux",
+		"flux-model": "flux-dev-umo",		
+		"loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-UMO_dit_lora_bf16.safetensors"],
+		"resolutions":  [ ["1024x1024 (1:1)", "1024x1024"],
+						["768x1024 (3:4)", "768x1024"],
+						["1024x768 (4:3)", "1024x768"],
+						["512x1024 (1:2)", "512x1024"],
+						["1024x512 (2:1)", "1024x512"],
+						["768x768 (1:1)", "768x768"],
+						["768x512 (3:2)", "768x512"],
+						["512x768 (2:3)", "512x768"]]
+    },	
+	"prompt": "the man is wearing a hat",
+	"embedded_guidance_scale": 4,
+    "resolution": "768x768",
+    "batch_size": 1
+}
+
+
diff --git a/defaults/flux_dev_uso.json b/defaults/flux_dev_uso.json
@@ -2,15 +2,13 @@
     "model": {
         "name": "Flux 1 Dev USO 12B",
         "architecture": "flux",
-        "description": "FLUX.1 Dev USO is a model specialized to Edit Images with a specialization in Style Transfers (up to two).",
+        "description": "FLUX.1 Dev USO is a model that can Edit Images with a specialization in Style Transfers (up to two).",
 		"modules": [ ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_projector_bf16.safetensors"]],
         "URLs": "flux",
 		"loras": ["https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-dev-USO_dit_lora_bf16.safetensors"],
-		"image_outputs": true,		
-		"reference_image": true,		
 		"flux-model": "flux-dev-uso"		
     },
-	"prompt": "add a hat",
+	"prompt": "the man is wearing a hat",
 	"embedded_guidance_scale": 4,
     "resolution": "1024x1024",
     "batch_size": 1

diff --git a/defaults/flux_srpo.json b/defaults/flux_srpo.json
@@ -0,0 +1,15 @@
+{
+    "model": {
+        "name": "Flux 1 SRPO Dev 12B",
+        "architecture": "flux",
+        "description": "By fine-tuning the FLUX.1.dev model with optimized denoising and online reward adjustment, SRPO improves its human-evaluated realism and aesthetic quality by over 3x.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_bf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Flux/resolve/main/flux1-srpo-dev_quanto_bf16_int8.safetensors"
+        ],
+        "flux-model": "flux-dev"
+    },
+    "prompt": "draw a hat",
+    "resolution": "1024x1024",
+    "batch_size": 1
+}
diff --git a/defaults/flux_srpo_uso.json b/defaults/flux_srpo_uso.json
@@ -0,0 +1,17 @@
+{
+    "model": {
+        "name": "Flux 1 SRPO USO 12B",
+        "architecture": "flux",
+        "description": "FLUX.1 SRPO USO is a model that can Edit Images with a specialization in Style Transfers (up to two). It leverages the improved Image quality brought by the SRPO process.",
+		"modules": [ "flux_dev_uso"],
+        "URLs": "flux_srpo",
+		"loras": "flux_dev_uso",
+		"flux-model": "flux-dev-uso"		
+    },
+	"prompt": "the man is wearing a hat",
+	"embedded_guidance_scale": 4,
+    "resolution": "1024x1024",
+    "batch_size": 1
+}
+
+
diff --git a/defaults/lucy_edit.json b/defaults/lucy_edit.json
@@ -0,0 +1,19 @@
+{
+    "model": {
+        "name": "Wan2.2 Lucy Edit 5B",
+        "architecture": "lucy_edit",
+        "description": "Lucy Edit is a video editing model that performs instruction-guided edits on videos using free-text prompts. It supports a variety of edits, such as clothing & accessory changes, character changes, object insertions, and scene replacements while preserving the motion and composition perfectly.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_lucy_edit_mbf16.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_lucy_edit_quanto_mbf16_int8.safetensors",
+            "https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/wan2.2_lucy_edit_quanto_mfp16_int8.safetensors"
+        ],
+        "group": "wan2_2"
+    },
+	"prompt": "change the clothes to red",	
+    "video_length": 81,
+    "guidance_scale": 5,
+    "flow_shift": 5,
+    "num_inference_steps": 30,
+    "resolution": "1280x720"
+}
diff --git a/defaults/lucy_edit_fastwan.json b/defaults/lucy_edit_fastwan.json
@@ -0,0 +1,16 @@
+{
+    "model": {
+        "name": "Wan2.2 FastWan Lucy Edit 5B",
+        "architecture": "lucy_edit",
+        "description": "Lucy Edit is a video editing model that performs instruction-guided edits on videos using free-text prompts. It supports a variety of edits, such as clothing & accessory changes, character changes, object insertions, and scene replacements while preserving the motion and composition perfectly. This is the FastWan version for faster generation.",
+        "URLs": "lucy_edit",
+        "group": "wan2_2",
+		"loras": "ti2v_2_2_fastwan"
+    },
+	"prompt": "change the clothes to red",
+    "video_length": 81,
+    "guidance_scale": 1,
+    "flow_shift": 3,
+    "num_inference_steps": 5,
+    "resolution": "1280x720"
+}
diff --git a/defaults/lynx.json b/defaults/lynx.json
@@ -0,0 +1,18 @@
+{
+    "model": {
+        "name": "Wan2.1 Lynx 14B",
+        "modules": [
+            [
+                "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_lynx_full_module_14B_bf16.safetensors",
+                "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_lynx_full_module_14B_quanto_bf16_int8.safetensors",
+                "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_lynx_full_module_14B_quanto_fp16_int8.safetensors"
+            ]
+        ],
+        "architecture": "lynx",
+        "description": "The Lynx ControlNet offers State of the Art Identity Preservation. You need to provide a Reference Image which is a close-up of a person's face to transfer this person into the Video.",
+        "URLs": "t2v",
+        "preload_URLs": [
+            "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_lynx_full_arc_resampler.safetensors"
+        ]
+    }
+}
diff --git a/defaults/qwen_image_edit_20B.json b/defaults/qwen_image_edit_20B.json
@@ -7,11 +7,10 @@
             "https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_20B_bf16.safetensors",
             "https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_20B_quanto_bf16_int8.safetensors"
         ],
+		"preload_URLs": ["https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_inpainting.safetensors"],
         "attention": {
             "<89": "sdpa"
-        },
-        "reference_image": true,
-        "image_outputs": true
+        }
     },
     "prompt": "add a hat",
     "resolution": "1280x720",

diff --git a/defaults/qwen_image_edit_plus_20B.json b/defaults/qwen_image_edit_plus_20B.json
@@ -0,0 +1,17 @@
+{
+    "model": {
+        "name": "Qwen Image Edit Plus 20B",
+        "architecture": "qwen_image_edit_plus_20B",
+        "description": "Qwen Image Edit Plus is a generative model that can generate very high quality images with long texts in them. Best results will be at 720p. This model is optimized to combine multiple Subjects & Objects.",
+        "URLs": [
+            "https://huggingface.co/DeepBeepMeep/Qwen_image/resolve/main/qwen_image_edit_plus_20B_quanto_bf16_int8.safetensors"
+        ],
+        "preload_URLs": "qwen_image_edit_20B",
+        "attention": {
+            "<89": "sdpa"
+        }
+    },
+    "prompt": "add a hat",
+    "resolution": "1024x1024",
+    "batch_size": 1
+}
diff --git a/defaults/standin.json b/defaults/standin.json
@@ -4,7 +4,7 @@
 		"name": "Wan2.1 Standin 14B",
 		"modules": [ ["https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Stand-In_wan2.1_T2V_14B_ver1.0_bf16.safetensors"]],
 		"architecture" : "standin",
-		"description": "The original Wan Text 2 Video model combined with the StandIn module to improve Identity Preservation. You need to provide a Reference Image with white background which is a close up of person face to transfer this person in the Video.",
+		"description": "The original Wan Text 2 Video model combined with the StandIn module to improve Identity Preservation. You need to provide a Reference Image with white background which is a close-up of a person's face to transfer this person into the Video.",
 		"URLs":  "t2v"
 	}
 }
diff --git a/defaults/ti2v_2_2_fastwan.json b/defaults/ti2v_2_2_fastwan.json
@@ -7,6 +7,7 @@
 		"loras": ["https://huggingface.co/DeepBeepMeep/Wan2.2/resolve/main/loras_accelerators/Wan2_2_5B_FastWanFullAttn_lora_rank_128_bf16.safetensors"],
         "group": "wan2_2"
     },
+	"prompt" : "Put the person into a clown outfit.", 
     "video_length": 121,
     "guidance_scale": 1,
     "flow_shift": 3,