import ldm.conds
from enum import Enum

+import ldm.ops
+import ldm.model_management
+
from ldm.cldm_models import UNetModel
from . import utils

@@ -33,6 +36,247 @@ class ModelSampling(s, c):


class BaseModel(torch.nn.Module):
+    def __init__(
+        self, model_config, model_type=ModelType.EPS, device=None, unet_model=UNetModel
+    ):
+        super().__init__()
+
+        unet_config = model_config.unet_config
+        self.latent_format = model_config.latent_format
+        self.model_config = model_config
+        self.manual_cast_dtype = model_config.manual_cast_dtype
+
+        if not unet_config.get("disable_unet_model_creation", False):
+            if self.manual_cast_dtype is not None:
+                operations = ldm.ops.manual_cast
+            else:
+                operations = ldm.ops.disable_weight_init
+            self.diffusion_model = unet_model(
+                **unet_config, device=device, operations=operations
+            )
+        self.model_type = model_type
+        self.model_sampling = model_sampling(model_config, model_type)
+
+        self.adm_channels = unet_config.get("adm_in_channels", None)
+        if self.adm_channels is None:
+            self.adm_channels = 0
+        self.inpaint_model = False
+        print("model_type", model_type.name)
+        print("adm", self.adm_channels)
+
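+    # apply_model() wraps the raw UNet forward pass: the noisy latent is
+    # pre-scaled via model_sampling.calculate_input(), inputs are cast to the
+    # working dtype, and the UNet output is converted back into a denoised
+    # latent by model_sampling.calculate_denoised().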
+    def apply_model(
+        self,
+        x,
+        t,
+        c_concat=None,
+        c_crossattn=None,
+        control=None,
+        transformer_options={},
+        **kwargs
+    ):
+        sigma = t
+        xc = self.model_sampling.calculate_input(sigma, x)
+        if c_concat is not None:
+            xc = torch.cat([xc] + [c_concat], dim=1)
+
+        context = c_crossattn
+        dtype = self.get_dtype()
+
+        if self.manual_cast_dtype is not None:
+            dtype = self.manual_cast_dtype
+
+        xc = xc.to(dtype)
+        t = self.model_sampling.timestep(t).float()
+        context = context.to(dtype)
+        extra_conds = {}
+        for o in kwargs:
+            extra = kwargs[o]
+            if hasattr(extra, "dtype"):
+                if extra.dtype != torch.int and extra.dtype != torch.long:
+                    extra = extra.to(dtype)
+            extra_conds[o] = extra
+
+        model_output = self.diffusion_model(
+            xc,
+            t,
+            context=context,
+            control=control,
+            transformer_options=transformer_options,
+            **extra_conds
+        ).float()
+        return self.model_sampling.calculate_denoised(sigma, model_output, x)
+
+    def get_dtype(self):
+        return self.diffusion_model.dtype
+
+    def is_adm(self):
+        return self.adm_channels > 0
+
+    def encode_adm(self, **kwargs):
+        return None
+
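+    # extra_conds() assembles the conditioning dict used during sampling.
+    # Inpaint models additionally get a "c_concat" entry: the denoise mask
+    # plus the masked latent image, concatenated onto the model input.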
+    def extra_conds(self, **kwargs):
+        out = {}
+        if self.inpaint_model:
+            concat_keys = ("mask", "masked_image")
+            cond_concat = []
+            denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
+            concat_latent_image = kwargs.get("concat_latent_image", None)
+            if concat_latent_image is None:
+                concat_latent_image = kwargs.get("latent_image", None)
+            else:
+                concat_latent_image = self.process_latent_in(concat_latent_image)
+
+            noise = kwargs.get("noise", None)
+            device = kwargs["device"]
+
+            if concat_latent_image.shape[1:] != noise.shape[1:]:
+                concat_latent_image = utils.common_upscale(
+                    concat_latent_image,
+                    noise.shape[-1],
+                    noise.shape[-2],
+                    "bilinear",
+                    "center",
+                )
+
+            concat_latent_image = utils.resize_to_batch_size(
+                concat_latent_image, noise.shape[0]
+            )
+
+            if len(denoise_mask.shape) == len(noise.shape):
+                denoise_mask = denoise_mask[:, :1]
+
+            denoise_mask = denoise_mask.reshape(
+                (-1, 1, denoise_mask.shape[-2], denoise_mask.shape[-1])
+            )
+            if denoise_mask.shape[-2:] != noise.shape[-2:]:
+                denoise_mask = utils.common_upscale(
+                    denoise_mask, noise.shape[-1], noise.shape[-2], "bilinear", "center"
+                )
+            denoise_mask = utils.resize_to_batch_size(
+                denoise_mask.round(), noise.shape[0]
+            )
+
+            def blank_inpaint_image_like(latent_image):
+                blank_image = torch.ones_like(latent_image)
+                # these are the values for "zero" in pixel space translated to latent space
+                blank_image[:, 0] *= 0.8223
+                blank_image[:, 1] *= -0.6876
+                blank_image[:, 2] *= 0.6364
+                blank_image[:, 3] *= 0.1380
+                return blank_image
+
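+            # With no denoise mask available, fall back to an all-ones mask
+            # and a "blank" latent so the concat channels stay well-formed.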
+            for ck in concat_keys:
+                if denoise_mask is not None:
+                    if ck == "mask":
+                        cond_concat.append(denoise_mask.to(device))
+                    elif ck == "masked_image":
+                        cond_concat.append(
+                            concat_latent_image.to(device)
+                        )  # NOTE: the latent_image should be masked by the mask in pixel space
+                else:
+                    if ck == "mask":
+                        cond_concat.append(torch.ones_like(noise)[:, :1])
+                    elif ck == "masked_image":
+                        cond_concat.append(blank_inpaint_image_like(noise))
+            data = torch.cat(cond_concat, dim=1)
+            out["c_concat"] = ldm.conds.CONDNoiseShape(data)
+
+        adm = self.encode_adm(**kwargs)
+        if adm is not None:
+            out["y"] = ldm.conds.CONDRegular(adm)
+
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out["c_crossattn"] = ldm.conds.CONDCrossAttn(cross_attn)
+
+        cross_attn_cnet = kwargs.get("cross_attn_controlnet", None)
+        if cross_attn_cnet is not None:
+            out["crossattn_controlnet"] = ldm.conds.CONDCrossAttn(cross_attn_cnet)
+
+        return out
+
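+    # load_model_weights() pops every key starting with unet_prefix out of sd,
+    # strips the prefix, and loads the result into the diffusion model
+    # (non-strict, so missing/unexpected keys are only reported).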
+    def load_model_weights(self, sd, unet_prefix=""):
+        to_load = {}
+        keys = list(sd.keys())
+        for k in keys:
+            if k.startswith(unet_prefix):
+                to_load[k[len(unet_prefix):]] = sd.pop(k)
+
+        to_load = self.model_config.process_unet_state_dict(to_load)
+        m, u = self.diffusion_model.load_state_dict(to_load, strict=False)
+        if len(m) > 0:
+            print("unet missing:", m)
+
+        if len(u) > 0:
+            print("unet unexpected:", u)
+        del to_load
+        return self
+
+    def process_latent_in(self, latent):
+        return self.latent_format.process_in(latent)
+
+    def process_latent_out(self, latent):
+        return self.latent_format.process_out(latent)
+
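+    # state_dict_for_saving() merges the optional CLIP / VAE / CLIP-vision
+    # state dicts with the UNet's own into a single checkpoint-style dict,
+    # applying each component's config-specific key remapping for saving.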
+    def state_dict_for_saving(
+        self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None
+    ):
+        extra_sds = []
+        if clip_state_dict is not None:
+            extra_sds.append(
+                self.model_config.process_clip_state_dict_for_saving(clip_state_dict)
+            )
+        if vae_state_dict is not None:
+            extra_sds.append(
+                self.model_config.process_vae_state_dict_for_saving(vae_state_dict)
+            )
+        if clip_vision_state_dict is not None:
+            extra_sds.append(
+                self.model_config.process_clip_vision_state_dict_for_saving(
+                    clip_vision_state_dict
+                )
+            )
+
+        unet_state_dict = self.diffusion_model.state_dict()
+        unet_state_dict = self.model_config.process_unet_state_dict_for_saving(
+            unet_state_dict
+        )
+
+        if self.get_dtype() == torch.float16:
+            extra_sds = map(
+                lambda sd: utils.convert_sd_to(sd, torch.float16), extra_sds
+            )
+
+        if self.model_type == ModelType.V_PREDICTION:
+            unet_state_dict["v_pred"] = torch.tensor([])
+
+        for sd in extra_sds:
+            unet_state_dict.update(sd)
+
+        return unet_state_dict
+
+    def set_inpaint(self):
+        self.inpaint_model = True
+
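+    # memory_required() is a rough VRAM estimate (in bytes) derived from the
+    # spatial area of the input batch; the xformers / flash-attention path
+    # gets a much smaller estimate since those kernels avoid materializing
+    # the full attention matrix.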
+    def memory_required(self, input_shape):
+        if (
+            ldm.model_management.xformers_enabled()
+            or ldm.model_management.pytorch_attention_flash_attention()
+        ):
+            dtype = self.get_dtype()
+            if self.manual_cast_dtype is not None:
+                dtype = self.manual_cast_dtype
+            # TODO: this needs to be tweaked
+            area = input_shape[0] * input_shape[2] * input_shape[3]
+            return (area * ldm.model_management.dtype_size(dtype) / 50) * (
+                1024 * 1024
+            )
+        else:
+            # TODO: this formula might be too aggressive since I tweaked the sub-quad and split algorithms to use less memory.
+            area = input_shape[0] * input_shape[2] * input_shape[3]
+            return (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)
+
    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        super().__init__()

@@ -162,10 +406,10 @@ def set_inpaint(self):

    def memory_required(self, input_shape):
        if ldm.model_management.xformers_enabled() or ldm.model_management.pytorch_attention_flash_attention():
-            #TODO: this needs to be tweaked
+            # TODO: this needs to be tweaked
            area = input_shape[0] * input_shape[2] * input_shape[3]
            return (area * ldm.model_management.dtype_size(self.get_dtype()) / 50) * (1024 * 1024)
        else:
-            #TODO: this formula might be too aggressive since I tweaked the sub-quad and split algorithms to use less memory.
+            # TODO: this formula might be too aggressive since I tweaked the sub-quad and split algorithms to use less memory.
            area = input_shape[0] * input_shape[2] * input_shape[3]
-            return (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)
+            return (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)