MrForExample
diff --git a/‎Checkpoints/CRM_T2I_V3/Put CRM_T2I_V3 model here.txt b/‎Checkpoints/CRM_T2I_V3/Put CRM_T2I_V3 model here.txt
diff --git a/‎Configs/CRM_T2I_V3_configs/sd_v2_base_ipmv_chin8_zero_snr.yaml
Lines changed: 62 additions & 0 deletions b/‎Configs/CRM_T2I_V3_configs/sd_v2_base_ipmv_chin8_zero_snr.yaml
Lines changed: 62 additions & 0 deletions
diff --git a/‎Configs/CRM_T2I_V3_configs/sd_v2_base_ipmv_zero_SNR_Hyper.yaml
Lines changed: 74 additions & 0 deletions b/‎Configs/CRM_T2I_V3_configs/sd_v2_base_ipmv_zero_SNR_Hyper.yaml
Lines changed: 74 additions & 0 deletions
diff --git a/‎Gen_3D_Modules/CRM_T2I_V2/model/crm/sampler.py
Lines changed: 1 addition & 1 deletion b/‎Gen_3D_Modules/CRM_T2I_V2/model/crm/sampler.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎Gen_3D_Modules/CRM_T2I_V3/imagedream/__init__.py
Lines changed: 1 addition & 0 deletions b/‎Gen_3D_Modules/CRM_T2I_V3/imagedream/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎Gen_3D_Modules/CRM_T2I_V3/imagedream/camera_utils.py
Lines changed: 99 additions & 0 deletions b/‎Gen_3D_Modules/CRM_T2I_V3/imagedream/camera_utils.py
Lines changed: 99 additions & 0 deletions
diff --git a/‎Gen_3D_Modules/CRM_T2I_V3/imagedream/configs/sd_v2_base_ipmv.yaml
Lines changed: 61 additions & 0 deletions b/‎Gen_3D_Modules/CRM_T2I_V3/imagedream/configs/sd_v2_base_ipmv.yaml
Lines changed: 61 additions & 0 deletions
diff --git a/‎Gen_3D_Modules/CRM_T2I_V3/imagedream/configs/sd_v2_base_ipmv_ch8.yaml
Lines changed: 61 additions & 0 deletions b/‎Gen_3D_Modules/CRM_T2I_V3/imagedream/configs/sd_v2_base_ipmv_ch8.yaml
Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,62 @@
+model:
+  target: CRM_T2I_V2.imagedream.ldm.interface.LatentDiffusionInterface
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    timesteps: 1000
+    scale_factor: 0.18215
+    parameterization: "eps"
+    zero_snr: true
+
+    unet_config:
+      target: CRM_T2I_V2.imagedream.ldm.modules.diffusionmodules.openaimodel.MultiViewUNetModelStage2
+      params:
+        image_size: 32 # unused
+        in_channels: 8
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        use_checkpoint: False
+        legacy: False
+        camera_dim: 16
+        with_ip: True
+        ip_dim: 16 # ip token length
+        ip_mode: "local_resample"
+
+    vae_config:
+      target: CRM_T2I_V2.imagedream.ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    clip_config:
+      target: CRM_T2I_V2.imagedream.ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
+        ip_mode: "local_resample"
@@ -0,0 +1,74 @@
+model:
+  target: CRM_T2I_V3.imagedream.ldm.interface.LatentDiffusionInterface
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    timesteps: 1000
+    scale_factor: 0.18215
+    parameterization: "eps"
+    zero_snr: true
+
+    unet_config:
+      target: CRM_T2I_V3.imagedream.ldm.modules.diffusionmodules.openaimodel.MultiViewUNetModelHyper
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        use_checkpoint: False
+        legacy: False
+        camera_dim: 16
+        with_ip: True
+        ip_dim: 16 # ip token length
+        ip_mode: "local_resample"
+
+    vae_config:
+      target: CRM_T2I_V3.imagedream.ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    clip_config:
+      target: CRM_T2I_V3.imagedream.ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
+        ip_mode: "local_resample"
+
+sampler:
+    target: CRM_T2I_V3.libs.sample.ImageDreamDiffusion
+    params:
+        mode: pixel
+        num_frames: 7
+        camera_views: [1, 2, 3, 4, 5, 0, 0]
+        ref_position: 6
+        random_background: false
+        offset_noise: true
+        resize_rate: 1.0
+        input_branch_size: 3
@@ -67,7 +67,7 @@ def stage1_sample(
 
         # remove reference views
         mask = torch.arange(1, stage1_images.shape[0]+1, 1) % 7 != 0
-        return stage1_images[mask] # (N, H, W, 3) in [0, 255]
+        return stage1_images[mask] # (N, H, W, 3) in [0, 1]
 
     @classmethod
     def stage2_sample(
 
@@ -0,0 +1 @@
+from .model_zoo import build_model
@@ -0,0 +1,99 @@
+import numpy as np
+import torch
+
+
+def create_camera_to_world_matrix(elevation, azimuth):
+    """Build a 4x4 camera-to-world matrix for a camera on the unit sphere.
+
+    The camera is placed at the point given by ``elevation`` and ``azimuth``
+    (both in degrees) and looks at the origin, using world +Y as the up hint.
+
+    Returns:
+        A (4, 4) NumPy array whose first three columns are the camera's
+        right, up and backward (negated viewing direction) axes and whose
+        last column holds the camera position.
+    """
+    elev_rad = np.radians(elevation)
+    azim_rad = np.radians(azimuth)
+    cos_elev = np.cos(elev_rad)
+
+    # Spherical -> Cartesian: azimuth 0 lies on +Z and grows towards +X,
+    # elevation lifts the point towards +Y.
+    position = np.array(
+        [cos_elev * np.sin(azim_rad), np.sin(elev_rad), cos_elev * np.cos(azim_rad)]
+    )
+    origin = np.array([0, 0, 0])
+    world_up = np.array([0, 1, 0])
+
+    # Orthonormal frame: viewing direction first, then right, then true up.
+    look_dir = origin - position
+    look_dir = look_dir / np.linalg.norm(look_dir)
+    side = np.cross(look_dir, world_up)
+    side = side / np.linalg.norm(side)
+    true_up = np.cross(side, look_dir)
+    true_up = true_up / np.linalg.norm(true_up)
+
+    pose = np.eye(4)
+    pose[:3, 0] = side
+    pose[:3, 1] = true_up
+    pose[:3, 2] = -look_dir
+    pose[:3, 3] = position
+    return pose
+
+
+def convert_opengl_to_blender(camera_matrix):
+    """Left-multiply a camera matrix by the fixed OpenGL-to-Blender axis swap.
+
+    The swap maps (x, y, z) to (x, -z, y). NumPy arrays are multiplied with
+    ``np.dot``; any other input is handled through torch and may carry a
+    leading batch dimension (``ndim == 3``).
+    """
+    # Rows of the transform taking OpenGL space to Blender space.
+    axis_swap = [[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]
+
+    if isinstance(camera_matrix, np.ndarray):
+        return np.dot(np.array(axis_swap), camera_matrix)
+
+    swap = torch.tensor(axis_swap)
+    if camera_matrix.ndim == 3:
+        # Add a batch axis so the swap broadcasts over every matrix.
+        swap = swap.unsqueeze(0)
+    # .to(camera_matrix) matches the input's dtype and device before matmul.
+    return torch.matmul(swap.to(camera_matrix), camera_matrix)
+
+
+def normalize_camera(camera_matrix):
+    """Rescale each camera's translation onto the unit sphere.
+
+    Args:
+        camera_matrix: NumPy array or torch tensor holding one or more 4x4
+            camera matrices in any layout that reshapes to (-1, 4, 4).
+
+    Returns:
+        A new array/tensor of shape (N, 16) in which every translation
+        column has been divided by its norm plus 1e-8 (the epsilon keeps an
+        all-zero translation at zero instead of dividing by zero). The input
+        is left unmodified.
+    """
+    if isinstance(camera_matrix, np.ndarray):
+        # reshape() may return a view, so copy before writing to avoid
+        # silently changing the caller's array.
+        cameras = camera_matrix.reshape(-1, 4, 4).copy()
+        norms = np.linalg.norm(cameras[:, :3, 3], axis=1, keepdims=True)
+    else:
+        # Same aliasing concern for tensors: clone before the in-place write.
+        cameras = camera_matrix.reshape(-1, 4, 4).clone()
+        norms = torch.norm(cameras[:, :3, 3], dim=1, keepdim=True)
+    cameras[:, :3, 3] = cameras[:, :3, 3] / (norms + 1e-8)
+    return cameras.reshape(-1, 16)
+
+
+def get_camera(
+    num_frames,
+    elevation=15,
+    azimuth_start=0,
+    azimuth_span=360,
+    blender_coord=True,
+    extra_view=False,
+):
+    """Return flattened camera matrices evenly spaced in azimuth.
+
+    Args:
+        num_frames: Number of azimuth steps the span is divided into.
+        elevation: Elevation in degrees shared by every camera.
+        azimuth_start: Azimuth in degrees of the first camera.
+        azimuth_span: Total azimuth range in degrees; the end point
+            ``azimuth_start + azimuth_span`` itself is not sampled.
+        blender_coord: If true, pass each matrix through
+            ``convert_opengl_to_blender``.
+        extra_view: If true, append one all-zero camera row at the end.
+
+    Returns:
+        A float32 torch tensor with one flattened camera matrix per row.
+    """
+    angle_gap = azimuth_span / num_frames
+    # Derive each azimuth from its index instead of np.arange with a float
+    # step: arange's length can be off by one when the step is not exactly
+    # representable, which would yield the wrong number of cameras.
+    azimuths = azimuth_start + angle_gap * np.arange(num_frames)
+    cameras = []
+    for azimuth in azimuths:
+        camera_matrix = create_camera_to_world_matrix(elevation, azimuth)
+        if blender_coord:
+            camera_matrix = convert_opengl_to_blender(camera_matrix)
+        cameras.append(camera_matrix.flatten())
+
+    if extra_view:
+        # Placeholder camera with the same length as a real flattened one.
+        cameras.append(np.zeros(len(cameras[0])))
+    return torch.tensor(np.stack(cameras, 0)).float()
+
+
+def get_camera_for_index(data_index):
+    """Return the single camera associated with a dataset view index.
+
+    In the current data format, view 000 is the one facing the viewer:
+        000: front,  elevation 0,   azimuth 0
+        001: left,   elevation 0,   azimuth -90
+        002: bottom, elevation -90, azimuth 0
+        003: back,   elevation 0,   azimuth 180
+        004: right,  elevation 0,   azimuth 90
+        005: top,    elevation 90,  azimuth 0
+    """
+    # (elevation, azimuth) in degrees, ordered by view index.
+    view_angles = (
+        (0, 0),
+        (0, -90),
+        (-90, 0),
+        (0, 180),
+        (0, 90),
+        (90, 0),
+    )
+    return get_camera(1, *view_angles[data_index])
@@ -0,0 +1,61 @@
+model:
+  target: imagedream.ldm.interface.LatentDiffusionInterface
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    timesteps: 1000
+    scale_factor: 0.18215
+    parameterization: "eps"
+
+    unet_config:
+      target: imagedream.ldm.modules.diffusionmodules.openaimodel.MultiViewUNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        use_checkpoint: False
+        legacy: False
+        camera_dim: 16
+        with_ip: True
+        ip_dim: 16 # ip token length
+        ip_mode: "local_resample"
+
+    vae_config:
+      target: imagedream.ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    clip_config:
+      target: imagedream.ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
+        ip_mode: "local_resample"
@@ -0,0 +1,61 @@
+model:
+  target: imagedream.ldm.interface.LatentDiffusionInterface
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    timesteps: 1000
+    scale_factor: 0.18215
+    parameterization: "eps"
+
+    unet_config:
+      target: imagedream.ldm.modules.diffusionmodules.openaimodel.MultiViewUNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 8
+        out_channels: 8
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        use_checkpoint: False
+        legacy: False
+        camera_dim: 16
+        with_ip: True
+        ip_dim: 16 # ip token length
+        ip_mode: "local_resample"
+
+    vae_config:
+      target: imagedream.ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    clip_config:
+      target: imagedream.ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
+        ip_mode: "local_resample"