Skip to content
Open
65 changes: 44 additions & 21 deletions data_juicer/config/config_all.yaml

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,9 @@
from .video_ffmpeg_wrapped_mapper import VideoFFmpegWrappedMapper
from .video_hand_reconstruction_hawor_mapper import VideoHandReconstructionHaworMapper
from .video_hand_reconstruction_mapper import VideoHandReconstructionMapper
from .video_normal_map_mapper import VideoNormalMapMapper
from .video_object_segmenting_mapper import VideoObjectSegmentingMapper
from .video_optical_flow_mapper import VideoOpticalFlowMapper
from .video_remove_watermark_mapper import VideoRemoveWatermarkMapper
from .video_resize_aspect_ratio_mapper import VideoResizeAspectRatioMapper
from .video_resize_resolution_mapper import VideoResizeResolutionMapper
Expand All @@ -113,6 +115,7 @@
from .video_tagging_from_audio_mapper import VideoTaggingFromAudioMapper
from .video_tagging_from_frames_mapper import VideoTaggingFromFramesMapper
from .video_undistort_mapper import VideoUndistortMapper
from .video_universal_segmentation_mapper import VideoUniversalSegmentationMapper
from .video_whole_body_pose_estimation_mapper import VideoWholeBodyPoseEstimationMapper
from .whitespace_normalization_mapper import WhitespaceNormalizationMapper

Expand Down Expand Up @@ -211,7 +214,9 @@
"VideoHandReconstructionHaworMapper",
"VideoHandReconstructionMapper",
"VideoFaceBlurMapper",
"VideoNormalMapMapper",
"VideoObjectSegmentingMapper",
"VideoOpticalFlowMapper",
"VideoRemoveWatermarkMapper",
"VideoResizeAspectRatioMapper",
"VideoResizeResolutionMapper",
Expand All @@ -221,6 +226,7 @@
"VideoTaggingFromAudioMapper",
"VideoTaggingFromFramesMapper",
"VideoUndistortMapper",
"VideoUniversalSegmentationMapper",
"VideoWholeBodyPoseEstimationMapper",
"WhitespaceNormalizationMapper",
]
190 changes: 190 additions & 0 deletions data_juicer/ops/mapper/video_normal_map_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
import os

import cv2
import numpy as np
from pydantic import PositiveInt

import data_juicer
from data_juicer.ops.load import load_ops
from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
from data_juicer.utils.constant import Fields, MetaKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import SpecialTokens
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_VIDEOS

# Name under which this operator is registered in the OPERATORS and
# LOADED_VIDEOS registries (see the class decorators below).
OP_NAME = "video_normal_map_mapper"


@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoNormalMapMapper(Mapper):
    """Generate normal maps for videos (with the Metric3D model).

    Frames are taken from ``sample[MetaKeys.video_frames]`` when that field is
    present; otherwise they are extracted from the first video of the sample
    with a fused ``video_extract_frames_mapper``. Every frame is run through
    the ONNX session, and the per-frame normal map, its 8-bit visualization
    and the depth map are stored under
    ``sample[Fields.meta][MetaKeys.video_normal_map_tags]``.
    """

    _accelerator = "cuda"

    def __init__(
        self,
        model_path: str = "onnx-community/metric3d-vit-large/onnx/model.onnx",
        if_save_visualization: bool = True,
        save_visualization_dir: str = DATA_JUICER_ASSETS_CACHE,
        frame_num: PositiveInt = 3,
        duration: float = 0,
        frame_dir: str = DATA_JUICER_ASSETS_CACHE,
        *args,
        **kwargs,
    ):
        """
        Initialization method.

        :param model_path: The path to the Metric3D model.
        :param if_save_visualization: Whether to save visualization results.
        :param save_visualization_dir: The path for saving visualization results.
        :param frame_num: The number of frames to be extracted uniformly from
            the video. If it's 1, only the middle frame will be extracted. If
            it's 2, only the first and the last frames will be extracted. If
            it's larger than 2, in addition to the first and the last frames,
            other frames will be extracted uniformly within the video duration.
            If "duration" > 0, frame_num is the number of frames per segment.
        :param duration: The duration of each segment in seconds.
            If 0, frames are extracted from the entire video.
            If duration > 0, the video is segmented into multiple segments
            based on duration, and frames are extracted from each segment.
        :param frame_dir: Output directory to save extracted frames.
        :param args: extra positional args forwarded to ``Mapper``.
        :param kwargs: extra keyword args forwarded to ``Mapper``.
        """

        super().__init__(*args, **kwargs)
        LazyLoader.check_packages(["onnxruntime"])

        self.model_key = prepare_model(model_type="normal_map_metric3d", model_path=model_path)
        self.if_save_visualization = if_save_visualization
        self.save_visualization_dir = save_visualization_dir
        self.frame_field = MetaKeys.video_frames
        self.tag_field_name = MetaKeys.video_normal_map_tags
        self.frame_num = frame_num
        self.duration = duration
        self.frame_dir = frame_dir
        # (height, width) every frame is letterboxed to before inference.
        self.input_size = (616, 1064)

        # Frame extraction is delegated to an existing operator, used only
        # when the sample does not already carry extracted frames.
        self.video_extract_frames_mapper_args = {
            "frame_sampling_method": "uniform",
            "frame_num": frame_num,
            "duration": duration,
            "frame_dir": frame_dir,
            "frame_key": MetaKeys.video_frames,
        }
        self.fused_ops = load_ops([{"video_extract_frames_mapper": self.video_extract_frames_mapper_args}])

    @staticmethod
    def _read_rgb_frame(frame_path):
        """Read an image file and return it as an RGB ``H x W x 3`` array.

        :param frame_path: path of the frame image on disk.
        :raises ValueError: if OpenCV cannot read the file (``cv2.imread``
            signals failure by returning ``None`` instead of raising).
        """
        bgr_image = cv2.imread(frame_path)
        if bgr_image is None:
            raise ValueError(f"Failed to read frame image: {frame_path}")
        # OpenCV decodes to BGR; the model input is built from RGB.
        return cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

    def prepare_input(self, rgb_image):
        """Letterbox an RGB frame to ``self.input_size`` and build the ONNX feed.

        The frame is resized with its aspect ratio preserved so that it fits
        inside ``self.input_size`` and is then padded symmetrically with a
        constant color up to exactly that size.

        :param rgb_image: ``H x W x 3`` RGB image array.
        :return: ``(onnx_input, pad_info)`` where ``onnx_input`` is a dict
            with a float32 ``pixel_values`` array of shape ``1 x 3 x H x W``
            and ``pad_info`` is ``[top, bottom, left, right]`` padding in
            pixels, needed to crop the model outputs back.
        """

        target_h, target_w = self.input_size
        h, w = rgb_image.shape[:2]
        scale = min(target_h / h, target_w / w)
        rgb = cv2.resize(rgb_image, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR)

        # Constant border color.
        # NOTE(review): looks like a per-channel RGB mean - confirm.
        padding = [123.675, 116.28, 103.53]
        h, w = rgb.shape[:2]
        pad_h = target_h - h
        pad_w = target_w - w
        pad_h_half = pad_h // 2
        pad_w_half = pad_w // 2
        pad_info = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half]
        rgb: np.ndarray = cv2.copyMakeBorder(
            rgb,
            pad_info[0],
            pad_info[1],
            pad_info[2],
            pad_info[3],
            cv2.BORDER_CONSTANT,
            value=padding,
        )

        onnx_input = {
            "pixel_values": np.ascontiguousarray(np.transpose(rgb, (2, 0, 1))[None], dtype=np.float32),  # 1, 3, H, W
        }
        return onnx_input, pad_info

    def process_single(self, sample=None, rank=None):
        """Compute normal/depth maps for the frames of one sample.

        :param sample: the sample to process; results are written to
            ``sample[Fields.meta][self.tag_field_name]`` as a dict with the
            keys ``pred_norm``, ``pred_norm_rgb`` and ``pred_depth`` (one
            entry per frame each).
        :param rank: worker rank forwarded to ``get_model``.
        :return: the (possibly updated) sample.
        :raises ValueError: if a frame image cannot be read.
        """

        # check if it's generated already
        if self.tag_field_name in sample[Fields.meta]:
            return sample

        empty_tags = {"pred_norm": [], "pred_norm_rgb": [], "pred_depth": []}

        # there is no video in this sample
        has_video = self.video_key in sample and bool(sample[self.video_key])
        if not has_video and self.frame_field not in sample:
            sample[Fields.meta][self.tag_field_name] = empty_tags
            return sample

        if self.frame_field in sample:
            frames_path = sample[self.frame_field]
        else:
            # load videos
            ds_list = [{"text": SpecialTokens.video, "videos": sample[self.video_key]}]

            dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
            dataset = self.fused_ops[0].run(dataset)

            # Only the first video of the sample is considered; its frames
            # are expected in a sub-directory named after the video file.
            first_video_stem = os.path.splitext(os.path.basename(sample[self.video_key][0]))[0]
            frames_root = os.path.join(self.frame_dir, first_video_stem)
            frames_path = sorted(os.path.join(frames_root, frame_name) for frame_name in os.listdir(frames_root))

        # An empty frame list used to crash on `frames_path[0]`; there is
        # nothing to predict, so record empty results instead.
        if not frames_path:
            sample[Fields.meta][self.tag_field_name] = empty_tags
            return sample

        # The visualization sub-directory is named after the directory that
        # holds the frames (portable replacement for `split("/")[-2]`).
        video_name = os.path.basename(os.path.dirname(frames_path[0]))

        ort_session = get_model(model_key=self.model_key, rank=rank, use_cuda=self.use_cuda())

        if self.if_save_visualization:
            os.makedirs(os.path.join(self.save_visualization_dir, video_name), exist_ok=True)

        final_pred_norm = []
        final_pred_norm_rgb = []
        final_pred_depth = []

        in_h, in_w = self.input_size
        for frame_idx, frame_path in enumerate(frames_path):
            rgb_image = self._read_rgb_frame(frame_path)
            original_shape = rgb_image.shape[:2]
            onnx_input, pad_info = self.prepare_input(rgb_image)
            outputs = ort_session.run(None, onnx_input)

            # Region of the model output that corresponds to the un-padded frame.
            rows = slice(pad_info[0], in_h - pad_info[1])
            cols = slice(pad_info[2], in_w - pad_info[3])

            # normal map: crop the padding away and resize back to the frame size
            normal = outputs[1].squeeze()
            normal = normal[:, rows, cols]
            normal = normal.transpose(1, 2, 0)
            normal = cv2.resize(normal, (original_shape[1], original_shape[0]), interpolation=cv2.INTER_LINEAR)

            # Map normal components from [-1, 1] to [0, 255] for visualization.
            normal_vis = (normal + 1.0) / 2.0
            normal_vis = (normal_vis * 255).clip(0, 255).astype(np.uint8)
            # Channel order is reversed for cv2.imwrite.
            # NOTE(review): the reversed array is also what gets stored under
            # "pred_norm_rgb" - confirm that this is the intended order.
            normal_vis = normal_vis[..., ::-1]

            final_pred_norm.append(normal)
            final_pred_norm_rgb.append(normal_vis)

            if self.if_save_visualization:
                cv2.imwrite(
                    os.path.join(self.save_visualization_dir, video_name, f"vis_{frame_idx}.jpg"),
                    normal_vis,
                )

            # depth: same crop + resize as the normal map
            depth = outputs[0].squeeze()  # [H, W]
            depth = depth[rows, cols]
            depth = cv2.resize(depth, (original_shape[1], original_shape[0]), interpolation=cv2.INTER_LINEAR)

            final_pred_depth.append(depth)

        sample[Fields.meta][self.tag_field_name] = {
            "pred_norm": final_pred_norm,
            "pred_norm_rgb": final_pred_norm_rgb,
            "pred_depth": final_pred_depth,
        }

        return sample
151 changes: 151 additions & 0 deletions data_juicer/ops/mapper/video_optical_flow_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import os

import cv2
import numpy as np
from pydantic import PositiveInt

import data_juicer
from data_juicer.ops.load import load_ops
from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
from data_juicer.utils.constant import Fields, MetaKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import SpecialTokens
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_VIDEOS

# Name under which this operator is registered in the OPERATORS and
# LOADED_VIDEOS registries (see the class decorators below).
OP_NAME = "video_optical_flow_mapper"

# Heavy dependencies are wrapped in LazyLoader rather than imported directly
# (presumably so the import is deferred until first use - verify against
# data_juicer.utils.lazy_loader).
torch = LazyLoader("torch")
torchvision = LazyLoader("torchvision")


@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoOpticalFlowMapper(Mapper):
    """Generate optical flow information for videos.

    Frames are taken from ``sample[MetaKeys.video_frames]`` when that field is
    present; otherwise they are extracted from the first video of the sample
    with a fused ``video_extract_frames_mapper``. The flow between every pair
    of consecutive frames is predicted in one batch and stored under
    ``sample[Fields.meta][MetaKeys.video_optical_flow_tags]["pred_flow"]``.
    """

    _accelerator = "cuda"

    def __init__(
        self,
        if_save_visualization: bool = True,
        save_visualization_dir: str = DATA_JUICER_ASSETS_CACHE,
        frame_num: PositiveInt = 3,
        duration: float = 0,
        frame_dir: str = DATA_JUICER_ASSETS_CACHE,
        *args,
        **kwargs,
    ):
        """
        Initialization method.

        :param if_save_visualization: Whether to save visualization results.
        :param save_visualization_dir: The path for saving visualization results.
        :param frame_num: The number of frames to be extracted uniformly from
            the video. If it's 1, only the middle frame will be extracted. If
            it's 2, only the first and the last frames will be extracted. If
            it's larger than 2, in addition to the first and the last frames,
            other frames will be extracted uniformly within the video duration.
            If "duration" > 0, frame_num is the number of frames per segment.
        :param duration: The duration of each segment in seconds.
            If 0, frames are extracted from the entire video.
            If duration > 0, the video is segmented into multiple segments
            based on duration, and frames are extracted from each segment.
        :param frame_dir: Output directory to save extracted frames.
        :param args: extra positional args forwarded to ``Mapper``.
        :param kwargs: extra keyword args forwarded to ``Mapper``.
        """

        super().__init__(*args, **kwargs)
        LazyLoader.check_packages(["torchvision"])

        self.model_key = prepare_model(model_type="optical_flow_raft")
        self.if_save_visualization = if_save_visualization
        self.save_visualization_dir = save_visualization_dir
        self.frame_field = MetaKeys.video_frames
        self.tag_field_name = MetaKeys.video_optical_flow_tags
        self.frame_num = frame_num
        self.duration = duration
        self.frame_dir = frame_dir

        # Frame extraction is delegated to an existing operator, used only
        # when the sample does not already carry extracted frames.
        self.video_extract_frames_mapper_args = {
            "frame_sampling_method": "uniform",
            "frame_num": frame_num,
            "duration": duration,
            "frame_dir": frame_dir,
            "frame_key": MetaKeys.video_frames,
        }
        self.fused_ops = load_ops([{"video_extract_frames_mapper": self.video_extract_frames_mapper_args}])

    @staticmethod
    def _read_rgb_frame(frame_path):
        """Read an image file and return it as an RGB ``H x W x 3`` array.

        :param frame_path: path of the frame image on disk.
        :raises ValueError: if OpenCV cannot read the file (``cv2.imread``
            signals failure by returning ``None`` instead of raising).
        """
        bgr_image = cv2.imread(frame_path)
        if bgr_image is None:
            raise ValueError(f"Failed to read frame image: {frame_path}")
        # OpenCV decodes to BGR, while the torchvision RAFT pipeline works on RGB.
        return cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

    def raft_preprocess(self, img1_batch, img2_batch, transforms):
        """Resize both frame batches to 520 x 960 and apply the model transforms.

        :param img1_batch: batch of "first" frames, ``N x 3 x H x W``.
        :param img2_batch: batch of "second" frames, same shape.
        :param transforms: callable taking the two batches and returning the
            transformed pair (obtained together with the model from
            ``get_model``).
        :return: the transformed ``(img1_batch, img2_batch)`` pair.
        """
        resize = torchvision.transforms.functional.resize
        img1_batch = resize(img1_batch, size=[520, 960], antialias=False)
        img2_batch = resize(img2_batch, size=[520, 960], antialias=False)
        return transforms(img1_batch, img2_batch)

    def process_single(self, sample=None, rank=None):
        """Predict the optical flow between consecutive frames of one sample.

        :param sample: the sample to process; the result is written to
            ``sample[Fields.meta][self.tag_field_name]["pred_flow"]``. It is
            an empty list when fewer than two frames are available.
        :param rank: worker rank forwarded to ``get_model``.
        :return: the (possibly updated) sample.
        :raises ValueError: if a frame image cannot be read.
        """

        # check if it's generated already
        if self.tag_field_name in sample[Fields.meta]:
            return sample

        # there is no video in this sample
        has_video = self.video_key in sample and bool(sample[self.video_key])
        if not has_video and self.frame_field not in sample:
            sample[Fields.meta][self.tag_field_name] = {"pred_flow": []}
            return sample

        if self.frame_field in sample:
            frames_path = sample[self.frame_field]
        else:
            # load videos
            ds_list = [{"text": SpecialTokens.video, "videos": sample[self.video_key]}]

            dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
            dataset = self.fused_ops[0].run(dataset)

            # Only the first video of the sample is considered; its frames
            # are expected in a sub-directory named after the video file.
            first_video_stem = os.path.splitext(os.path.basename(sample[self.video_key][0]))[0]
            frames_root = os.path.join(self.frame_dir, first_video_stem)
            frames_path = sorted(os.path.join(frames_root, frame_name) for frame_name in os.listdir(frames_root))

        # Flow needs at least one pair of frames. This check must come before
        # any `frames_path[0]` access, which used to crash on an empty list.
        if not frames_path or len(frames_path) < 2:
            sample[Fields.meta][self.tag_field_name] = {"pred_flow": []}
            return sample

        # The visualization sub-directory is named after the directory that
        # holds the frames (portable replacement for `split("/")[-2]`).
        video_name = os.path.basename(os.path.dirname(frames_path[0]))

        model, transforms = get_model(model_key=self.model_key, rank=rank, use_cuda=self.use_cuda())
        # Put the inputs wherever the model actually lives instead of assuming
        # "cuda"/"cuda:<rank>", which fails when the model was loaded on CPU
        # or on a different device index.
        device = next(model.parameters()).device

        # N x H x W x 3 (RGB) -> N x 3 x H x W; all frames must share one size.
        frames = np.stack([self._read_rgb_frame(frame_path) for frame_path in frames_path], axis=0)
        frame_tensor = torch.from_numpy(frames).permute(0, 3, 1, 2)
        # Consecutive pairs: frame i is matched with frame i + 1.
        img1_batch = frame_tensor[:-1].to(device)
        img2_batch = frame_tensor[1:].to(device)

        img1_batch, img2_batch = self.raft_preprocess(img1_batch, img2_batch, transforms)

        with torch.no_grad():
            list_of_flows = model(img1_batch, img2_batch)
        # The model returns a sequence of flow predictions; the last one is kept.
        predicted_flow = list_of_flows[-1]

        if self.if_save_visualization:
            vis_dir = os.path.join(self.save_visualization_dir, video_name)
            os.makedirs(vis_dir, exist_ok=True)

            flow_imgs = torchvision.utils.flow_to_image(predicted_flow).cpu().permute(0, 2, 3, 1).numpy()
            for flow_idx, flow_img in enumerate(flow_imgs):
                # flow_to_image yields RGB; cv2.imwrite expects contiguous BGR.
                cv2.imwrite(
                    os.path.join(vis_dir, f"vis_{flow_idx}.jpg"),
                    np.ascontiguousarray(flow_img[..., ::-1]),
                )

        # NOTE(review): the tensor is stored as returned by the model (on the
        # model's device) - confirm downstream consumers can serialize it.
        sample[Fields.meta][self.tag_field_name] = {"pred_flow": predicted_flow}

        return sample
Loading
Loading