diff --git a/data_juicer/config/config_all.yaml b/data_juicer/config/config_all.yaml
index cfe56a631b9..eaafda6ef94 100644
--- a/data_juicer/config/config_all.yaml
+++ b/data_juicer/config/config_all.yaml
@@ -561,6 +561,16 @@ process:
       if_output_point_maps_from_projection: True              # Determines whether to output point maps directly inferred by VGGT.
       if_output_point_maps_from_unprojection: True            # Determines whether to output point maps constructed from depth maps and camera parameters.
       if_output_point_tracks: True                            # Determines whether to output point tracks.
+  - video_animal_pose_mapper:                           # Detect quadruped animal pose on the video.
+      vitpose_model_path: "apt36k.pth"                        # The path to the ViTPose model.
+      vitpose_config: "configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/apt36k/ViTPose_huge_apt36k_256x192.py"   # Please select the appropriate model configuration.
+      yoloe_model_path: "yoloe-26x-seg.pt"                    # The path to the YOLOE model.
+      animal_class: []                                        # The quadruped animal categories to be detected. If the list is empty, the default category list is used.
+      if_save_visualization: True                             # Whether to save visualization results.
+      save_visualization_dir: null                            # The path for saving visualization results.
+      frame_num: 3                                            # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
+      duration: 0                                             # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
+      frame_dir: null                                         # Output directory to save extracted frames.
   - video_camera_calibration_static_deepcalib_mapper:   # Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib.
       model_path: "weights_10_0.02.h5"                        # The path to the DeepCalib Regression model.
       frame_num: 3                                            # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
@@ -632,6 +642,13 @@ process:
       blur_type: 'gaussian'                                   # type of blur kernel, including ['mean', 'box', 'gaussian']
       radius: 2                                               # radius of blur kernel
       save_dir: null                                          # The directory where generated files will be stored. If not specified, outputs will be saved in the same directory as their corresponding input files. This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.
+  - video_face_keypoints_mapper:                            # Detect face keypoints (98 points) on the video.
+      ldeq_model_path: "final.pth.tar"                        # The path to the LDEQ model.
+      if_save_visualization: True                             # Whether to save visualization results.
+      save_visualization_dir: null                            # The path for saving visualization results.
+      frame_num: 3                                            # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
+      duration: 0                                             # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
+      frame_dir: null                                         # Output directory to save extracted frames.
   - video_ffmpeg_wrapped_mapper:                            # simple wrapper for FFmpeg video filters
       filter_name: null                                       # ffmpeg audio filter name. e.g. 'scale'.
       filter_kwargs: null                                     # keyword-arguments passed to ffmpeg filter. e.g. {'width': 224, 'height': 224}.
diff --git a/data_juicer/ops/common/ldeq_face_keypoints_func.py b/data_juicer/ops/common/ldeq_face_keypoints_func.py
new file mode 100644
index 00000000000..849009023fc
--- /dev/null
+++ b/data_juicer/ops/common/ldeq_face_keypoints_func.py
@@ -0,0 +1,971 @@
+# Adapted from https://github.com/polo5/LDEQ_RwR.git
+
+import random
+
+import numpy as np
+
+from data_juicer.utils.lazy_loader import LazyLoader
+
+torch = LazyLoader("torch")
+nn = LazyLoader("torch.nn")
+torchinfo = LazyLoader("torchinfo")
+
+
+class Normalize(nn.Module):
+    """normalize to [0,1]"""
+
+    def __init__(self, n_channels, mode, beta=1.0, learn_beta=False):
+        super().__init__()
+        self.mode = mode
+        assert mode in ["softargmax", "linear"], f"norm {mode} not recognized"
+
+        if mode == "softargmax":
+            if learn_beta:
+                self.nonlinearity = nn.Softplus()
+                self.beta = (
+                    torch.nn.Parameter(torch.ones(n_channels) * beta).view(1, -1, 1, 1).cuda()
+                )  # one beta per heatmap. TODO: why isn't this put on gpu when we put whole model?
+            else:
+                self.nonlinearity = lambda x: x
+                self.beta = beta
+
+    def forward(self, heatmaps):
+
+        if self.mode == "softargmax":
+            heatmaps = heatmaps - torch.amax(heatmaps, dim=(2, 3), keepdim=True)
+            heatmaps = torch.exp(
+                self.nonlinearity(self.beta) * heatmaps
+            )  # nonlinearity makes sure beta is positive so that exp input still in [-inf,0] so that output is in [0,1]
+
+        elif self.mode == "linear":
+            heatmaps_max, heatmaps_min = torch.amax(heatmaps, dim=(2, 3), keepdim=True), torch.amin(
+                heatmaps, dim=(2, 3), keepdim=True
+            )  # shape (B,n_kpts,1,1)
+            heatmaps = (heatmaps - heatmaps_min) / (heatmaps_max - heatmaps_min + 1e-5)
+
+        return heatmaps
+
+
+class HeatmapsToKeypoints(nn.Module):
+    """converts 2D heatmaps into (x,y) coordinates in range [0,1] that our loss can use"""
+
+    def __init__(self):
+        super().__init__()
+        self.first_run = True
+
+    def forward(self, heatmaps):
+        """heatmap values must all be between 0 and 1. This is achieved with the Normalize class above"""
+        B, n_keypoints, H, W = heatmaps.shape
+        heatmaps = heatmaps / (1e-4 + torch.sum(heatmaps, dim=[2, 3], keepdim=True))  # now heatmap values all sum to 1
+
+        if self.first_run:
+            col_vals = torch.arange(0, W)
+            self.col_grid = (
+                col_vals.repeat(H, 1).view(1, 1, H, W).to(heatmaps.device)
+            )  # each column is a single repeated number
+            row_vals = torch.arange(0, H).view(H, -1)
+            self.row_grid = (
+                row_vals.repeat(1, W).view(1, 1, H, W).float().to(heatmaps.device)
+            )  # each row is a single repeated number
+            self.first_run = False
+
+        weighted_x = heatmaps * self.col_grid
+        x_vals = weighted_x.sum(dim=[2, 3]) / H  # in range [0,1], shape (B,98)
+        weighted_y = heatmaps * self.row_grid
+        y_vals = weighted_y.sum(dim=[2, 3]) / H  # in range [0,1], shape (B,98)
+        out = torch.stack((x_vals, y_vals), dim=2)
+
+        # TODO: not sure this is still correct if using linear normalization for heatmaps:
+        var_x = ((self.col_grid - x_vals.unsqueeze(2).unsqueeze(3)).pow(2) * heatmaps).sum(
+            dim=[2, 3]
+        )  # this is like a variance term and can take on large values (~600 for heatmap size 64)
+        var_y = ((self.row_grid - y_vals.unsqueeze(2).unsqueeze(3)).pow(2) * heatmaps).sum(dim=[2, 3])
+        # NB: if x_vals=mean=5, then (col_grid - x_vals) will be a grid, with 0 at location of the mean and polynomially increasing values around the mean
+        # then heatmaps weighs this grid with the spread of predictions. If heatmaps is non zero only in location of mean, then sigma_x = 0
+
+        stds = torch.sqrt(0.5 * var_x + 0.5 * var_y) / H  # shape (B,98), i.e. one std value per heatmap.
+
+        return out, stds  # out is (B, 98, 2)
+
+
+class HeatmapsToKeypointsNoSum(nn.Module):
+    """converts 2D heatmaps into (x,y) coordinates in range [0,1] that our loss can use,
+    This can be used if input heatmaps are already divided by their sum
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.first_run = True
+
+    def forward(self, heatmaps):
+        """heatmap values must all be between 0 and 1. This is achieved with the Normalize class above"""
+        B, n_keypoints, H, W = heatmaps.shape
+        # heatmaps = heatmaps/(1e-4+torch.sum(heatmaps, dim=[2,3], keepdim=True))
+
+        if self.first_run:
+            col_vals = torch.arange(0, W)
+            self.col_grid = (
+                col_vals.repeat(H, 1).view(1, 1, H, W).to(heatmaps.device)
+            )  # each column is a single repeated number
+            row_vals = torch.arange(0, H).view(H, -1)
+            self.row_grid = (
+                row_vals.repeat(1, W).view(1, 1, H, W).float().to(heatmaps.device)
+            )  # each row is a single repeated number
+            self.first_run = False
+
+        weighted_x = heatmaps * self.col_grid
+        x_vals = weighted_x.sum(dim=[2, 3]) / H  # in range [0,1], shape (B,98)
+        weighted_y = heatmaps * self.row_grid
+        y_vals = weighted_y.sum(dim=[2, 3]) / H  # in range [0,1], shape (B,98)
+        out = torch.stack((x_vals, y_vals), dim=2)
+
+        # TODO: not sure this is still correct if using linear normalization for heatmaps:
+        var_x = ((self.col_grid - x_vals.unsqueeze(2).unsqueeze(3)).pow(2) * heatmaps).sum(
+            dim=[2, 3]
+        )  # this is like a variance term and can take on large values (~600 for heatmap size 64)
+        var_y = ((self.row_grid - y_vals.unsqueeze(2).unsqueeze(3)).pow(2) * heatmaps).sum(dim=[2, 3])
+        # NB: if x_vals=mean=5, then (col_grid - x_vals) will be a grid, with 0 at location of the mean and polynomially increasing values around the mean
+        # then heatmaps weighs this grid with the spread of predictions. If heatmaps is non zero only in location of mean, then sigma_x = 0
+
+        stds = torch.sqrt(0.5 * var_x + 0.5 * var_y) / H  # shape (B,98), i.e. one std value per heatmap.
+
+        return out, stds
+
+
+def _safe_norm(v):
+    if not torch.isfinite(v).all():
+        return np.inf
+    return torch.norm(v)
+
+
+def scalar_search_armijo(phi, phi0, derphi0, c1=1e-4, alpha0=1, amin=0):
+    """
+    Backtracking search for a step size ``alpha`` that satisfies the Armijo
+    (sufficient decrease) condition ``phi(alpha) <= phi0 + c1 * alpha * derphi0``.
+
+    see https://github.com/scipy/scipy/blob/main/scipy/optimize/_linesearch.py
+    Uses the interpolation algorithm (Armijo backtracking) as suggested by
+    Wright and Nocedal in 'Numerical Optimization', 1999, pp. 56-57. alpha > 0 is assumed to be a descent direction.
+
+    phi = callable function phi(alpha)
+    phi0 = value of phi at the original estimate
+    derphi0 = slope value used in the sufficient-decrease test (a number, not a
+        callable; phi'(0) in the scipy original)
+    c1 = constant of the sufficient-decrease test
+    alpha0 = first step size to try
+    amin = the search gives up once the candidate step size is no longer above this value
+
+    In our case phi(alpha) = torch.norm(g(x0 + alpha * update))**2 (see `line_search`).
+
+    Returns a tuple (alpha, phi(alpha), ite) where `ite` counts the cubic
+    interpolation steps performed; alpha is None when no suitable step was found.
+
+    NOTE(review): the cubic step calls torch.sqrt/torch.abs on quantities derived
+    from phi, so phi is presumably expected to return tensors - confirm for the
+    non-finite path of `_safe_norm`, which yields a Python float.
+    """
+    ite = 0
+    phi_a0 = phi(alpha0)  # First do an update with step size 1
+    if phi_a0 <= phi0 + c1 * alpha0 * derphi0:
+        return alpha0, phi_a0, ite
+
+    # Otherwise, compute the minimizer of a quadratic interpolant
+    alpha1 = -(derphi0) * alpha0**2 / 2.0 / (phi_a0 - phi0 - derphi0 * alpha0)
+    phi_a1 = phi(alpha1)
+    # NOTE(review): unlike the scipy original, phi_a1 is not tested against the
+    # Armijo condition before entering the cubic loop.
+
+    # Otherwise loop with cubic interpolation until we find an alpha which
+    # satisfies the first Wolfe condition (since we are backtracking, we will
+    # assume that the value of alpha is not too small and satisfies the second
+    # condition.
+    while alpha1 > amin:  # we are assuming alpha>0 is a descent direction
+        # Coefficients a, b of the cubic interpolant through the two latest trial points.
+        factor = alpha0**2 * alpha1**2 * (alpha1 - alpha0)
+        a = alpha0**2 * (phi_a1 - phi0 - derphi0 * alpha1) - alpha1**2 * (phi_a0 - phi0 - derphi0 * alpha0)
+        a = a / factor
+        b = -(alpha0**3) * (phi_a1 - phi0 - derphi0 * alpha1) + alpha1**3 * (phi_a0 - phi0 - derphi0 * alpha0)
+        b = b / factor
+
+        # Minimizer of the cubic; abs() keeps the square-root argument non-negative.
+        alpha2 = (-b + torch.sqrt(torch.abs(b**2 - 3 * a * derphi0))) / (3.0 * a)
+        phi_a2 = phi(alpha2)
+        ite += 1
+
+        if phi_a2 <= phi0 + c1 * alpha2 * derphi0:
+            return alpha2, phi_a2, ite
+
+        # Safeguard: if the new step is too far from or too close to the previous one, halve instead.
+        if (alpha1 - alpha2) > alpha1 / 2.0 or (1 - alpha2 / alpha1) < 0.96:
+            alpha2 = alpha1 / 2.0
+
+        # Shift the two-point history forward.
+        alpha0 = alpha1
+        alpha1 = alpha2
+        phi_a0 = phi_a1
+        phi_a1 = phi_a2
+
+    # Failed to find a suitable step length
+    return None, phi_a1, ite
+
+
+def line_search(update, x0, g0, g, on=True):
+    """
+    Instead of solving for the best step size to use exactly, we use a fast line search algorithm
+    to find an okay step size, so that compute can be spent on computing the update itself rather
+    than the step size.
+
+    `update` is the proposed direction of update.
+    `x0` is the current estimate and `g0` the value of `g` at `x0`.
+    `g` is the callable whose root is being sought.
+    `on` switches the Armijo search on; when False (or when the search fails) a step size of 1 is used.
+
+    Returns (x_est, g(x_est), x_est - x0, g(x_est) - g0, ite), where `ite` is the
+    iteration count reported by `scalar_search_armijo` (0 when the search is off or failed).
+
+    Code adapted from scipy.
+    """
+    # One-element lists act as mutable cells so that `phi` can cache its latest evaluation.
+    tmp_s = [0]
+    tmp_g0 = [g0]
+    tmp_phi = [torch.norm(g0) ** 2]
+    # s_norm = torch.norm(x0) / torch.norm(update) #for wolfe search only
+
+    def phi(s, store=True):
+        """takes in step size alpha being tried, produces the next x_est with it,
+        and returns what we want to minimize, i.e. the squared norm of g(x_est)"""
+        if s == tmp_s[0]:
+            return tmp_phi[0]  # same step size as the cached evaluation: reuse it
+        x_est = x0 + s * update
+        g0_new = g(x_est)
+        phi_new = _safe_norm(g0_new) ** 2
+        if store:
+            tmp_s[0] = s
+            tmp_g0[0] = g0_new
+            tmp_phi[0] = phi_new
+        return phi_new
+
+    if on:
+        # phi0 = ||g0||^2 and derphi0 = -||g0||^2; steps not above 1e-2 are rejected.
+        s, phi1, ite = scalar_search_armijo(phi, tmp_phi[0], -tmp_phi[0], amin=1e-2)
+    if (not on) or s is None:
+        # Fall back to a full step.
+        s = 1.0
+        ite = 0
+
+    x_est = x0 + s * update
+    if s == tmp_s[0]:
+        # The chosen step is the one evaluated last, so its g value is already cached.
+        g0_new = tmp_g0[0]
+    else:
+        g0_new = g(x_est)
+
+    return x_est, g0_new, x_est - x0, g0_new - g0, ite
+
+
+def rmatvec(part_Us, part_VTs, x):
+    # Compute x^T(-I + UV^T)
+    # x: (N, 2d, L')
+    # part_Us: (N, 2d, L', threshold)
+    # part_VTs: (N, threshold, 2d, L')
+    if part_Us.nelement() == 0:
+        return -x
+    xTU = torch.einsum("bij, bijd -> bd", x, part_Us)  # (N, threshold)
+    return -x + torch.einsum("bd, bdij -> bij", xTU, part_VTs)  # (N, 2d, L'), but should really be (N, 1, (2d*L'))
+
+
+def matvec(part_Us, part_VTs, x):
+    # Compute (-I + UV^T)x
+    # x: (N, 2d, L')
+    # part_Us: (N, 2d, L', threshold)
+    # part_VTs: (N, threshold, 2d, L')
+    if part_Us.nelement() == 0:
+        return -x
+    VTx = torch.einsum("bdij, bij -> bd", part_VTs, x)  # (N, threshold)
+    return -x + torch.einsum("bijd, bd -> bij", part_Us, VTx)  # (N, 2d, L'), but should really be (N, (2d*L'), 1)
+
+
+def broyden(f, x0, max_iters, eps=1e-3, stop_mode="rel", ls=False, verbose=False, save_trajectory=False):
+    # print(f'broyden input size: {x0.size()}')
+    bsz, total_hsize, seq_len = x0.size()
+
+    # g = lambda y: f(y) - y
+    def g(y):
+        return f(y) - y
+
+    dev = x0.device
+    alternative_mode = "rel" if stop_mode == "abs" else "abs"
+    trajectory = []
+
+    x_est = x0  # (bsz, 2d, L')
+    gx = g(x_est)  # (bsz, 2d, L')
+    nstep = 0
+    tnstep = 0
+
+    # For fast calculation of inv_jacobian (approximately)
+    Us = torch.zeros(bsz, total_hsize, seq_len, max_iters).to(
+        dev
+    )  # One can also use an L-BFGS scheme to further reduce memory
+    VTs = torch.zeros(bsz, max_iters, total_hsize, seq_len).to(dev)
+    update = -matvec(
+        Us[:, :, :, :nstep], VTs[:, :nstep], gx
+    )  # Formally should be -torch.matmul(inv_jacobian, (-I), gx)
+    prot_break = False
+
+    # To be used in protective breaks
+    protect_thres = (1e6 if stop_mode == "abs" else 1e3) * seq_len
+
+    trace_dict = {"abs": [], "rel": []}
+    lowest_dict = {"abs": 1e8, "rel": 1e8}
+    lowest_step_dict = {"abs": 0, "rel": 0}
+    nstep, lowest_xest, _ = 0, x_est, gx
+
+    while nstep < max_iters:
+        x_est, gx, delta_x, delta_gx, ite = line_search(
+            update, x_est, gx, g, on=ls
+        )  # returns x_est, gx_new, x_est_new - x_est_prev, gx_new - gx_prev, ite
+        nstep += 1
+        tnstep += ite + 1
+
+        abs_diffs = gx.norm(dim=1)
+        rel_diffs = abs_diffs / (1e-5 + (gx + x_est).norm(dim=1))
+        abs_diff, rel_diff = (
+            abs_diffs.mean(),
+            rel_diffs.mean(),
+        )  # rel diff correctly calculated is ~5% different from official implementation
+
+        if verbose:
+            print(
+                f"abs diff {abs_diff:.2E} \t rel diff: {rel_diff:.2E} \t z scale: {torch.mean(x_est):.0E} +/- {torch.std(x_est):.0E}"
+            )
+        diff_dict = {"abs": abs_diff, "rel": rel_diff}
+        trace_dict["abs"].append(abs_diff)
+        trace_dict["rel"].append(rel_diff)
+        # print(f'broyden step {nstep} --- abs diff {abs_diff} --- rel diff {rel_diff}')
+        for mode in ["rel", "abs"]:
+            if diff_dict[mode] < lowest_dict[mode] or nstep == 1:
+                if mode == stop_mode:
+                    lowest_xest, _ = x_est.clone().detach(), gx.clone().detach()
+                lowest_dict[mode] = diff_dict[mode]
+                lowest_step_dict[mode] = nstep
+
+        if save_trajectory:
+            trajectory.append(x_est.view_as(x0).clone().detach())
+
+        # Added by Paul to measure stability of solver
+        if nstep == 1:
+            stability = 1
+            prev_rel_diff = rel_diff
+        else:
+            if rel_diff > prev_rel_diff:  # error is jumping around
+                stability = 0
+            prev_rel_diff = rel_diff
+
+        new_objective = diff_dict[stop_mode]
+        if new_objective < eps:  # stop even if haven't reached max_iters steps
+            if verbose:
+                print(f"STOPPING BROYDEN SPECIAL CASE: met tolerance")
+            break
+        if (
+            new_objective < 3 * eps
+            and nstep > 30
+            and np.max(trace_dict[stop_mode][-30:]) / np.min(trace_dict[stop_mode][-30:]) < 1.3
+        ):
+            # if there's hardly been any progress in the last 30 steps
+            if verbose:
+                print("STOPPING BROYDEN SPECIAL CASE: no progress in last 30 steps")
+            break
+        if new_objective > trace_dict[stop_mode][0] * protect_thres:
+            if verbose:
+                print("STOPPING BROYDEN SPECIAL CASE: protect thresh")
+            prot_break = True
+            break
+
+        part_Us, part_VTs = Us[:, :, :, : nstep - 1], VTs[:, : nstep - 1]
+        vT = rmatvec(part_Us, part_VTs, delta_x)
+        u = (delta_x - matvec(part_Us, part_VTs, delta_gx)) / torch.einsum("bij, bij -> b", vT, delta_gx)[:, None, None]
+        vT[vT != vT] = 0  # replace nans with zeros
+        u[u != u] = 0
+        VTs[:, nstep - 1] = vT
+        Us[:, :, :, nstep - 1] = u
+        update = -matvec(Us[:, :, :, :nstep], VTs[:, :nstep], gx)
+        # print(update.device)
+
+    # Fill everything up to the max_iters length
+    for _ in range(max_iters + 1 - len(trace_dict[stop_mode])):
+        trace_dict[stop_mode].append(lowest_dict[stop_mode])
+        trace_dict[alternative_mode].append(lowest_dict[alternative_mode])
+
+    # print(f'{name} total broyden steps: {nstep} --- rel diff {rel_diff:02.5f}')
+
+    out = {
+        "result": lowest_xest,
+        "lowest_abs_diff": lowest_dict["abs"].item(),
+        "lowest_rel_diff": lowest_dict["rel"].item(),
+        "nstep_best": lowest_step_dict[stop_mode],  # which step was the best in hindsight
+        "nstep": nstep,
+        "prot_break": prot_break,
+        "abs_trace": trace_dict["abs"],
+        "rel_trace": trace_dict["rel"],
+        "eps": eps,
+        "trajectory": trajectory,
+        "stability": stability,
+    }
+
+    return out
+
+
+def anderson(
+    f, x0, m=6, lam=1e-4, max_iters=50, eps=1e-3, stop_mode="rel", beta=1.0, verbose=False, save_trajectory=False
+):
+    """Anderson acceleration for fixed point iteration."""
+    # print('stop mode ', stop_mode)
+    bsz, d, L = x0.shape
+    m = int(m)
+    alternative_mode = "rel" if stop_mode == "abs" else "abs"
+    X = torch.zeros(bsz, m, int(d * L), dtype=x0.dtype, device=x0.device)  # keep track of all previous estimates x_i s
+    F = torch.zeros(bsz, m, int(d * L), dtype=x0.dtype, device=x0.device)  # keep track of all previous f(x_i) s
+    X[:, 0], F[:, 0] = x0.reshape(bsz, -1), f(x0).reshape(bsz, -1)  # first estimate x0 is given as input
+    X[:, 1], F[:, 1] = F[:, 0], f(F[:, 0].reshape_as(x0)).reshape(
+        bsz, -1
+    )  # second estimate in X is just f(x0) as in fpi because we don't have any previous estimates to lookback to
+
+    H = torch.zeros(bsz, m + 1, m + 1, dtype=x0.dtype, device=x0.device)
+    H[:, 0, 1:] = H[:, 1:, 0] = 1
+    y = torch.zeros(bsz, m + 1, 1, dtype=x0.dtype, device=x0.device)
+    y[:, 0] = 1
+
+    trace_dict = {"abs": [], "rel": []}
+    lowest_dict = {"abs": 1e8, "rel": 1e8}
+    lowest_step_dict = {"abs": 0, "rel": 0}
+    trajectory = []
+
+    # if verbose: print('Original tensors ')
+    # if verbose: debug_print([X, F, H, y])
+
+    for k in range(2, max_iters + 2):
+        n = min(k, m)
+        G = F[:, :n] - X[:, :n]
+        H[:, 1 : n + 1, 1 : n + 1] = (
+            torch.bmm(G, G.transpose(1, 2)) + lam * torch.eye(n, dtype=x0.dtype, device=x0.device)[None]
+        )
+
+        # alpha = torch.solve(y[:,:n+1], H[:,:n+1,:n+1])[0][:, 1:n+1, 0]   # (bsz x n)
+        alpha = torch.linalg.solve(H[:, : n + 1, : n + 1], y[:, : n + 1])[:, 1 : n + 1, 0]  # (bsz x n)
+
+        X[:, k % m] = (
+            beta * (alpha[:, None] @ F[:, :n])[:, 0] + (1 - beta) * (alpha[:, None] @ X[:, :n])[:, 0]
+        )  # beta=1.0 in normal anderson formulation. beta<1 is damped anderson acceleration, while beta>1 is overprojected
+        F[:, k % m] = f(X[:, k % m].reshape_as(x0)).reshape(bsz, -1)
+        gx = F[:, k % m] - X[:, k % m]  # .view_as(x0)
+
+        abs_diffs = gx.norm(dim=1)
+        rel_diffs = abs_diffs / (1e-5 + F[:, k % m]).norm(dim=1)
+        abs_diff, rel_diff = (
+            abs_diffs.mean(),
+            rel_diffs.mean(),
+        )  # rel diff correctly calculated is ~5% different from official implementation
+        if verbose:
+            print(
+                f"abs diff {abs_diff:.2E} \t rel diff: {rel_diff:.2E} \t z scale: {torch.mean(X[:, k % m]):.0E} +/- {torch.std(X[:,k % m]):.0E}"
+            )
+
+        diff_dict = {"abs": abs_diff, "rel": rel_diff}
+        trace_dict["abs"].append(abs_diff)
+        trace_dict["rel"].append(rel_diff)
+
+        for mode in ["rel", "abs"]:
+            # print(diff_dict[mode], lowest_dict[mode])
+            if (diff_dict[mode] < lowest_dict[mode]) or k == 2:
+                if mode == stop_mode:
+                    lowest_xest, _ = X[:, k % m].view_as(x0).clone().detach(), gx.clone().detach()
+                lowest_dict[mode] = diff_dict[mode]
+                lowest_step_dict[mode] = k
+
+        if save_trajectory:
+            trajectory.append(X[:, k % m].view_as(x0).clone().detach())
+            # print('------ ', float(torch.sum(X[:,k%m].view_as(x0).clone().detach())))
+
+        # --------------- Added by Paul to measure stability of solver
+        if k == 2:
+            stability = 1
+            abs_error_prev = abs_diff
+        else:
+            if abs_diff > abs_error_prev:  # error is jumping around
+                stability = 0
+            abs_error_prev = abs_diff
+        # ---------------
+
+        if trace_dict[stop_mode][-1] < eps:
+            for _ in range(max_iters + 1 - k):  # paul changed -1 to +1
+                trace_dict[stop_mode].append(lowest_dict[stop_mode])
+                trace_dict[alternative_mode].append(lowest_dict[alternative_mode])
+            break
+
+    out = {
+        "result": lowest_xest,  # not necessarily the last z of trajectory. It's the z with lowest error
+        "lowest_abs_diff": lowest_dict["abs"].item(),
+        "lowest_rel_diff": lowest_dict["rel"].item(),
+        "nstep_best": lowest_step_dict[stop_mode],  # which step was the best in hindsight
+        "nstep": k - 1,
+        "prot_break": False,
+        "abs_trace": trace_dict["abs"],
+        "rel_trace": trace_dict["rel"],
+        "eps": eps,
+        "trajectory": trajectory,
+        "stability": stability,
+    }
+
+    return out
+
+
+def fpi(f, x0, max_iters, eps=1e-3, stop_mode="rel", verbose=False, save_trajectory=False):
+    """fast and cheap in memory but no guarantees to return stable FP, contrary to other solvers"""
+    trajectory = []
+    bsz = x0.shape[0]
+    x_prev = x0
+    iter_idx = 0
+
+    while iter_idx < max_iters:
+        x_new = f(x_prev)
+        abs_diffs = (x_new - x_prev).view(bsz, -1).norm(dim=1)
+        rel_diffs = abs_diffs / (1e-5 + x_new.view(bsz, -1).norm(dim=1))
+        abs_diff, rel_diff = abs_diffs.mean(), rel_diffs.mean()
+        if verbose:
+            print(
+                f"abs diff {abs_diff:.3E} \t rel diff: {rel_diff:.3E} \t z scale: {torch.mean(x_new):.0E} +/- {torch.std(x_new):.0E}"
+            )
+
+        if save_trajectory:
+            trajectory.append(x_new.clone().detach())
+
+        # --------------- Added by Paul to measure stability of solver
+        if iter_idx == 0:
+            stability = 1
+            abs_error_prev = abs_diff
+        else:
+            if abs_diff > abs_error_prev:  # error is jumping around
+                stability = 0
+            abs_error_prev = abs_diff
+        # ---------------
+
+        iter_idx += 1
+
+        if (stop_mode == "abs" and abs_diff < eps) or (stop_mode == "rel" and rel_diff < eps):
+            break
+
+        x_prev = x_new
+
+    return x_new, iter_idx, abs_diff.item(), rel_diff.item(), stability, trajectory
+
+
+def root_solver(f, x0, max_iters, solver_args, stochastic_max_iters=False, save_trajectory=False, name="forward"):
+    """
+    There are many solvers that all return different metrics and take different arguments.
+    This is a wrapping function that evaluates each solver.
+    solver_args must contain the solver specific arguments like:
+    solver_args.anderson_m = 6 etc.
+
+    returns: n_iters, final_rel_error
+    """
+
+    max_iters = random.randint(1, max_iters) if stochastic_max_iters else max_iters
+    if solver_args.verbose_solver:
+        print(f"----- SOLVER: {solver_args.solver} {name} mi={max_iters}")
+
+    if solver_args.solver == "broyden":
+        results_dict = broyden(
+            f=f,
+            x0=x0,
+            max_iters=max_iters,
+            eps=solver_args.abs_diff_target if solver_args.stop_mode == "abs" else solver_args.rel_diff_target,
+            stop_mode=solver_args.stop_mode,
+            ls=False,
+            verbose=solver_args.verbose_solver,
+            save_trajectory=save_trajectory,
+        )
+        solution, n_iters, final_abs_diff, final_rel_diff, stability, trajectory = (
+            results_dict["result"],
+            results_dict["nstep"],
+            results_dict["lowest_abs_diff"],
+            results_dict["lowest_rel_diff"],
+            results_dict["stability"],
+            results_dict["trajectory"],
+        )
+
+    elif solver_args.solver == "anderson":
+        results_dict = anderson(
+            f=f,
+            x0=x0,
+            m=solver_args.anderson_m,
+            lam=solver_args.anderson_lam,
+            max_iters=max_iters,
+            eps=solver_args.abs_diff_target if solver_args.stop_mode == "abs" else solver_args.rel_diff_target,
+            stop_mode=solver_args.stop_mode,
+            beta=solver_args.anderson_beta,
+            verbose=solver_args.verbose_solver,
+            save_trajectory=save_trajectory,
+        )
+        solution, n_iters, final_abs_diff, final_rel_diff, stability, trajectory = (
+            results_dict["result"],
+            results_dict["nstep"],
+            results_dict["lowest_abs_diff"],
+            results_dict["lowest_rel_diff"],
+            results_dict["stability"],
+            results_dict["trajectory"],
+        )
+
+    elif solver_args.solver == "fpi":
+        solution, n_iters, final_abs_diff, final_rel_diff, stability, trajectory = fpi(
+            f,
+            x0=x0,
+            max_iters=max_iters,
+            eps=solver_args.abs_diff_target if solver_args.stop_mode == "abs" else solver_args.rel_diff_target,
+            stop_mode=solver_args.stop_mode,
+            verbose=solver_args.verbose_solver,
+            save_trajectory=save_trajectory,
+        )
+
+    else:
+        raise NotImplementedError(f"solver {solver_args.solver} unknown")
+
+    # print('stability ', stability)
+
+    solver_logs = {
+        "n_iters": n_iters,
+        "final_abs_diff": final_abs_diff,
+        "final_rel_diff": final_rel_diff,
+        "stability": stability,
+        "trajectory": trajectory,
+        "max_iters": max_iters,
+    }
+
+    return solution, solver_logs
+
+
+def make_cell(args):
+    """Build the cell named by ``args.cell_name``, passing ``args`` to its constructor.
+
+    NOTE(review): ``args.cell_name`` is evaluated with ``eval``, so it can run
+    arbitrary code. This is only safe while the value comes from trusted
+    configuration; consider an explicit name -> class registry instead.
+    """
+    return eval(args.cell_name)(args)
+
+
+def weights_init(m):
+    if isinstance(m, nn.Conv2d):
+        torch.nn.init.xavier_normal(m.weight.data)
+        torch.nn.init.constant_(m.bias, 0)
+    if isinstance(m, nn.BatchNorm2d):
+        if m.weight is not None:
+            torch.nn.init.constant_(m.weight, 1)
+            torch.nn.init.constant_(m.bias, 0)
+    if isinstance(m, nn.GroupNorm):
+        if m.weight is not None:
+            torch.nn.init.constant_(m.weight, 1)
+            torch.nn.init.constant_(m.bias, 0)
+
+
+####################################################################################
+
+
+class Conv(nn.Module):
+    def __init__(self, inp_dim, out_dim, kernel_size=3, stride=1, norm="BN", GN_groups=1, no_relu=False):
+        super(Conv, self).__init__()
+        assert norm in ["BN", "GN", "None"], f"norm given {norm} unrecognized"
+        self.inp_dim = inp_dim
+        self.out_dim = out_dim
+        self.kernel_size = kernel_size
+        self.conv = nn.Conv2d(inp_dim, out_dim, kernel_size, stride, padding=(kernel_size - 1) // 2, bias=True)
+        self.relu = (lambda x: x) if no_relu else nn.LeakyReLU()
+        self.norm = (
+            (lambda x: x)
+            if norm == "None"
+            else (nn.BatchNorm2d(out_dim) if norm == "BN" else nn.GroupNorm(GN_groups, out_dim))
+        )
+
+    def forward(self, x):
+        out = self.conv(x)
+        out = self.relu(out)
+        out = self.norm(out)
+        return out
+
+
+class Hourglass(nn.Module):
+    """
+    Four-level hourglass, written out explicitly since the nested formulation is too messy
+    for multi-resolution.
+    Same as default Hourglass for hg_depth=4 and n<10 and double_output=True
+    Downside is that this now only supports hg_depth=4
+    It was verified to have the same number of parameters (228572, if base_width=16 and increment=8)
+    """
+
+    def __init__(self, base_width, width_increment, norm="GN", GN_groups=8):
+        super().__init__()
+        self.downres = nn.AvgPool2d(2, 2)
+        self.upres = nn.Upsample(scale_factor=2)
+        w, i = base_width, width_increment
+        w1, w2, w3, w4 = w + i, w + 2 * i, w + 3 * i, w + 4 * i
+        # Encoder: the channel width grows by `width_increment` at each of the four levels.
+        self.same1 = Conv(w, w, 3, norm=norm, GN_groups=GN_groups)
+        self.upchan1 = Conv(w, w1, 3, norm=norm, GN_groups=GN_groups)
+        self.same2 = Conv(w1, w1, 3, norm=norm, GN_groups=GN_groups)
+        self.upchan2 = Conv(w1, w2, 3, norm=norm, GN_groups=GN_groups)
+        self.same3 = Conv(w2, w2, 3, norm=norm, GN_groups=GN_groups)
+        self.upchan3 = Conv(w2, w3, 3, norm=norm, GN_groups=GN_groups)
+        self.same4 = Conv(w3, w3, 3, norm=norm, GN_groups=GN_groups)
+        self.upchan4 = Conv(w3, w4, 3, norm=norm, GN_groups=GN_groups)
+        # NOTE: the attribute name is misspelled ("bottlneck"); renaming it would change state_dict keys.
+        self.bottlneck = Conv(w4, w4, 3, norm=norm, GN_groups=GN_groups)
+        # Decoder: 3x3 channel reduction, 1x1 fusion of the skip concatenation (2x channels), then a 5x5 conv.
+        self.downchan1 = Conv(w4, w3, 3, norm=norm, GN_groups=GN_groups)
+        self.downchan2 = Conv(2 * w3, w3, 1, norm=norm, GN_groups=GN_groups)
+        self.same5 = Conv(w3, w3, 5, norm=norm, GN_groups=GN_groups)
+        self.downchan3 = Conv(w3, w2, 3, norm=norm, GN_groups=GN_groups)
+        self.downchan4 = Conv(2 * w2, w2, 1, norm=norm, GN_groups=GN_groups)
+        self.same6 = Conv(w2, w2, 5, norm=norm, GN_groups=GN_groups)
+        self.downchan5 = Conv(w2, w1, 3, norm=norm, GN_groups=GN_groups)
+        self.downchan6 = Conv(2 * w1, w1, 1, norm=norm, GN_groups=GN_groups)
+        self.same7 = Conv(w1, w1, 5, norm=norm, GN_groups=GN_groups)
+        self.downchan7 = Conv(w1, w, 3, norm=norm, GN_groups=GN_groups)
+        self.downchan8 = Conv(2 * w, w, 1, norm=norm, GN_groups=GN_groups)
+        self.same8 = Conv(w, w, 5, norm=norm, GN_groups=GN_groups)
+
+    def forward(self, x):  # x has base_width channels; H and W presumably multiples of 16 so the skips line up
+        same1 = self.same1(x)
+        downres1 = self.downres(same1)
+        upchan1 = self.upchan1(downres1)
+
+        same2 = self.same2(upchan1)
+        downres2 = self.downres(same2)
+        upchan2 = self.upchan2(downres2)
+
+        same3 = self.same3(upchan2)
+        downres3 = self.downres(same3)
+        upchan3 = self.upchan3(downres3)
+
+        same4 = self.same4(upchan3)
+        downres4 = self.downres(same4)
+        upchan4 = self.upchan4(downres4)
+
+        # Lowest resolution: four 2x average-poolings have been applied at this point.
+        bottleneck = self.bottlneck(upchan4)
+        # Each decoder stage: reduce channels, upsample 2x, concat the encoder skip, fuse, 5x5 conv + residual.
+
+        downchan1 = self.downchan1(bottleneck)
+        upres1 = self.upres(downchan1)
+        stack = torch.cat((same4, upres1), 1)
+        downchan2 = self.downchan2(stack)
+        same5 = self.same5(downchan2) + downchan2
+
+        downchan3 = self.downchan3(same5)
+        upres2 = self.upres(downchan3)
+        stack = torch.cat((same3, upres2), 1)
+        downchan4 = self.downchan4(stack)
+        same6 = self.same6(downchan4) + downchan4
+
+        downchan5 = self.downchan5(same6)
+        upres3 = self.upres(downchan5)
+        stack = torch.cat((same2, upres3), 1)
+        downchan6 = self.downchan6(stack)
+        same7 = self.same7(downchan6) + downchan6
+
+        downchan7 = self.downchan7(same7)
+        upres4 = self.upres(downchan7)
+        stack = torch.cat((same1, upres4), 1)
+        downchan8 = self.downchan8(stack)
+        same8 = self.same8(downchan8) + downchan8
+
+        return same8
+
+
+####################################################################################
+
+
+class Cell0(nn.Module):
+    """DEQ cell: 1x1 conv on cat(z, injection) -> Hourglass -> 1x1 conv -> Normalize unless cell_norm is "None"."""
+
+    def __init__(self, args):
+        super().__init__()
+        norm_layer = "BN" if args.cell_use_bn_for_explicit and args.model_mode == "explicit" else "GN"
+        self.tail = Conv(
+            args.z_width + args.injection_width, args.cell_base_width, 1, norm=norm_layer, GN_groups=args.cell_gn_groups
+        )
+        self.hourglass = Hourglass(
+            args.cell_base_width, args.cell_width_increment, norm=norm_layer, GN_groups=args.cell_gn_groups
+        )
+        self.head = Conv(args.cell_base_width, args.z_width, 1, stride=1, norm="None", no_relu=True)
+        self.features_to_heatmaps = (
+            nn.Identity()  # a module rather than a lambda, so the cell stays picklable; adds no parameters
+            if args.cell_norm == "None"
+            else Normalize(
+                args.z_width,
+                mode=args.cell_norm,
+                beta=args.cell_softargmax_beta,
+                learn_beta=args.cell_learn_softargmax_beta,
+            )
+        )
+
+    def forward(self, z, injection):
+        # z and injection are concatenated along the channel axis (dim=1).
+        out = self.tail(torch.cat([z, injection], dim=1))
+        out = self.hourglass(out)
+        out = self.head(out)  # heatmap size
+        out = self.features_to_heatmaps(out)  # heatmap = normalized features
+
+        return out
+
+
+####################################################################################
+
+
+class DEQLayer(nn.Module):
+    """
+    A DEQ layer applies the same cell with weight-sharing for several iterations.
+    It can do so explicitly (track operations in autograd) or implicitly (only track very last iteration)
+    """
+
+    def __init__(self, cell, args):  # reads args.heatmap_size and args.z_width to build the state shapes
+        super().__init__()
+        self.cell, self.heatmap_size, self.z_width = cell, args.heatmap_size, args.z_width
+
+    def _forward_explicit(self, x, args, z0, save_trajectory=False):  # unrolls args.explicit_depth cell steps
+        fwd_logs = None
+        trajectory = [z0] if save_trajectory else []
+        out = z0
+        depth = 2 if args is None else args.explicit_depth  # torchinfo debug
+        for _ in range(depth):
+            out = self.cell(out, injection=x)
+            if save_trajectory:
+                trajectory.append(out.detach())
+        # no need to do one more tracked forward pass here because they're all tracked already
+
+        return out, fwd_logs, trajectory
+
+    def _forward_implicit(self, x, args, z0, save_trajectory=False):  # root_solver under no_grad + one extra step
+        trajectory = []
+        z_shape = (x.shape[0], self.z_width, self.heatmap_size, self.heatmap_size)  # agnostic to x dimensions
+        z_shape_solver = (
+            x.shape[0],
+            self.z_width * self.heatmap_size * self.heatmap_size,
+            1,
+        )  # agnostic to x dimensions
+        # The solver operates on flattened states of shape z_shape_solver; `func` reshapes a state to
+        # z_shape for the cell and flattens the cell output again, so its inputs and outputs are
+        # vectors of shape z_shape_solver.
+
+        def func(z):
+            return self.cell(z.view(z_shape), injection=x).view(z_shape_solver)
+
+        stochastic_max_iters = args.stochastic_max_iters if self.training else False
+        max_iters = (
+            max(1, round(args.max_iters / 2)) if (not self.training and args.stochastic_max_iters) else args.max_iters
+        )
+
+        with torch.no_grad():
+            z_star, fwd_logs = root_solver(
+                f=func,
+                x0=z0,
+                max_iters=max_iters,
+                solver_args=args,
+                stochastic_max_iters=stochastic_max_iters,
+                save_trajectory=save_trajectory,
+                name="forward",
+            )
+
+        if self.training:
+            z_star_new = func(z_star.requires_grad_())  # extra tracked step so we create a computational graph
+
+            if args.solver == "fpi":
+                with torch.no_grad():
+                    fwd_logs["final_solver_error"] = float(
+                        torch.norm(z_star_new - z_star) / (torch.norm(z_star_new) + 1e-9)
+                    )  # same as the one in solver_logs if using tracing. But fpi doesn't do tracing.
+
+            if not args.JFB:
+
+                def backward_hook(grad):
+                    if self.hook is not None:
+                        self.hook.remove()
+                        if torch.cuda.is_available():  # CPU-only runs must not touch the CUDA runtime
+                            torch.cuda.synchronize()
+
+                    def func(y):
+                        return torch.autograd.grad(z_star_new, z_star, y, retain_graph=True)[0] + grad
+
+                    solution, solver_logs_bwd = root_solver(
+                        f=func,
+                        x0=torch.zeros_like(grad),
+                        max_iters=max(1, round(args.max_iters / 2)) if args.stochastic_max_iters else args.max_iters,
+                        solver_args=args,
+                        stochastic_max_iters=False,
+                        save_trajectory=False,
+                        name="backward",
+                    )
+                    # NOTE: x0 = torch.rand_like(grad) was tried as the initial guess and marked "not good".
+                    if args.verbose_solver:
+                        print(
+                            f"original grad: scale = {torch.mean(torch.abs(grad)):01.1e}, pos sign frac = {100*torch.mean((torch.sign(grad)+1)/2):02.0f}%"
+                        )
+                        print(
+                            f"  new    grad: scale = {torch.mean(torch.abs(solution)):01.1e}, pos sign frac = {100*torch.mean((torch.sign(solution)+1)/2):02.0f}%"
+                        )
+                        print(
+                            f" ---   change: value = {100*torch.mean(torch.abs((solution-grad)))/torch.mean(torch.abs(grad)):02.0f}%, sign: {100*torch.mean((torch.sign(grad)-torch.sign(solution))/2):02.0f}%"
+                        )
+                    return solution
+
+                self.hook = z_star_new.register_hook(
+                    backward_hook
+                )  # WARNING: leads to memory leak if not cleared with .backward() at each batch
+
+        else:
+            if args.take_one_less_inference_step:
+                z_star_new = z_star
+            else:
+                with torch.no_grad():
+                    z_star_new = func(
+                        z_star
+                    )  # usually don't need to take this step at inference if close enough to solution already
+                    if args.solver == "fpi":
+                        fwd_logs["final_solver_error"] = float(
+                            torch.norm(z_star_new - z_star) / (torch.norm(z_star_new) + 1e-9)
+                        )
+                    # Note that when this extra step is performed we are actually taking max_iters+1 iterations
+
+        if save_trajectory:  # change shape and add z0
+            trajectory = [z.view(z_shape) for z in fwd_logs["trajectory"]]
+            trajectory.insert(0, z0.view(z_shape))
+            del fwd_logs["trajectory"]
+
+        return z_star_new.view(z_shape), fwd_logs, trajectory
+
+    def forward(self, x, mode, args, z0=None, save_trajectory=False):  # returns (out, detached copy, logs, traj)
+        z_shape = (x.shape[0], self.z_width, self.heatmap_size, self.heatmap_size)
+        z_shape_solver = (x.shape[0], self.z_width * self.heatmap_size * self.heatmap_size, 1)
+        # Explicit mode iterates on z_shape tensors, implicit mode on flattened z_shape_solver vectors.
+        if mode == "explicit":
+            z0 = z0.view(*z_shape)
+            out, fwd_logs, trajectory = self._forward_explicit(x, args, z0, save_trajectory)
+        elif mode == "implicit":
+            z0 = z0.view(*z_shape_solver)
+            out, fwd_logs, trajectory = self._forward_implicit(x, args, z0, save_trajectory)
+        else:
+            raise NotImplementedError
+
+        z_star_copy = out.detach().view(*z_shape)
+        return out, z_star_copy, fwd_logs, trajectory
+
+
+####################################################################################
+
+
+class LDEQ(nn.Module):  # conv stem -> DEQ layer -> heatmap normalisation -> keypoint read-out
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+        out_width = args.injection_width
+        # Stem: a stride-2 7x7 conv and a 2x2 max-pool reduce the spatial resolution before the DEQ layer.
+        self.tail = nn.Sequential(
+            Conv(3, out_width // 4, 7, 2),
+            Conv(out_width // 4, out_width // 2, 3, 1),
+            nn.MaxPool2d(2, 2),
+            Conv(out_width // 2, out_width, 3, 1),
+        )
+
+        cell = make_cell(args)
+        self.deq_layer = DEQLayer(cell, args)  # outputs potentially already [0,1] normalized heatmaps
+        self.final_features_to_heatmaps = (
+            Normalize(args.z_width, "softargmax", args.cell_softargmax_beta, False)
+            if args.cell_norm == "None"
+            else nn.Identity()  # a module rather than a lambda, so the model stays picklable
+        )  # output of cell possibly already normalized
+        self.heatmaps_to_keypoints = HeatmapsToKeypoints()
+
+    def forward(self, x, mode="implicit", args=None, z0=None, save_trajectory=False):
+        """
+        mode = 'implicit' or 'explicit'. Explicit is done with weight sharing.
+        Returns a dict with keys: keypoints, uncertainty, fwd_logs, z_star, trajectory.
+        """
+
+        x = self.tail(x)
+        z_star, z_star_copy, fwd_logs, trajectory = self.deq_layer(
+            x, mode, args, z0, save_trajectory
+        )  # z0 and z_star can be tensors or lists of tensors.
+        out = self.final_features_to_heatmaps(z_star)
+        preds, uncertainty = self.heatmaps_to_keypoints(out[:, : self.args.n_keypoints, :, :])
+        results = {
+            "keypoints": preds,
+            "uncertainty": uncertainty,
+            "fwd_logs": fwd_logs,
+            "z_star": z_star_copy,
+            "trajectory": trajectory,
+        }
+
+        return results
diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
index f84277b849d..c805b82fd86 100644
--- a/data_juicer/ops/mapper/__init__.py
+++ b/data_juicer/ops/mapper/__init__.py
@@ -102,6 +102,7 @@
 from .tool_success_tagger_mapper import ToolSuccessTaggerMapper
 from .usage_counter_mapper import UsageCounterMapper
 from .vggt_mapper import VggtMapper
+from .video_animal_pose_mapper import VideoAnimalPoseMapper
 from .video_camera_calibration_static_deepcalib_mapper import (
     VideoCameraCalibrationStaticDeepcalibMapper,
 )
@@ -117,6 +118,7 @@
 from .video_depth_estimation_mapper import VideoDepthEstimationMapper
 from .video_extract_frames_mapper import VideoExtractFramesMapper
 from .video_face_blur_mapper import VideoFaceBlurMapper
+from .video_face_keypoints_mapper import VideoFaceKeypointsMapper
 from .video_ffmpeg_wrapped_mapper import VideoFFmpegWrappedMapper
 from .video_hand_reconstruction_hawor_mapper import VideoHandReconstructionHaworMapper
 from .video_hand_reconstruction_mapper import VideoHandReconstructionMapper
@@ -232,6 +234,7 @@
     "ToolSuccessTaggerMapper",
     "UsageCounterMapper",
     "VggtMapper",
+    "VideoAnimalPoseMapper",
     "VideoCameraCalibrationStaticDeepcalibMapper",
     "VideoCameraCalibrationStaticMogeMapper",
     "VideoCaptioningFromAudioMapper",
@@ -245,6 +248,7 @@
     "VideoHandReconstructionHaworMapper",
     "VideoHandReconstructionMapper",
     "VideoFaceBlurMapper",
+    "VideoFaceKeypointsMapper",
     "VideoObjectSegmentingMapper",
     "VideoRemoveWatermarkMapper",
     "VideoResizeAspectRatioMapper",
diff --git a/data_juicer/ops/mapper/video_animal_pose_mapper.py b/data_juicer/ops/mapper/video_animal_pose_mapper.py
new file mode 100644
index 00000000000..905f1d559d8
--- /dev/null
+++ b/data_juicer/ops/mapper/video_animal_pose_mapper.py
@@ -0,0 +1,304 @@
+import importlib
+import os
+import subprocess
+import sys
+
+import cv2
+from loguru import logger
+from pydantic import PositiveInt
+
+import data_juicer
+from data_juicer.ops.load import load_ops
+from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
+from data_juicer.utils.constant import Fields, MetaKeys
+from data_juicer.utils.lazy_loader import LazyLoader
+from data_juicer.utils.mm_utils import SpecialTokens
+from data_juicer.utils.model_utils import get_model, prepare_model
+
+from ..base_op import OPERATORS, TAGGING_OPS, UNFORKABLE, Mapper
+from ..op_fusion import LOADED_VIDEOS
+
+OP_NAME = "video_animal_pose_mapper"
+
+
+@TAGGING_OPS.register_module(OP_NAME)
+@UNFORKABLE.register_module(OP_NAME)
+@OPERATORS.register_module(OP_NAME)
+@LOADED_VIDEOS.register_module(OP_NAME)
+class VideoAnimalPoseMapper(Mapper):
+    """Detect quadruped animal pose on the video."""
+
+    _accelerator = "cuda"
+
+    def __init__(
+        self,
+        vitpose_model_path: str = "apt36k.pth",
+        vitpose_config: str = "configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/apt36k/ViTPose_huge_apt36k_256x192.py",
+        yoloe_model_path: str = "yoloe-26x-seg.pt",
+        animal_class: list = [],
+        if_save_visualization: bool = True,
+        save_visualization_dir: str = DATA_JUICER_ASSETS_CACHE,
+        frame_num: PositiveInt = 3,
+        duration: float = 0,
+        frame_dir: str = DATA_JUICER_ASSETS_CACHE,
+        *args,
+        **kwargs,
+    ):
+        """
+        Initialization method.
+
+        :param vitpose_model_path: The path to the ViTPose model.
+        :param vitpose_config: Please select the appropriate model configuration.
+        :param yoloe_model_path: The path to the YOLOE model.
+        :param animal_class: Specifies the quadruped animal categories to be
+            detected. If no value is input, the default list will be used.
+        :param if_save_visualization: Whether to save visualization results.
+        :param save_visualization_dir: The path for saving visualization results.
+        :param frame_num: The number of frames to be extracted uniformly from
+            the video. If it's 1, only the middle frame will be extracted. If
+            it's 2, only the first and the last frames will be extracted. If
+            it's larger than 2, in addition to the first and the last frames,
+            other frames will be extracted uniformly within the video duration.
+            If "duration" > 0, frame_num is the number of frames per segment.
+        :param duration: The duration of each segment in seconds.
+            If 0, frames are extracted from the entire video.
+            If duration > 0, the video is segmented into multiple segments
+            based on duration, and frames are extracted from each segment.
+        :param frame_dir: Output directory to save extracted frames.
+
+        """
+        super().__init__(*args, **kwargs)
+        LazyLoader.check_packages(["ultralytics"])
+        self._install_required_packages()
+
+        vitpose_repo_path = os.path.join(DATA_JUICER_ASSETS_CACHE, "ViTPose")
+        if not os.path.exists(vitpose_repo_path):
+            subprocess.run(
+                [
+                    "git",
+                    "clone",
+                    "https://github.com/ViTAE-Transformer/ViTPose.git",
+                    vitpose_repo_path,
+                ],
+                check=True,
+            )
+
+        try:
+            importlib.import_module("mmpose")
+        except Exception:
+            subprocess.run(
+                [sys.executable, "-m", "pip", "install", "chumpy", "--no-build-isolation", "--no-deps"], check=True
+            )
+            subprocess.run([sys.executable, "-m", "pip", "install", "-e", vitpose_repo_path], check=True)
+        subprocess.run([sys.executable, "-m", "pip", "install", "numpy==1.26.4"], check=True)
+
+        from mmpose.apis import inference_top_down_pose_model
+
+        self.inference_top_down_pose_model = inference_top_down_pose_model
+
+        self.model_key = prepare_model(
+            model_type="vitpose_animal_pose", model_path=vitpose_model_path, vitpose_config=vitpose_config
+        )
+        self.yolo_model_key = prepare_model(model_type="yolo", model_path=yoloe_model_path)
+        self.if_save_visualization = if_save_visualization
+        # A null/None directory (as shipped in config_all.yaml) falls back to the default assets cache.
+        self.save_visualization_dir = (
+            DATA_JUICER_ASSETS_CACHE if save_visualization_dir is None else save_visualization_dir
+        )
+        self.frame_field = MetaKeys.video_frames
+        self.tag_field_name = MetaKeys.video_animal_pose_tags
+        self.frame_num = frame_num
+        self.duration = duration
+        self.frame_dir = frame_dir
+
+        self.skeleton = [
+            [0, 2],
+            [1, 2],
+            [2, 3],
+            [3, 5],
+            [5, 6],
+            [6, 7],
+            [3, 8],
+            [8, 9],
+            [9, 10],
+            [3, 4],
+            [4, 11],
+            [11, 12],
+            [12, 13],
+            [4, 14],
+            [14, 15],
+            [15, 16],
+        ]
+
+        if isinstance(animal_class, list) and len(animal_class) == 0:
+            self.animal_class = [
+                "bear",
+                "cat",
+                "cougar",
+                "cow",
+                "deer",
+                "dog",
+                "elephant",
+                "goat",
+                "hippo",
+                "horse",
+                "moose",
+                "panther",
+                "pig",
+                "rabbit",
+                "rhino",
+                "sheep",
+                "tiger",
+                "wolf",
+                "zebra",
+            ]
+        elif isinstance(animal_class, list):
+            self.animal_class = animal_class
+        else:
+            raise ValueError("The 'animal_class' must be in list format.")
+
+        self.video_extract_frames_mapper_args = {
+            "frame_sampling_method": "uniform",
+            "frame_num": frame_num,
+            "duration": duration,
+            "frame_dir": frame_dir,
+            "frame_key": MetaKeys.video_frames,
+            "num_proc": None,  # Disable multiprocessing to avoid nested process pool issue
+            "auto_op_parallelism": False,  # Disable auto parallelism to avoid nested process pool issue
+        }
+        self.fused_ops = load_ops([{"video_extract_frames_mapper": self.video_extract_frames_mapper_args}])
+
+    def _install_required_packages(self):  # pins numpy, then installs openmim / mmcv if they cannot be imported
+        subprocess.run([sys.executable, "-m", "pip", "install", "numpy==1.26.4"], check=True)
+        try:
+            importlib.import_module("mim")
+        except ImportError:
+            logger.info("Installing openmim...")
+            try:
+                subprocess.run([sys.executable, "-m", "pip", "install", "openmim"], check=True)
+            except Exception as e:
+                raise ValueError(
+                    "Failed to install openmim, please refer to the documentation at "
+                    "https://github.com/open-mmlab/mim/blob/main/docs/en/installation.md for installation instructions."
+                ) from e
+
+        try:
+            importlib.import_module("mmcv")
+        except ImportError:
+            logger.info("Installing mmcv using mim...")
+            try:
+                subprocess.run(["mim", "install", "mmcv==1.3.9", "--no-build-isolation"], check=True)
+            except Exception as e:
+                raise ValueError(
+                    "Failed to install mmcv, please refer to the documentation at "
+                    "https://mmdetection.readthedocs.io/en/latest/get_started.html#installation for installation instructions."
+                ) from e
+
+    def draw_pose(self, img, keypoints, scores, threshold=0.3):
+        """Draw keypoints and skeleton edges whose scores exceed ``threshold`` onto ``img`` and return it."""
+        for point, score in zip(keypoints, scores):
+            if score > threshold:
+                cv2.circle(img, (int(point[0]), int(point[1])), 5, (0, 255, 0), -1)
+
+        for p1, p2 in self.skeleton:
+            if scores[p1] > threshold and scores[p2] > threshold:
+                cv2.line(
+                    img,
+                    (int(keypoints[p1][0]), int(keypoints[p1][1])),
+                    (int(keypoints[p2][0]), int(keypoints[p2][1])),
+                    (255, 0, 0),
+                    2,
+                )
+        return img
+
+    def process_single(self, sample=None, rank=None):
+        """Detect animals in each frame, estimate their poses and store per-frame results in the sample meta."""
+        # check if it's generated already
+        if self.tag_field_name in sample[Fields.meta]:
+            return sample
+
+        # there is no video in this sample
+        if (self.video_key not in sample or not sample[self.video_key]) and self.frame_field not in sample:
+            sample[Fields.meta][self.tag_field_name] = {"pose_list": [], "pose_score_list": [], "animal_bboxes": []}
+            return sample
+
+        pose_inferencer = get_model(model_key=self.model_key, rank=rank, use_cuda=self.use_cuda())
+        yolo_model = get_model(model_key=self.yolo_model_key, rank=rank, use_cuda=self.use_cuda())
+        yolo_model.set_classes(self.animal_class, yolo_model.get_text_pe(self.animal_class))
+
+        if self.frame_field in sample:
+            frames_path = sample[self.frame_field]
+            video_name = os.path.basename(os.path.dirname(frames_path[0]))  # parent folder, separator-agnostic
+        else:
+            # load videos
+            ds_list = [{"text": SpecialTokens.video, "videos": sample[self.video_key]}]
+
+            dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
+            dataset = self.fused_ops[0].run(dataset)
+
+            video_name = os.path.splitext(os.path.basename(sample[self.video_key][0]))[0]
+            frames_root = os.path.join(self.frame_dir, video_name)
+            frame_names = os.listdir(frames_root)
+            frames_path = sorted([os.path.join(frames_root, frame_name) for frame_name in frame_names])
+
+        if self.if_save_visualization:
+            os.makedirs(os.path.join(self.save_visualization_dir, video_name), exist_ok=True)
+
+        final_pose_list = []
+        final_pose_score_list = []
+        final_bboxes = []
+
+        for temp_img_path_id, temp_img_path in enumerate(frames_path):
+            img = cv2.imread(temp_img_path)
+            # cv2.imread returns None for unreadable files; such frames are recorded as having no detections.
+            detected_boxes = [] if img is None else yolo_model.predict(img, verbose=False)[0].boxes
+            bboxes = []
+            bboxes_only_num = []
+            for box in detected_boxes:
+                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
+                bboxes.append({"bbox": [x1, y1, x2, y2]})
+                bboxes_only_num.append([x1, y1, x2, y2])
+
+            if not bboxes:
+                final_pose_list.append([])
+                final_pose_score_list.append([])
+                final_bboxes.append([])
+                continue
+
+            pose_results, _ = self.inference_top_down_pose_model(pose_inferencer, img, bboxes, format="xyxy")
+
+            temp_pose_list = []
+            temp_score_list = []
+
+            for res in pose_results:
+                keypoints = res["keypoints"][:, :2]
+                scores = res["keypoints"][:, 2]
+
+                temp_pose_list.append(keypoints)
+                temp_score_list.append(scores)
+
+                if self.if_save_visualization:
+                    cv2.rectangle(
+                        img,
+                        (int(res["bbox"][0]), int(res["bbox"][1])),
+                        (int(res["bbox"][2]), int(res["bbox"][3])),
+                        (255, 0, 0),
+                        2,
+                    )
+                    img = self.draw_pose(img, keypoints, scores)
+
+            if self.if_save_visualization:
+                cv2.imwrite(
+                    os.path.join(self.save_visualization_dir, video_name, f"vis_{str(temp_img_path_id)}.jpg"), img
+                )
+
+            final_pose_list.append(temp_pose_list)
+            final_pose_score_list.append(temp_score_list)
+            final_bboxes.append(bboxes_only_num)
+
+        sample[Fields.meta][self.tag_field_name] = {}
+        sample[Fields.meta][self.tag_field_name]["pose_list"] = final_pose_list
+        sample[Fields.meta][self.tag_field_name]["pose_score_list"] = final_pose_score_list
+        sample[Fields.meta][self.tag_field_name]["animal_bboxes"] = final_bboxes
+
+        return sample
diff --git a/data_juicer/ops/mapper/video_camera_calibration_static_moge_mapper.py b/data_juicer/ops/mapper/video_camera_calibration_static_moge_mapper.py
index cce977dc1f3..93af82b220b 100644
--- a/data_juicer/ops/mapper/video_camera_calibration_static_moge_mapper.py
+++ b/data_juicer/ops/mapper/video_camera_calibration_static_moge_mapper.py
@@ -93,6 +93,7 @@ def __init__(
 
         self.frame_num = frame_num
         self.duration = duration
+        self.frame_field = MetaKeys.video_frames
         self.tag_field_name = tag_field_name
         self.frame_dir = frame_dir
         self.output_info_dir = output_info_dir
@@ -108,18 +109,39 @@ def process_single(self, sample=None, rank=None):
             return sample
 
         # there is no video in this sample
-        if self.video_key not in sample or not sample[self.video_key]:
-            return []
+        if (self.video_key not in sample or not sample[self.video_key]) and self.frame_field not in sample:
+            sample[Fields.meta][self.tag_field_name] = {
+                "frames_folder": "",
+                "frame_names": [],
+                "intrinsics_list": [],
+                "hfov_list": [],
+                "vfov_list": [],
+                "points_list": [],
+                "depth_list": [],
+                "mask_list": [],
+            }
+            return sample
+
+        if self.frame_field in sample:
+            frames_path = sample[self.frame_field]
+            frame_names = []
+            for temp_frame_name in sample[self.frame_field]:
+                frame_names.append(temp_frame_name.split("/")[-1])
+            frames_root = os.path.dirname(frames_path[0])
+            video_name = frames_path[0].split("/")[-2]
+
+        else:
+            # load videos
+            ds_list = [{"text": SpecialTokens.video, "videos": sample[self.video_key]}]
 
-        # load videos
-        ds_list = [{"text": SpecialTokens.video, "videos": sample[self.video_key]}]
+            dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
+            dataset = self.fused_ops[0].run(dataset)
 
-        dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
-        dataset = self.fused_ops[0].run(dataset)
+            frames_root = os.path.join(self.frame_dir, os.path.splitext(os.path.basename(sample[self.video_key][0]))[0])
+            frame_names = os.listdir(frames_root)
+            frames_path = sorted([os.path.join(frames_root, frame_name) for frame_name in frame_names])
+            video_name = os.path.splitext(os.path.basename(sample[self.video_key][0]))[0]
 
-        frames_root = os.path.join(self.frame_dir, os.path.splitext(os.path.basename(sample[self.video_key][0]))[0])
-        frame_names = os.listdir(frames_root)
-        frames_path = sorted([os.path.join(frames_root, frame_name) for frame_name in frame_names])
         model = get_model(self.model_key, rank, self.use_cuda())
 
         final_k_list = []
@@ -183,9 +205,7 @@ def process_single(self, sample=None, rank=None):
         if self.if_output_info:
             os.makedirs(self.output_info_dir, exist_ok=True)
             with open(
-                os.path.join(
-                    self.output_info_dir, os.path.splitext(os.path.basename(sample[self.video_key][0]))[0] + ".json"
-                ),
+                os.path.join(self.output_info_dir, video_name + ".json"),
                 "w",
             ) as f:
                 json.dump(sample[Fields.meta][self.tag_field_name], f)
diff --git a/data_juicer/ops/mapper/video_depth_estimation_mapper.py b/data_juicer/ops/mapper/video_depth_estimation_mapper.py
index bcdd3b2ffb8..d01c9b82e5d 100644
--- a/data_juicer/ops/mapper/video_depth_estimation_mapper.py
+++ b/data_juicer/ops/mapper/video_depth_estimation_mapper.py
@@ -31,6 +31,7 @@ class VideoDepthEstimationMapper(Mapper):
     def __init__(
         self,
         video_depth_model_path: str = "video_depth_anything_vitb.pth",
+        if_save_point_cloud: bool = True,
         point_cloud_dir_for_metric: str = DATA_JUICER_ASSETS_CACHE,
         max_res: int = 1280,
         torch_dtype: str = "fp16",
@@ -45,8 +46,9 @@ def __init__(
 
         :param video_depth_model_path: The path to the Video-Depth-Anything model.
             If the model is a 'metric' model, the code will automatically switch
-            to metric mode, and the user should input the path for storing point
-            clouds.
+            to metric mode.
+        :param if_save_point_cloud: Whether to save point cloud results (the user
+            should input the path for storing point clouds).
         :param point_cloud_dir_for_metric: The path for storing point
             clouds (for a 'metric' model).
         :param max_res: The maximum resolution threshold for videos; videos exceeding
@@ -90,19 +92,22 @@ def __init__(
         self.tag_field_name = MetaKeys.video_depth_tags
         self.max_res = max_res
         self.torch_dtype = torch_dtype
+        self.if_save_point_cloud = if_save_point_cloud
         self.point_cloud_dir_for_metric = point_cloud_dir_for_metric
         self.if_save_visualization = if_save_visualization
         self.save_visualization_dir = save_visualization_dir
         self.grayscale = grayscale
+        self.frame_field = MetaKeys.video_frames
         self.model_key = prepare_model(model_type="video_depth_anything", model_path=video_depth_model_path)
 
     def process_single(self, sample=None, rank=None):
+
         # check if it's generated already
         if self.tag_field_name in sample[Fields.meta]:
             return sample
 
         # there is no video in this sample
-        if self.video_key not in sample or not sample[self.video_key]:
+        if (self.video_key not in sample or not sample[self.video_key]) and self.frame_field not in sample:
             sample[Fields.meta][self.tag_field_name] = {"depth_data": [], "fps": -1}
             return sample
 
@@ -112,7 +117,33 @@ def process_single(self, sample=None, rank=None):
             device = f"cuda:{str(rank)}"
         else:
             device = "cuda"
-        frames, target_fps = self.read_video_frames(sample[self.video_key][0], -1, -1, self.max_res)
+
+        if self.frame_field in sample:
+            if "fps" not in sample:
+                raise ValueError("If inputting extracted frames instead of a video, the 'fps' must be provided.")
+            target_fps = sample["fps"]
+
+            first_frame = cv2.imread(sample[self.frame_field][0])
+            original_height = int(first_frame.shape[0])
+            original_width = int(first_frame.shape[1])
+
+            if self.max_res > 0 and max(original_height, original_width) > self.max_res:
+                scale = self.max_res / max(original_height, original_width)
+                height = round(original_height * scale)
+                width = round(original_width * scale)
+
+            frames = []
+            for temp_frame_path in sample[self.frame_field]:
+                temp_frame = cv2.imread(temp_frame_path)
+                temp_frame = cv2.cvtColor(temp_frame, cv2.COLOR_BGR2RGB)
+                if self.max_res > 0 and max(original_height, original_width) > self.max_res:
+                    temp_frame = cv2.resize(temp_frame, (width, height))
+                frames.append(temp_frame)
+            frames = np.stack(frames, axis=0)
+
+        else:
+            frames, target_fps = self.read_video_frames(sample[self.video_key][0], -1, -1, self.max_res)
+
         depths, fps = video_depth_anything_model.infer_video_depth(
             frames,
             target_fps,
@@ -122,7 +153,10 @@ def process_single(self, sample=None, rank=None):
         )
 
         if self.if_save_visualization:
-            video_name = os.path.basename(sample[self.video_key][0])
+            if self.video_key in sample:
+                video_name = os.path.basename(sample[self.video_key][0])
+            else:
+                video_name = sample[self.frame_field][0].split("/")[-2]
             os.makedirs(self.save_visualization_dir, exist_ok=True)
             processed_video_path = os.path.join(
                 self.save_visualization_dir, os.path.splitext(video_name)[0] + "_src.mp4"
@@ -131,7 +165,7 @@ def process_single(self, sample=None, rank=None):
             self.save_video(frames, processed_video_path, fps=fps)
             self.save_video(depths, depth_vis_path, fps=fps, is_depths=True, grayscale=self.grayscale)
 
-        if self.metric:
+        if self.metric and self.if_save_point_cloud:
             os.makedirs(self.point_cloud_dir_for_metric, exist_ok=True)
             width, height = depths[0].shape[-1], depths[0].shape[-2]
             x, y = np.meshgrid(np.arange(width), np.arange(height))
diff --git a/data_juicer/ops/mapper/video_face_keypoints_mapper.py b/data_juicer/ops/mapper/video_face_keypoints_mapper.py
new file mode 100644
index 00000000000..e8827b8348f
--- /dev/null
+++ b/data_juicer/ops/mapper/video_face_keypoints_mapper.py
@@ -0,0 +1,287 @@
+import os
+import re
+
+import cv2
+import numpy as np
+from pydantic import PositiveInt
+
+import data_juicer
+from data_juicer.ops.load import load_ops
+from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
+from data_juicer.utils.constant import Fields, MetaKeys
+from data_juicer.utils.lazy_loader import LazyLoader
+from data_juicer.utils.mm_utils import SpecialTokens
+from data_juicer.utils.model_utils import get_model, prepare_model
+
+from ..base_op import OPERATORS, TAGGING_OPS, UNFORKABLE, Mapper
+from ..op_fusion import LOADED_VIDEOS
+
+OP_NAME = "video_face_keypoints_mapper"
+
+torch = LazyLoader("torch")
+
+
+@TAGGING_OPS.register_module(OP_NAME)
+@UNFORKABLE.register_module(OP_NAME)
+@OPERATORS.register_module(OP_NAME)
+@LOADED_VIDEOS.register_module(OP_NAME)
+class VideoFaceKeypointsMapper(Mapper):
+    """Detect face keypoints (98 points) on the video.
+
+    Faces are located in every frame with the detector returned by the model
+    loader, and the LDEQ model predicts the keypoints of each detected face.
+    The result is stored in the sample meta under
+    ``MetaKeys.video_face_keypoints_tags``.
+    """
+
+    _accelerator = "cuda"
+
+    def __init__(
+        self,
+        ldeq_model_path: str = "final.pth.tar",
+        if_save_visualization: bool = False,
+        save_visualization_dir: str = DATA_JUICER_ASSETS_CACHE,
+        frame_num: PositiveInt = 3,
+        duration: float = 0,
+        frame_dir: str = DATA_JUICER_ASSETS_CACHE,
+        *args,
+        **kwargs,
+    ):
+        """
+        Initialization method.
+
+        :param ldeq_model_path: The path to the LDEQ model.
+        :param if_save_visualization: Whether to save visualization results.
+        :param save_visualization_dir: The path for saving visualization results.
+        :param frame_num: The number of frames to be extracted uniformly from
+            the video. If it's 1, only the middle frame will be extracted. If
+            it's 2, only the first and the last frames will be extracted. If
+            it's larger than 2, in addition to the first and the last frames,
+            other frames will be extracted uniformly within the video duration.
+            If "duration" > 0, frame_num is the number of frames per segment.
+        :param duration: The duration of each segment in seconds.
+            If 0, frames are extracted from the entire video.
+            If duration > 0, the video is segmented into multiple segments
+            based on duration, and frames are extracted from each segment.
+        :param frame_dir: Output directory to save extracted frames.
+
+        """
+
+        super().__init__(*args, **kwargs)
+        LazyLoader.check_packages(["insightface", "torchinfo"])
+
+        self.model_key = prepare_model(model_type="face_keypoints_ldeq", model_path=ldeq_model_path)
+        self.if_save_visualization = if_save_visualization
+        self.save_visualization_dir = save_visualization_dir
+        self.frame_field = MetaKeys.video_frames
+        self.tag_field_name = MetaKeys.video_face_keypoints_tags
+        self.frame_num = frame_num
+        self.duration = duration
+        self.frame_dir = frame_dir
+
+        self.video_extract_frames_mapper_args = {
+            "frame_sampling_method": "uniform",
+            "frame_num": frame_num,
+            "duration": duration,
+            "frame_dir": frame_dir,
+            "frame_key": MetaKeys.video_frames,
+            "num_proc": None,  # Disable multiprocessing to avoid nested process pool issue
+            "auto_op_parallelism": False,  # Disable auto parallelism to avoid nested process pool issue
+        }
+        self.fused_ops = load_ops([{"video_extract_frames_mapper": self.video_extract_frames_mapper_args}])
+
+    @staticmethod
+    def _natural_sort_key(path):
+        """Return a sort key that compares digit runs numerically.
+
+        Plain string sorting would place "frame_10.jpg" before "frame_2.jpg"
+        and scramble the temporal order of the extracted frames.
+        """
+        # With one capturing group, re.split puts the digit runs at the odd
+        # positions of the result, so only those are converted to int.
+        tokens = re.split(r"([0-9]+)", path)
+        return [int(token) if idx % 2 else token for idx, token in enumerate(tokens)]
+
+    def preprocess(self, face_crop):
+        """Turn an HxWx3 face crop into a normalized float tensor of shape 1x3xHxW."""
+        img = face_crop.transpose(2, 0, 1) / 255.0
+
+        # NOTE(review): these look like the common ImageNet RGB statistics,
+        # while the crops come from cv2.imread (BGR order) - confirm which
+        # channel order the LDEQ checkpoint expects.
+        mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
+        std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
+        img = (img - mean) / std
+
+        return torch.from_numpy(img).float().unsqueeze(0)
+
+    def crop_and_pad(self, image, bbox, target_size=256, padding_ratio=0.05):
+        """
+        Cut a padded square around ``bbox`` and resize it to ``target_size``.
+
+        :param image: The frame to crop from (height and width are read from
+            its first two dimensions).
+        :param bbox: Face box as ``[x1, y1, x2, y2]``.
+        :param target_size: Side length of the returned square crop.
+        :param padding_ratio: Relative enlargement of the square side.
+        :return: ``(crop, (x, y), side)``: the resized crop, the top-left corner
+            of the square in frame coordinates (may be negative), and the side
+            length of the square before resizing.
+        """
+        x1, y1, x2, y2 = bbox
+        w, h = x2 - x1, y2 - y1
+        center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
+
+        # 1. Side length of the square (the maximum of the width and height) and add padding.
+        side = max(w, h)
+        side = int(side * (1 + padding_ratio))
+
+        # 2. Calculate the new coordinates
+        new_x1 = center_x - side // 2
+        new_y1 = center_y - side // 2
+        new_x2 = new_x1 + side
+        new_y2 = new_y1 + side
+
+        # 3. Handling cases that exceed the original image boundaries
+        img_h, img_w = image.shape[:2]
+
+        pad_top = max(0, -new_y1)
+        pad_bottom = max(0, new_y2 - img_h)
+        pad_left = max(0, -new_x1)
+        pad_right = max(0, new_x2 - img_w)
+
+        crop_x1 = max(0, new_x1)
+        crop_y1 = max(0, new_y1)
+        crop_x2 = min(img_w, new_x2)
+        crop_y2 = min(img_h, new_y2)
+
+        crop = image[crop_y1:crop_y2, crop_x1:crop_x2]
+
+        # 4. If it goes out of bounds, fill with black borders.
+        if pad_top > 0 or pad_bottom > 0 or pad_left > 0 or pad_right > 0:
+            crop = cv2.copyMakeBorder(
+                crop, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, value=[0, 0, 0]
+            )
+
+        final_crop = cv2.resize(crop, (target_size, target_size))
+
+        return final_crop, (new_x1, new_y1), side
+
+    def draw_landmarks_on_image(self, image, landmarks_list, color=(0, 255, 0)):
+        """Return a copy of ``image`` with every keypoint drawn as a filled dot."""
+        vis_img = image.copy()
+
+        for kpts in landmarks_list:
+            for i in range(kpts.shape[0]):
+                x, y = int(kpts[i][0]), int(kpts[i][1])
+                cv2.circle(vis_img, (x, y), 2, color, -1)
+
+        return vis_img
+
+    def process_single(self, sample=None, rank=None):
+        """
+        Detect faces and their keypoints in the frames of one sample.
+
+        Frames listed under ``MetaKeys.video_frames`` are used when present;
+        otherwise frames are extracted from the first video of the sample.
+
+        :param sample: The sample to process.
+        :param rank: Rank of the current worker, forwarded to the model loader.
+        :return: The sample, with ``face_keypoints`` and ``face_bboxes`` (one
+            entry per frame, one item per detected face) stored in its meta.
+        :raises ValueError: If a frame image cannot be read.
+        """
+        # check if it's generated already
+        if self.tag_field_name in sample[Fields.meta]:
+            return sample
+
+        given_frames = sample[self.frame_field] if self.frame_field in sample else None
+        videos = sample[self.video_key] if self.video_key in sample else None
+
+        # there is neither a frame list nor a video in this sample
+        if not given_frames and not videos:
+            sample[Fields.meta][self.tag_field_name] = {"face_keypoints": [], "face_bboxes": []}
+            return sample
+
+        ldeq_model, detector, train_args = get_model(model_key=self.model_key, rank=rank, use_cuda=self.use_cuda())
+
+        # Use the device the model actually lives on rather than assuming CUDA,
+        # so inputs and weights always end up on the same device.
+        device = next(ldeq_model.parameters()).device
+
+        if given_frames:
+            frames_path = given_frames
+            # The directory holding the frames doubles as the video name.
+            video_name = os.path.basename(os.path.dirname(frames_path[0]))
+        else:
+            # load videos
+            ds_list = [{"text": SpecialTokens.video, "videos": videos}]
+
+            dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
+            dataset = self.fused_ops[0].run(dataset)
+
+            video_name = os.path.splitext(os.path.basename(videos[0]))[0]
+            frames_root = os.path.join(self.frame_dir, video_name)
+            # Natural ordering keeps frame_10 after frame_2.
+            frames_path = sorted(
+                (os.path.join(frames_root, frame_name) for frame_name in os.listdir(frames_root)),
+                key=self._natural_sort_key,
+            )
+
+        if self.if_save_visualization:
+            os.makedirs(os.path.join(self.save_visualization_dir, video_name), exist_ok=True)
+
+        final_keypoints = []
+        final_bboxes = []
+
+        for frame_idx, frame_path in enumerate(frames_path):
+            img = cv2.imread(frame_path)
+            if img is None:
+                # cv2.imread returns None instead of raising on an unreadable file.
+                raise ValueError(f"Failed to read frame image: {frame_path}")
+
+            faces = detector.get(img)
+            temp_results = []
+            temp_bboxes = []
+
+            for face in faces:
+                bbox = face.bbox.astype(int)  # [x1, y1, x2, y2]
+                temp_bboxes.append(bbox)
+
+                crop, (new_x1, new_y1), side = self.crop_and_pad(img, bbox)
+
+                input_tensor = self.preprocess(crop).to(device)
+
+                with torch.no_grad():
+                    output = ldeq_model(
+                        input_tensor,
+                        mode=train_args.model_mode,
+                        args=train_args,
+                        z0=torch.zeros(1, train_args.z_width, train_args.heatmap_size, train_args.heatmap_size).to(
+                            device
+                        ),
+                    )
+
+                    pred_keypoints = output["keypoints"][0].cpu().numpy()
+
+                # Scale by the square side and shift by its corner to get frame
+                # coordinates (assumes the model outputs crop-normalized points).
+                final_kpts = pred_keypoints * [side, side] + [new_x1, new_y1]
+                temp_results.append(final_kpts)
+
+            final_keypoints.append(temp_results)
+            final_bboxes.append(temp_bboxes)
+
+            if self.if_save_visualization:
+                final_image = self.draw_landmarks_on_image(img, temp_results)
+                cv2.imwrite(
+                    os.path.join(self.save_visualization_dir, video_name, f"vis_{str(frame_idx)}.jpg"),
+                    final_image,
+                )
+
+        sample[Fields.meta][self.tag_field_name] = {
+            "face_keypoints": final_keypoints,
+            "face_bboxes": final_bboxes,
+        }
+
+        return sample
diff --git a/data_juicer/ops/mapper/video_hand_reconstruction_hawor_mapper.py b/data_juicer/ops/mapper/video_hand_reconstruction_hawor_mapper.py
index 892aa965e8a..2ca0b5a208b 100644
--- a/data_juicer/ops/mapper/video_hand_reconstruction_hawor_mapper.py
+++ b/data_juicer/ops/mapper/video_hand_reconstruction_hawor_mapper.py
@@ -148,6 +148,7 @@ def __init__(
         self.hawor_detector_path = hawor_detector_path
         self.frame_num = frame_num
         self.duration = duration
+        self.frame_field = MetaKeys.video_frames
         self.tag_field_name = tag_field_name
         self.frame_dir = frame_dir
         self.thresh = thresh
@@ -348,24 +349,52 @@ def process_single(self, sample=None, rank=None):
             return sample
 
         # there is no video in this sample
-        if self.video_key not in sample or not sample[self.video_key]:
-            return []
+        if (self.video_key not in sample or not sample[self.video_key]) and self.frame_field not in sample:
+            sample[Fields.meta][self.tag_field_name] = {
+                "fov_x": -1,
+                "left_frame_id_list": [],
+                "left_beta_list": [],
+                "left_hand_pose_list": [],
+                "left_global_orient_list": [],
+                "left_transl_list": [],
+                "right_frame_id_list": [],
+                "right_beta_list": [],
+                "right_hand_pose_list": [],
+                "right_global_orient_list": [],
+                "right_transl_list": [],
+            }
+            return sample
 
         # --- 1. FoV Estimation (MoGe) ---
-        ds_list = [{"videos": sample[self.video_key]}]
 
-        dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
-        if Fields.meta not in dataset.features:
-            dataset = dataset.add_column(name=Fields.meta, column=[{}] * dataset.num_rows)
-        dataset = dataset.map(self.fused_ops[0].process, num_proc=None, with_rank=True)
-        res_list = dataset.to_list()
+        if self.frame_field in sample:
+            ds_list = [{MetaKeys.video_frames: sample[self.frame_field]}]
+
+            dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
+            if Fields.meta not in dataset.features:
+                dataset = dataset.add_column(name=Fields.meta, column=[{}] * dataset.num_rows)
+            dataset = dataset.map(self.fused_ops[0].process, num_proc=1, with_rank=True)
+            res_list = dataset.to_list()
+
+            all_fov_x = res_list[0][Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["hfov_list"]
+
+            frames_path = sample[self.frame_field]
+
+        else:
+            ds_list = [{"videos": sample[self.video_key]}]
+
+            dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
+            if Fields.meta not in dataset.features:
+                dataset = dataset.add_column(name=Fields.meta, column=[{}] * dataset.num_rows)
+            dataset = dataset.map(self.fused_ops[0].process, num_proc=1, with_rank=True)
+            res_list = dataset.to_list()
 
-        all_fov_x = res_list[0][Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["hfov_list"]
+            all_fov_x = res_list[0][Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["hfov_list"]
 
-        temp_frame_name = os.path.splitext(os.path.basename(sample[self.video_key][0]))[0]
-        frames_root = os.path.join(self.frame_dir, temp_frame_name)
-        frame_names = os.listdir(frames_root)
-        frames_path = sorted([os.path.join(frames_root, frame_name) for frame_name in frame_names])
+            temp_frame_name = os.path.splitext(os.path.basename(sample[self.video_key][0]))[0]
+            frames_root = os.path.join(self.frame_dir, temp_frame_name)
+            frame_names = os.listdir(frames_root)
+            frames_path = sorted([os.path.join(frames_root, frame_name) for frame_name in frame_names])
 
         images = []
         for temp_frame_path in frames_path:
diff --git a/data_juicer/ops/mapper/video_whole_body_pose_estimation_mapper.py b/data_juicer/ops/mapper/video_whole_body_pose_estimation_mapper.py
index 5e03ab07192..d28becb9ae7 100644
--- a/data_juicer/ops/mapper/video_whole_body_pose_estimation_mapper.py
+++ b/data_juicer/ops/mapper/video_whole_body_pose_estimation_mapper.py
@@ -84,6 +84,7 @@ def __init__(
         )
 
         self.frame_num = frame_num
+        self.frame_field = MetaKeys.video_frames
         self.duration = duration
         self.tag_field_name = tag_field_name
         self.frame_dir = frame_dir
@@ -97,20 +98,31 @@ def process_single(self, sample=None, rank=None):
             return sample
 
         # there is no video in this sample
-        if self.video_key not in sample or not sample[self.video_key]:
-            return []
+        if (self.video_key not in sample or not sample[self.video_key]) and self.frame_field not in sample:
+            sample[Fields.meta][self.tag_field_name] = {
+                "body_keypoints": [],
+                "foot_keypoints": [],
+                "faces_keypoints": [],
+                "hands_keypoints": [],
+                "bbox_results_list": [],
+            }
+            return sample
 
-        # load videos
-        ds_list = [{"text": SpecialTokens.video, "videos": sample[self.video_key]}]
+        if self.frame_field in sample:
+            frames_path = sample[self.frame_field]
+            frames_root = frames_path[0].split("/")[-2]
+        else:
+            # load videos
+            ds_list = [{"text": SpecialTokens.video, "videos": sample[self.video_key]}]
 
-        dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
-        dataset = self.fused_ops[0].run(dataset)
+            dataset = data_juicer.core.data.NestedDataset.from_list(ds_list)
+            dataset = self.fused_ops[0].run(dataset)
 
-        dwpose_model = get_model(self.model_key, rank, self.use_cuda())
+            frames_root = os.path.join(self.frame_dir, os.path.splitext(os.path.basename(sample[self.video_key][0]))[0])
+            frame_names = os.listdir(frames_root)
+            frames_path = sorted([os.path.join(frames_root, frame_name) for frame_name in frame_names])
 
-        frames_root = os.path.join(self.frame_dir, os.path.splitext(os.path.basename(sample[self.video_key][0]))[0])
-        frame_names = os.listdir(frames_root)
-        frames_path = sorted([os.path.join(frames_root, frame_name) for frame_name in frame_names])
+        dwpose_model = get_model(self.model_key, rank, self.use_cuda())
 
         body_keypoints = []
         foot_keypoints = []
diff --git a/data_juicer/utils/constant.py b/data_juicer/utils/constant.py
index 0dc8b8b9550..e247ac4251a 100644
--- a/data_juicer/utils/constant.py
+++ b/data_juicer/utils/constant.py
@@ -62,6 +62,10 @@ class MetaKeys(object):
     video_frame_tags = "video_frame_tags"
     # # video-audio tags
     video_audio_tags = "video_audio_tags"
+    # # video animal pose tags
+    video_animal_pose_tags = "video_animal_pose_tags"
+    # # video face keypoints tags
+    video_face_keypoints_tags = "video_face_keypoints_tags"
     # # video frames
     video_frames = "video_frames"
     # # object segment info in video
diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py
index 751553dc17c..7860f084d48 100644
--- a/data_juicer/utils/model_utils.py
+++ b/data_juicer/utils/model_utils.py
@@ -632,6 +632,59 @@ def _get_model_path(model_path, default_filename, download_key):
     return dwpose_model
 
 
+def prepare_face_keypoints_ldeq_model(model_path, **model_params):
+    """
+    Prepare the LDEQ face-landmark model together with an insightface detector.
+
+    :param model_path: Path to the LDEQ checkpoint. If it does not exist, the
+        default checkpoint cached under ``DJMC`` is used, downloading it from
+        Google Drive first when it is not cached yet.
+    :param model_params: Extra options; ``device`` (default ``"cpu"``) selects
+        where the LDEQ model and the detector run.
+    :return: A tuple ``(ldeq_model, detector, train_args)`` where
+        ``train_args`` is the ``"args"`` entry stored in the checkpoint.
+    """
+    device = model_params.pop("device", "cpu")
+
+    from data_juicer.ops.common.ldeq_face_keypoints_func import LDEQ
+
+    if not os.path.exists(model_path):
+        model_path = os.path.join(DJMC, "final.pth.tar")
+        # Only download when no cached copy exists, so repeated calls do not
+        # fetch the checkpoint again.
+        if not os.path.exists(model_path):
+            os.makedirs(DJMC, exist_ok=True)
+            LazyLoader.check_packages(["gdown"])
+            import gdown
+
+            gdown.download("https://drive.google.com/uc?id=1w73vFdN2IZf4AcNfIULx695ptj1LHe_J", model_path)
+
+    # Load onto the requested device instead of a hard-coded "cuda", which
+    # fails on CPU-only hosts.
+    # NOTE(review): weights_only=False unpickles arbitrary objects; only load
+    # checkpoints from a trusted source.
+    ckpt = torch.load(model_path, map_location=device, weights_only=False)
+    train_args = ckpt["args"]
+    ldeq_model = LDEQ(train_args).to(device)
+    ldeq_model.load_state_dict(ckpt["state_dict"], strict=False)
+    ldeq_model.eval()
+
+    from insightface.app import FaceAnalysis
+
+    # insightface runs on CPU when ctx_id is negative; otherwise pass the CUDA
+    # device index (0 when the device string carries none, e.g. "cuda").
+    device_str = str(device)
+    if device_str.startswith("cuda"):
+        _, _, index = device_str.partition(":")
+        ctx_id = int(index) if index.isdigit() else 0
+    else:
+        ctx_id = -1
+    detector = FaceAnalysis(name="buffalo_l", allowed_modules=["detection"])
+    detector.prepare(ctx_id=ctx_id, det_size=(1280, 1280))
+
+    return ldeq_model, detector, train_args
+
+
 def prepare_fastsam_model(model_path, **model_params):
     device = model_params.pop("device", "cpu")
     model = ultralytics.FastSAM(check_model(model_path)).to(device)
@@ -1280,6 +1308,56 @@ def prepare_vggt_model(model_path, **model_params):
     return model
 
 
+def prepare_vitpose_animal_pose_model(model_path, vitpose_config, **model_params):
+    """
+    Prepare a ViTPose (mmpose) model for animal pose estimation.
+
+    The ViTPose repository is cloned into the assets cache on first use and
+    added to ``sys.path`` before ``mmpose`` is imported.
+
+    :param model_path: Path to the ViTPose checkpoint. If it does not exist,
+        the default APT-36K checkpoint cached under ``DJMC`` is used,
+        downloading it first when it is not cached yet.
+    :param vitpose_config: Model config path relative to the ViTPose
+        repository root. Ignored when falling back to the default checkpoint,
+        which is always paired with its own huge APT-36K config.
+    :param model_params: Extra options; ``device`` (default ``"cpu"``) is
+        forwarded to ``init_pose_model``.
+    :return: The pose model returned by ``mmpose.apis.init_pose_model``.
+    """
+    device = model_params.pop("device", "cpu")
+
+    vitpose_repo_path = os.path.join(DATA_JUICER_ASSETS_CACHE, "ViTPose")
+    if not os.path.exists(vitpose_repo_path):
+        subprocess.run(
+            [
+                "git",
+                "clone",
+                "https://github.com/ViTAE-Transformer/ViTPose.git",
+                vitpose_repo_path,
+            ],
+            check=True,
+        )
+    # Avoid piling up duplicate entries when the model is prepared repeatedly.
+    if vitpose_repo_path not in sys.path:
+        sys.path.append(vitpose_repo_path)
+
+    from mmpose.apis import init_pose_model
+
+    if not os.path.exists(model_path):
+        # The fallback checkpoint only matches the huge APT-36K config, so the
+        # caller's config is overridden together with the checkpoint path.
+        vitpose_config = "configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/apt36k/ViTPose_huge_apt36k_256x192.py"
+        os.makedirs(DJMC, exist_ok=True)
+        model_path = os.path.join(DJMC, "vitpose_huge.pth")
+        # Reuse a previously downloaded checkpoint instead of fetching it on
+        # every call.
+        if not os.path.exists(model_path):
+            wget.download("https://download.cs.stanford.edu/viscam/AiM/ckpt/apt36k.pth", model_path)
+
+    return init_pose_model(os.path.join(vitpose_repo_path, vitpose_config), model_path, device=device)
+
+
 def prepare_vllm_model(pretrained_model_name_or_path, return_processor=False, **model_params):
     """
     Prepare and load a HuggingFace model with the corresponding processor.
@@ -1754,6 +1822,7 @@ def _download_model(local_dir):
     "deepcalib": prepare_deepcalib_model,
     "diffusion": prepare_diffusion_model,
     "dwpose": prepare_dwpose_model,
+    "face_keypoints_ldeq": prepare_face_keypoints_ldeq_model,
     "fasttext": prepare_fasttext_model,
     "fastsam": prepare_fastsam_model,
     "hawor": prepare_hawor_model,
@@ -1769,6 +1838,7 @@ def _download_model(local_dir):
     "simple_aesthetics": prepare_simple_aesthetics_model,
     "spacy": prepare_spacy_model,
     "vggt": prepare_vggt_model,
+    "vitpose_animal_pose": prepare_vitpose_animal_pose_model,
     "video_blip": prepare_video_blip_model,
     "video_depth_anything": prepare_video_depth_anything,
     "vllm": prepare_vllm_model,
diff --git a/docs/Operators.md b/docs/Operators.md
index 186667ba2f9..696a8f333a2 100644
--- a/docs/Operators.md
+++ b/docs/Operators.md
@@ -46,7 +46,7 @@ Data-Juicer 中的算子分为以下 8 种类型。
 | [filter](#filter) | 57 | Filters out low-quality samples. 过滤低质量样本。 |
 | [formatter](#formatter) | 8 | Discovers, loads, and canonicalizes source data. 发现、加载、规范化原始数据。 |
 | [grouper](#grouper) | 3 | Group samples to batched samples. 将样本分组，每一组组成一个批量样本。 |
-| [mapper](#mapper) | 123 | Edits and transforms samples. 对数据样本进行编辑和转换。 |
+| [mapper](#mapper) | 125 | Edits and transforms samples. 对数据样本进行编辑和转换。 |
 | [pipeline](#pipeline) | 2 | Applies dataset-level processing; both input and output are datasets. 执行数据集级别的操作，输入和输出均为完整数据集。 |
 | [selector](#selector) | 5 | Selects top samples based on ranking. 基于排序选取高质量样本。 |
 
@@ -278,6 +278,7 @@ All the specific operators are listed below, each featured with several capabili
 | tool_success_tagger_mapper | 💻CPU 🔴Alpha | Set meta tool_success_count, tool_fail_count, tool_success_ratio. 设置meta tool_success_count、tool_fail_count、tool_success_ratio。 | - | - |
 | usage_counter_mapper | 💻CPU 🟡Beta | Write token usage to meta from choices/usage (OpenAI/Anthropic-style). 从选择/用法 (OpenAI/Anthropic风格) 将令牌用法写入meta。 | - | - |
 | vggt_mapper | 🎬Video 🚀GPU 🟡Beta | Input a video of a single scene, and use VGGT to extract information including Camera Pose, Depth Maps, Point Maps, and 3D Point Tracks. 输入单个场景的视频，并使用VGGT提取包括相机姿态、深度图、点图和3D点轨迹的信息。 | [info](operators/mapper/vggt_mapper.md) | - |
+| video_animal_pose_mapper | 🎬Video 🚀GPU 🟡Beta | Detect quadruped animal pose on the video. 检测视频中四足动物的姿态。 | - | - |
 | video_camera_calibration_static_deepcalib_mapper | 🎬Video 🚀GPU 🟡Beta | Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib. 使用DeepCalib计算静态摄像机的摄像机内部和视场 (FOV)。 | [info](operators/mapper/video_camera_calibration_static_deepcalib_mapper.md) | - |
 | video_camera_calibration_static_moge_mapper | 🎬Video 🚀GPU 🟡Beta | Compute the camera intrinsics and field of view (FOV) for a static camera using Moge-2 (more accurate than DeepCalib). 使用Moge-2 (比DeepCalib更准确) 计算静态摄像机的摄像机内部函数和视场 (FOV)。 | [info](operators/mapper/video_camera_calibration_static_moge_mapper.md) | - |
 | video_camera_pose_mapper | 🎬Video 🚀GPU 🟡Beta | Extract camera poses by leveraging MegaSaM and MoGe-2. 通过利用MegaSaM和MoGe-2提取相机姿势。 | - | - |
@@ -289,6 +290,7 @@ All the specific operators are listed below, each featured with several capabili
 | video_depth_estimation_mapper | 🎬Video 🚀GPU 🟡Beta | Perform depth estimation on the video. 对视频进行深度估计。 | [info](operators/mapper/video_depth_estimation_mapper.md) | - |
 | video_extract_frames_mapper | 🔮Multimodal 💻CPU 🟢Stable | Mapper to extract frames from video files according to specified methods. 映射器根据指定的方法从视频文件中提取帧。 | [info](operators/mapper/video_extract_frames_mapper.md) | - |
 | video_face_blur_mapper | 🎬Video 💻CPU 🟢Stable | Mapper to blur faces detected in videos. 映射器模糊在视频中检测到的人脸。 | [info](operators/mapper/video_face_blur_mapper.md) | - |
+| video_face_keypoints_mapper | 🎬Video 🚀GPU 🟡Beta | Detect face keypoints (98 points) on the video. 检测视频中的人脸关键点 (98个点)。 | - | - |
 | video_ffmpeg_wrapped_mapper | 🎬Video 💻CPU 🟢Stable | Wraps FFmpeg video filters for processing video files in a dataset. 包装FFmpeg视频过滤器，用于处理数据集中的视频文件。 | [info](operators/mapper/video_ffmpeg_wrapped_mapper.md) | - |
 | video_hand_reconstruction_hawor_mapper | 🎬Video 🚀GPU 🟡Beta | Use HaWoR and MoGe-2 for hand reconstruction. 使用HaWoR和MoGe-2进行手部重建。 | - | - |
 | video_hand_reconstruction_mapper | 🎬Video 🚀GPU 🟡Beta | Use the WiLoR model for hand localization and reconstruction. 使用WiLoR模型进行手部定位和重建。 | [info](operators/mapper/video_hand_reconstruction_mapper.md) | - |
diff --git a/tests/ops/data/video10_frames/frame_0.jpg b/tests/ops/data/video10_frames/frame_0.jpg
new file mode 100644
index 00000000000..92b4568bdb4
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_0.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_1.jpg b/tests/ops/data/video10_frames/frame_1.jpg
new file mode 100644
index 00000000000..41ddee137c1
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_1.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_10.jpg b/tests/ops/data/video10_frames/frame_10.jpg
new file mode 100644
index 00000000000..145d743d9c7
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_10.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_11.jpg b/tests/ops/data/video10_frames/frame_11.jpg
new file mode 100644
index 00000000000..81716ca4974
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_11.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_12.jpg b/tests/ops/data/video10_frames/frame_12.jpg
new file mode 100644
index 00000000000..e5bc88c3772
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_12.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_13.jpg b/tests/ops/data/video10_frames/frame_13.jpg
new file mode 100644
index 00000000000..4f635e0a761
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_13.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_14.jpg b/tests/ops/data/video10_frames/frame_14.jpg
new file mode 100644
index 00000000000..df027d0b8b5
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_14.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_15.jpg b/tests/ops/data/video10_frames/frame_15.jpg
new file mode 100644
index 00000000000..13ed8662009
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_15.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_16.jpg b/tests/ops/data/video10_frames/frame_16.jpg
new file mode 100644
index 00000000000..b091dabff0c
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_16.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_17.jpg b/tests/ops/data/video10_frames/frame_17.jpg
new file mode 100644
index 00000000000..4af241d210a
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_17.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_18.jpg b/tests/ops/data/video10_frames/frame_18.jpg
new file mode 100644
index 00000000000..c992c8587c1
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_18.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_2.jpg b/tests/ops/data/video10_frames/frame_2.jpg
new file mode 100644
index 00000000000..83f4f4dca43
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_2.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_3.jpg b/tests/ops/data/video10_frames/frame_3.jpg
new file mode 100644
index 00000000000..366036d458a
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_3.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_4.jpg b/tests/ops/data/video10_frames/frame_4.jpg
new file mode 100644
index 00000000000..b561429e2ce
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_4.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_5.jpg b/tests/ops/data/video10_frames/frame_5.jpg
new file mode 100644
index 00000000000..325eded4929
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_5.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_6.jpg b/tests/ops/data/video10_frames/frame_6.jpg
new file mode 100644
index 00000000000..e1a8b10add7
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_6.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_7.jpg b/tests/ops/data/video10_frames/frame_7.jpg
new file mode 100644
index 00000000000..2e4763a65e7
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_7.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_8.jpg b/tests/ops/data/video10_frames/frame_8.jpg
new file mode 100644
index 00000000000..f1f908ad610
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_8.jpg differ
diff --git a/tests/ops/data/video10_frames/frame_9.jpg b/tests/ops/data/video10_frames/frame_9.jpg
new file mode 100644
index 00000000000..a6253c7cc3c
Binary files /dev/null and b/tests/ops/data/video10_frames/frame_9.jpg differ
diff --git a/tests/ops/data/video11_frames/frame_0.jpg b/tests/ops/data/video11_frames/frame_0.jpg
new file mode 100644
index 00000000000..0ea5258e987
Binary files /dev/null and b/tests/ops/data/video11_frames/frame_0.jpg differ
diff --git a/tests/ops/data/video11_frames/frame_1.jpg b/tests/ops/data/video11_frames/frame_1.jpg
new file mode 100644
index 00000000000..e44b01893b9
Binary files /dev/null and b/tests/ops/data/video11_frames/frame_1.jpg differ
diff --git a/tests/ops/data/video11_frames/frame_10.jpg b/tests/ops/data/video11_frames/frame_10.jpg
new file mode 100644
index 00000000000..387b5a06c8d
Binary files /dev/null and b/tests/ops/data/video11_frames/frame_10.jpg differ
diff --git a/tests/ops/data/video11_frames/frame_2.jpg b/tests/ops/data/video11_frames/frame_2.jpg
new file mode 100644
index 00000000000..6f2f71994f7
Binary files /dev/null and b/tests/ops/data/video11_frames/frame_2.jpg differ
diff --git a/tests/ops/data/video11_frames/frame_3.jpg b/tests/ops/data/video11_frames/frame_3.jpg
new file mode 100644
index 00000000000..c4c56f2a1f1
Binary files /dev/null and b/tests/ops/data/video11_frames/frame_3.jpg differ
diff --git a/tests/ops/data/video11_frames/frame_4.jpg b/tests/ops/data/video11_frames/frame_4.jpg
new file mode 100644
index 00000000000..9c2dd30eb93
Binary files /dev/null and b/tests/ops/data/video11_frames/frame_4.jpg differ
diff --git a/tests/ops/data/video11_frames/frame_5.jpg b/tests/ops/data/video11_frames/frame_5.jpg
new file mode 100644
index 00000000000..b614641e220
Binary files /dev/null and b/tests/ops/data/video11_frames/frame_5.jpg differ
diff --git a/tests/ops/data/video11_frames/frame_6.jpg b/tests/ops/data/video11_frames/frame_6.jpg
new file mode 100644
index 00000000000..8140d369029
Binary files /dev/null and b/tests/ops/data/video11_frames/frame_6.jpg differ
diff --git a/tests/ops/data/video11_frames/frame_7.jpg b/tests/ops/data/video11_frames/frame_7.jpg
new file mode 100644
index 00000000000..95caf5b9d8b
Binary files /dev/null and b/tests/ops/data/video11_frames/frame_7.jpg differ
diff --git a/tests/ops/data/video11_frames/frame_8.jpg b/tests/ops/data/video11_frames/frame_8.jpg
new file mode 100644
index 00000000000..7940f220bc2
Binary files /dev/null and b/tests/ops/data/video11_frames/frame_8.jpg differ
diff --git a/tests/ops/data/video11_frames/frame_9.jpg b/tests/ops/data/video11_frames/frame_9.jpg
new file mode 100644
index 00000000000..1ce34e43e9a
Binary files /dev/null and b/tests/ops/data/video11_frames/frame_9.jpg differ
diff --git a/tests/ops/data/video12_frames/frame_0.jpg b/tests/ops/data/video12_frames/frame_0.jpg
new file mode 100644
index 00000000000..0a2770b8afd
Binary files /dev/null and b/tests/ops/data/video12_frames/frame_0.jpg differ
diff --git a/tests/ops/data/video12_frames/frame_1.jpg b/tests/ops/data/video12_frames/frame_1.jpg
new file mode 100644
index 00000000000..d9badf58d54
Binary files /dev/null and b/tests/ops/data/video12_frames/frame_1.jpg differ
diff --git a/tests/ops/data/video12_frames/frame_2.jpg b/tests/ops/data/video12_frames/frame_2.jpg
new file mode 100644
index 00000000000..9899749365b
Binary files /dev/null and b/tests/ops/data/video12_frames/frame_2.jpg differ
diff --git a/tests/ops/data/video13.mp4 b/tests/ops/data/video13.mp4
new file mode 100644
index 00000000000..07dadab9777
Binary files /dev/null and b/tests/ops/data/video13.mp4 differ
diff --git a/tests/ops/data/video13_frames/frame_0.jpg b/tests/ops/data/video13_frames/frame_0.jpg
new file mode 100644
index 00000000000..69fdbdb95db
Binary files /dev/null and b/tests/ops/data/video13_frames/frame_0.jpg differ
diff --git a/tests/ops/data/video13_frames/frame_1.jpg b/tests/ops/data/video13_frames/frame_1.jpg
new file mode 100644
index 00000000000..26a3aebc8a4
Binary files /dev/null and b/tests/ops/data/video13_frames/frame_1.jpg differ
diff --git a/tests/ops/data/video13_frames/frame_2.jpg b/tests/ops/data/video13_frames/frame_2.jpg
new file mode 100644
index 00000000000..14d4cf4e40c
Binary files /dev/null and b/tests/ops/data/video13_frames/frame_2.jpg differ
diff --git a/tests/ops/data/video13_frames/frame_3.jpg b/tests/ops/data/video13_frames/frame_3.jpg
new file mode 100644
index 00000000000..de4b33627db
Binary files /dev/null and b/tests/ops/data/video13_frames/frame_3.jpg differ
diff --git a/tests/ops/data/video13_frames/frame_4.jpg b/tests/ops/data/video13_frames/frame_4.jpg
new file mode 100644
index 00000000000..62010c2da79
Binary files /dev/null and b/tests/ops/data/video13_frames/frame_4.jpg differ
diff --git a/tests/ops/data/video13_frames/frame_5.jpg b/tests/ops/data/video13_frames/frame_5.jpg
new file mode 100644
index 00000000000..80eff35f165
Binary files /dev/null and b/tests/ops/data/video13_frames/frame_5.jpg differ
diff --git a/tests/ops/data/video13_frames/frame_6.jpg b/tests/ops/data/video13_frames/frame_6.jpg
new file mode 100644
index 00000000000..2db14afb6e7
Binary files /dev/null and b/tests/ops/data/video13_frames/frame_6.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_0.jpg b/tests/ops/data/video3_frames/frame_0.jpg
new file mode 100644
index 00000000000..a83ce7ebbf9
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_0.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_1.jpg b/tests/ops/data/video3_frames/frame_1.jpg
new file mode 100644
index 00000000000..cdb782bf89c
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_1.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_10.jpg b/tests/ops/data/video3_frames/frame_10.jpg
new file mode 100644
index 00000000000..25a225101c3
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_10.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_11.jpg b/tests/ops/data/video3_frames/frame_11.jpg
new file mode 100644
index 00000000000..6a6991faa14
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_11.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_12.jpg b/tests/ops/data/video3_frames/frame_12.jpg
new file mode 100644
index 00000000000..5cfa3a8b97b
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_12.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_13.jpg b/tests/ops/data/video3_frames/frame_13.jpg
new file mode 100644
index 00000000000..a838f30cfda
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_13.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_14.jpg b/tests/ops/data/video3_frames/frame_14.jpg
new file mode 100644
index 00000000000..73706b82699
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_14.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_15.jpg b/tests/ops/data/video3_frames/frame_15.jpg
new file mode 100644
index 00000000000..ec5fde5bdfd
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_15.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_2.jpg b/tests/ops/data/video3_frames/frame_2.jpg
new file mode 100644
index 00000000000..880fe38d2b0
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_2.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_3.jpg b/tests/ops/data/video3_frames/frame_3.jpg
new file mode 100644
index 00000000000..4e34c6f67b6
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_3.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_4.jpg b/tests/ops/data/video3_frames/frame_4.jpg
new file mode 100644
index 00000000000..0df1ff488a9
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_4.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_5.jpg b/tests/ops/data/video3_frames/frame_5.jpg
new file mode 100644
index 00000000000..80beb69203c
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_5.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_6.jpg b/tests/ops/data/video3_frames/frame_6.jpg
new file mode 100644
index 00000000000..a63c000d4cc
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_6.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_7.jpg b/tests/ops/data/video3_frames/frame_7.jpg
new file mode 100644
index 00000000000..d6fb419ec2c
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_7.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_8.jpg b/tests/ops/data/video3_frames/frame_8.jpg
new file mode 100644
index 00000000000..01f05c8d89e
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_8.jpg differ
diff --git a/tests/ops/data/video3_frames/frame_9.jpg b/tests/ops/data/video3_frames/frame_9.jpg
new file mode 100644
index 00000000000..6b1e628346e
Binary files /dev/null and b/tests/ops/data/video3_frames/frame_9.jpg differ
diff --git a/tests/ops/data/video4_frames/frame_0.jpg b/tests/ops/data/video4_frames/frame_0.jpg
new file mode 100644
index 00000000000..e8d0c2db6fe
Binary files /dev/null and b/tests/ops/data/video4_frames/frame_0.jpg differ
diff --git a/tests/ops/data/video4_frames/frame_1.jpg b/tests/ops/data/video4_frames/frame_1.jpg
new file mode 100644
index 00000000000..6fcfd5cb65b
Binary files /dev/null and b/tests/ops/data/video4_frames/frame_1.jpg differ
diff --git a/tests/ops/data/video4_frames/frame_2.jpg b/tests/ops/data/video4_frames/frame_2.jpg
new file mode 100644
index 00000000000..b50f3f46c8f
Binary files /dev/null and b/tests/ops/data/video4_frames/frame_2.jpg differ
diff --git a/tests/ops/data/video4_frames/frame_3.jpg b/tests/ops/data/video4_frames/frame_3.jpg
new file mode 100644
index 00000000000..cb236bf0f42
Binary files /dev/null and b/tests/ops/data/video4_frames/frame_3.jpg differ
diff --git a/tests/ops/data/video4_frames/frame_4.jpg b/tests/ops/data/video4_frames/frame_4.jpg
new file mode 100644
index 00000000000..f20342acc95
Binary files /dev/null and b/tests/ops/data/video4_frames/frame_4.jpg differ
diff --git a/tests/ops/data/video4_frames/frame_5.jpg b/tests/ops/data/video4_frames/frame_5.jpg
new file mode 100644
index 00000000000..a2fcf29f845
Binary files /dev/null and b/tests/ops/data/video4_frames/frame_5.jpg differ
diff --git a/tests/ops/data/video4_frames/frame_6.jpg b/tests/ops/data/video4_frames/frame_6.jpg
new file mode 100644
index 00000000000..0e2f58805aa
Binary files /dev/null and b/tests/ops/data/video4_frames/frame_6.jpg differ
diff --git a/tests/ops/mapper/test_vggt_mapper.py b/tests/ops/mapper/test_vggt_mapper.py
index 909d8265e61..8be87e4f606 100644
--- a/tests/ops/mapper/test_vggt_mapper.py
+++ b/tests/ops/mapper/test_vggt_mapper.py
@@ -1,13 +1,14 @@
 import os
 import unittest
 import numpy as np
+import tempfile
+import shutil
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.vggt_mapper import VggtMapper
 from data_juicer.utils.mm_utils import SpecialTokens
 from data_juicer.utils.constant import Fields, MetaKeys
 from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
-from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
 
 
 class VggtMapperTest(DataJuicerTestCaseBase):
@@ -16,6 +17,17 @@ class VggtMapperTest(DataJuicerTestCaseBase):
     vid11_path = os.path.join(data_path, 'video11.mp4')
     vid10_path = os.path.join(data_path, 'video10.mp4')
 
+
+    def setUp(self):  # allocate a fresh scratch directory for each test
+        self.tmp_dir = tempfile.mkdtemp()  # TemporaryDirectory().name left a dangling path: the unreferenced object is finalized at once and deletes the directory
+        super().setUp()
+
+    def tearDown(self):  # run the base-class teardown, then drop this test's scratch directory
+        super().tearDown()
+        if os.path.exists(self.tmp_dir):  # nothing to remove if the path was never created
+            shutil.rmtree(self.tmp_dir)
+            
+
     def test(self):
         ds_list = [{
             'query_points': [[320.0, 200.0], [500.72, 100.94]],
@@ -29,7 +41,7 @@ def test(self):
             vggt_model_path="facebook/VGGT-1B",
             frame_num=2,
             duration=2,
-            frame_dir=DATA_JUICER_ASSETS_CACHE,
+            frame_dir=os.path.join(self.tmp_dir, "vggt_test1"),
             if_output_camera_parameters=True,
             if_output_depth_maps=True,
             if_output_point_maps_from_projection=True,
@@ -90,7 +102,7 @@ def test_mul_proc(self):
             vggt_model_path="facebook/VGGT-1B",
             frame_num=2,
             duration=2,
-            frame_dir=DATA_JUICER_ASSETS_CACHE,
+            frame_dir=os.path.join(self.tmp_dir, "vggt_test2"),
             if_output_camera_parameters=True,
             if_output_depth_maps=True,
             if_output_point_maps_from_projection=True,
@@ -151,7 +163,7 @@ def test_point_maps_from_unprojection(self):
             vggt_model_path="facebook/VGGT-1B",
             frame_num=2,
             duration=2,
-            frame_dir=DATA_JUICER_ASSETS_CACHE,
+            frame_dir=os.path.join(self.tmp_dir, "vggt_test3"),
             if_output_camera_parameters=False,
             if_output_depth_maps=False,
             if_output_point_maps_from_projection=False,
diff --git a/tests/ops/mapper/test_video_animal_pose_mapper.py b/tests/ops/mapper/test_video_animal_pose_mapper.py
new file mode 100644
index 00000000000..20aa2e26393
--- /dev/null
+++ b/tests/ops/mapper/test_video_animal_pose_mapper.py
@@ -0,0 +1,109 @@
+import os
+import unittest
+import numpy as np
+import tempfile
+import shutil
+
+from data_juicer.core.data import NestedDataset as Dataset
+from data_juicer.ops.mapper.video_animal_pose_mapper import \
+    VideoAnimalPoseMapper
+from data_juicer.utils.constant import Fields, MetaKeys
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+
+@unittest.skip('The current code can automatically configure the environment, but after the initial setup (including mmpose), the user need to re-run the command for it to work properly.')
+class VideoAnimalPoseMapperTest(DataJuicerTestCaseBase):  # output-shape tests for VideoAnimalPoseMapper on video13
+    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
+                             'data')
+    vid13_path = os.path.join(data_path, 'video13.mp4')
+    vid13_frames_dir = os.path.join(data_path, 'video13_frames')
+    vid13_frames_path = []
+    for x in sorted(os.listdir(vid13_frames_dir)):  # os.listdir order is arbitrary, and the tests below index frame 3
+        vid13_frames_path.append(os.path.join(vid13_frames_dir, x))
+
+    # Expected lengths/shapes per sample; both samples in each test use the same video13 input.
+    tgt_list = [{
+        "frame_length": 7,
+        "frame3_pose_list_shape": [3, 17, 2],
+        "frame3_pose_score_list_shape": [3, 17],
+        "frame3_animal_bboxes_shape": [3, 4],
+    }, {
+        "frame_length": 7,
+        "frame3_pose_list_shape": [3, 17, 2],
+        "frame3_pose_score_list_shape": [3, 17],
+        "frame3_animal_bboxes_shape": [3, 4],
+    }]
+
+
+    def setUp(self):  # allocate a fresh scratch directory for each test
+        self.tmp_dir = tempfile.mkdtemp()  # TemporaryDirectory().name left a dangling path: the unreferenced object is finalized at once and deletes the directory
+        super().setUp()
+
+    def tearDown(self):  # run the base-class teardown, then drop this test's scratch directory
+        super().tearDown()
+        if os.path.exists(self.tmp_dir):
+            shutil.rmtree(self.tmp_dir)
+
+
+    def test(self):  # run the op on raw video files with two worker processes
+        ds_list = [{
+            'videos': [self.vid13_path]
+        },  {
+            'videos': [self.vid13_path]
+        }]
+
+        op = VideoAnimalPoseMapper(
+            vitpose_model_path="apt36k.pth",
+            vitpose_config="configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/apt36k/ViTPose_huge_apt36k_256x192.py",
+            yoloe_model_path="yoloe-26x-seg.pt",
+            if_save_visualization=True,
+            save_visualization_dir=os.path.join(self.tmp_dir, "animal_pose_vis1"),
+            frame_num=1,
+            duration=1,
+            frame_dir=os.path.join(self.tmp_dir, "animal_pose_test")
+        )
+
+        dataset = Dataset.from_list(ds_list)
+        if Fields.meta not in dataset.features:  # add an empty meta column when the dataset has none
+            dataset = dataset.add_column(name=Fields.meta,
+                                         column=[{}] * dataset.num_rows)
+        dataset = dataset.map(op.process, num_proc=2, with_rank=True)
+        res_list = dataset.to_list()
+
+        for sample, target in zip(res_list, self.tgt_list):  # compare the overall length and the shapes at index 3
+            self.assertEqual(len(sample[Fields.meta][MetaKeys.video_animal_pose_tags]["pose_list"]), target["frame_length"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_animal_pose_tags]["pose_list"][3]).shape), target["frame3_pose_list_shape"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_animal_pose_tags]["pose_score_list"][3]).shape), target["frame3_pose_score_list_shape"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_animal_pose_tags]["animal_bboxes"][3]).shape), target["frame3_animal_bboxes_shape"])
+
+
+    def test_from_extracted_frames(self):  # run the op on the pre-extracted frame list in a single process
+        ds_list = [{
+            MetaKeys.video_frames: self.vid13_frames_path
+        },  {
+            MetaKeys.video_frames: self.vid13_frames_path
+        }]
+
+        op = VideoAnimalPoseMapper(
+            vitpose_model_path="apt36k.pth",
+            vitpose_config="configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/apt36k/ViTPose_huge_apt36k_256x192.py",
+            yoloe_model_path="yoloe-26x-seg.pt",
+            if_save_visualization=True,
+            save_visualization_dir=os.path.join(self.tmp_dir, "animal_pose_vis2"),
+        )
+
+        dataset = Dataset.from_list(ds_list)
+        if Fields.meta not in dataset.features:  # add an empty meta column when the dataset has none
+            dataset = dataset.add_column(name=Fields.meta,
+                                         column=[{}] * dataset.num_rows)
+        dataset = dataset.map(op.process, num_proc=1, with_rank=True)
+        res_list = dataset.to_list()
+
+        for sample, target in zip(res_list, self.tgt_list):  # compare the overall length and the shapes at index 3
+            self.assertEqual(len(sample[Fields.meta][MetaKeys.video_animal_pose_tags]["pose_list"]), target["frame_length"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_animal_pose_tags]["pose_list"][3]).shape), target["frame3_pose_list_shape"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_animal_pose_tags]["pose_score_list"][3]).shape), target["frame3_pose_score_list_shape"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_animal_pose_tags]["animal_bboxes"][3]).shape), target["frame3_animal_bboxes_shape"])
+
+
+if __name__ == '__main__':  # allow running this test module directly as a script
+    unittest.main()
\ No newline at end of file
diff --git a/tests/ops/mapper/test_video_camera_calibration_static_deepcalib_mapper.py b/tests/ops/mapper/test_video_camera_calibration_static_deepcalib_mapper.py
index ab816e67098..66cb8931fa8 100644
--- a/tests/ops/mapper/test_video_camera_calibration_static_deepcalib_mapper.py
+++ b/tests/ops/mapper/test_video_camera_calibration_static_deepcalib_mapper.py
@@ -1,13 +1,14 @@
 import os
 import unittest
 import numpy as np
+import tempfile
+import shutil
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.video_camera_calibration_static_deepcalib_mapper import VideoCameraCalibrationStaticDeepcalibMapper
 from data_juicer.utils.mm_utils import SpecialTokens
 from data_juicer.utils.constant import Fields, MetaKeys
 from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
-from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
 
 
 class VideoCameraCalibrationStaticDeepcalibMapperTest(DataJuicerTestCaseBase):
@@ -17,7 +18,16 @@ class VideoCameraCalibrationStaticDeepcalibMapperTest(DataJuicerTestCaseBase):
     vid4_path = os.path.join(data_path, 'video4.mp4')
     vid12_path = os.path.join(data_path, 'video12.mp4')
 
-    def _run_and_assert(self, num_proc):
+    def setUp(self):  # allocate a fresh scratch directory for each test
+        self.tmp_dir = tempfile.mkdtemp()  # TemporaryDirectory().name left a dangling path: the unreferenced object is finalized at once and deletes the directory
+        super().setUp()
+
+    def tearDown(self):  # run the base-class teardown, then drop this test's scratch directory
+        super().tearDown()
+        if os.path.exists(self.tmp_dir):  # nothing to remove if the path was never created
+            shutil.rmtree(self.tmp_dir)
+
+    def _run_and_assert(self, num_proc, output_frame_dir):
         ds_list = [{
             'videos': [self.vid3_path]
         },  {
@@ -46,9 +56,9 @@ def _run_and_assert(self, num_proc):
             model_path="weights_10_0.02.h5",
             frame_num=1,
             duration=1,
-            frame_dir=DATA_JUICER_ASSETS_CACHE,
+            frame_dir=output_frame_dir,
             if_output_info=True,
-            output_info_dir=DATA_JUICER_ASSETS_CACHE,
+            output_info_dir=os.path.join(self.tmp_dir, "deepcalib_info"),
         )
         dataset = Dataset.from_list(ds_list)
         if Fields.meta not in dataset.features:
@@ -66,10 +76,10 @@ def _run_and_assert(self, num_proc):
 
 
     def test(self):
-        self._run_and_assert(num_proc=1)
+        self._run_and_assert(num_proc=1, output_frame_dir=os.path.join(self.tmp_dir, "deepcalib_test1"))  # single process; frames go under this test's temp dir
 
     def test_mul_proc(self):
-        self._run_and_assert(num_proc=2)
+        self._run_and_assert(num_proc=2, output_frame_dir=os.path.join(self.tmp_dir, "deepcalib_test2"))  # two processes; frames go under this test's temp dir
 
 
 if __name__ == '__main__':
diff --git a/tests/ops/mapper/test_video_camera_calibration_static_moge_mapper.py b/tests/ops/mapper/test_video_camera_calibration_static_moge_mapper.py
index 1aa03a5fbd4..97028aac7f6 100644
--- a/tests/ops/mapper/test_video_camera_calibration_static_moge_mapper.py
+++ b/tests/ops/mapper/test_video_camera_calibration_static_moge_mapper.py
@@ -1,13 +1,14 @@
 import os
 import unittest
 import numpy as np
+import tempfile
+import shutil
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.video_camera_calibration_static_moge_mapper import VideoCameraCalibrationStaticMogeMapper
 from data_juicer.utils.mm_utils import SpecialTokens
 from data_juicer.utils.constant import Fields, MetaKeys
 from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
-from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
 
 
 class VideoCameraCalibrationStaticMogeMapperTest(DataJuicerTestCaseBase):
@@ -16,8 +17,31 @@ class VideoCameraCalibrationStaticMogeMapperTest(DataJuicerTestCaseBase):
     vid3_path = os.path.join(data_path, 'video3.mp4')
     vid4_path = os.path.join(data_path, 'video4.mp4')
     vid12_path = os.path.join(data_path, 'video12.mp4')
+    vid3_frames_dir = os.path.join(data_path, 'video3_frames')
+    vid4_frames_dir = os.path.join(data_path, 'video4_frames')
+    vid12_frames_dir = os.path.join(data_path, 'video12_frames')
+    vid3_frames_path = []  # os.listdir order is arbitrary; sorting by (name length, name) puts frame_2 before frame_10
+    vid4_frames_path = []
+    vid12_frames_path = []
+    for x in sorted(os.listdir(vid3_frames_dir), key=lambda n: (len(n), n)):
+        vid3_frames_path.append(os.path.join(vid3_frames_dir, x))
+    for x in sorted(os.listdir(vid4_frames_dir), key=lambda n: (len(n), n)):
+        vid4_frames_path.append(os.path.join(vid4_frames_dir, x))
+    for x in sorted(os.listdir(vid12_frames_dir), key=lambda n: (len(n), n)):
+        vid12_frames_path.append(os.path.join(vid12_frames_dir, x))
 
-    def _run_and_assert(self, num_proc):
+    
+    def setUp(self):  # allocate a fresh scratch directory for each test
+        self.tmp_dir = tempfile.mkdtemp()  # TemporaryDirectory().name left a dangling path: the unreferenced object is finalized at once and deletes the directory
+        super().setUp()
+
+    def tearDown(self):  # run the base-class teardown, then drop this test's scratch directory
+        super().tearDown()
+        if os.path.exists(self.tmp_dir):  # nothing to remove if the path was never created
+            shutil.rmtree(self.tmp_dir)
+
+
+    def _run_and_assert(self, num_proc, output_frame_dir):
         ds_list = [{
             'videos': [self.vid3_path]
         },  {
@@ -52,9 +76,9 @@ def _run_and_assert(self, num_proc):
             model_path="Ruicheng/moge-2-vitl",
             frame_num=1,
             duration=1,
-            frame_dir=DATA_JUICER_ASSETS_CACHE,
+            frame_dir=output_frame_dir,
             if_output_info=True,
-            output_info_dir=DATA_JUICER_ASSETS_CACHE,
+            output_info_dir=os.path.join(self.tmp_dir, "moge_info"),
             if_output_points_info=True,
             if_output_depth_info=True,
             if_output_mask_info=True,
@@ -79,10 +103,74 @@ def _run_and_assert(self, num_proc):
 
 
     def test(self):
-        self._run_and_assert(num_proc=1)
+        self._run_and_assert(num_proc=1, output_frame_dir=os.path.join(self.tmp_dir, "moge_test1"))  # single process; frames go under this test's temp dir
 
     def test_mul_proc(self):
-        self._run_and_assert(num_proc=2)
+        self._run_and_assert(num_proc=2, output_frame_dir=os.path.join(self.tmp_dir, "moge_test2"))  # two processes; frames go under this test's temp dir
+
+
+    def _run_and_assert_for_extracted_frames(self, num_proc):  # run the op on pre-extracted frame lists with num_proc workers and check output shapes
+        ds_list = [{
+            MetaKeys.video_frames: self.vid3_frames_path,
+        },  {
+            MetaKeys.video_frames: self.vid4_frames_path,
+        },  {
+            MetaKeys.video_frames: self.vid12_frames_path,
+        }]
+
+        tgt_list = [{"frame_names_shape": [16],  # leading dimension of every expected shape is the sample's frame count (16, 7, 3)
+            "intrinsics_list_shape": [16, 3, 3],
+            "hfov_list_shape": [16],
+            "vfov_list_shape": [16],
+            "points_list_shape": [16, 640, 362, 3],
+            "depth_list_shape": [16, 640, 362],
+            "mask_list_shape": [16, 640, 362]},
+            {"frame_names_shape": [7],
+            "intrinsics_list_shape": [7, 3, 3],
+            "hfov_list_shape": [7],
+            "vfov_list_shape": [7],
+            "points_list_shape": [7, 360, 480, 3],
+            "depth_list_shape": [7, 360, 480],
+            "mask_list_shape": [7, 360, 480]},
+            {"frame_names_shape": [3],
+            "intrinsics_list_shape": [3, 3, 3],
+            "hfov_list_shape": [3],
+            "vfov_list_shape": [3],
+            "points_list_shape": [3, 1080, 1920, 3],
+            "depth_list_shape": [3, 1080, 1920],
+            "mask_list_shape": [3, 1080, 1920]}]
+
+        op = VideoCameraCalibrationStaticMogeMapper(  # no frame_num/duration/frame_dir here: samples already carry frame paths
+            model_path="Ruicheng/moge-2-vitl",
+            if_output_info=True,
+            output_info_dir=os.path.join(self.tmp_dir, "moge_info"),
+            if_output_points_info=True,
+            if_output_depth_info=True,
+            if_output_mask_info=True,
+        )
+
+        dataset = Dataset.from_list(ds_list)
+        if Fields.meta not in dataset.features:  # add an empty meta column when the dataset has none
+            dataset = dataset.add_column(name=Fields.meta,
+                                         column=[{}] * dataset.num_rows)
+        dataset = dataset.map(op.process, num_proc=num_proc, with_rank=True)
+        res_list = dataset.to_list()
+
+        # Compare the array shape of each output field against the per-sample expectation.
+        for sample, target in zip(res_list, tgt_list):
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["frame_names"]).shape), target["frame_names_shape"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["intrinsics_list"]).shape), target["intrinsics_list_shape"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["hfov_list"]).shape), target["hfov_list_shape"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["vfov_list"]).shape), target["vfov_list_shape"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["points_list"]).shape), target["points_list_shape"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["depth_list"]).shape), target["depth_list_shape"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["mask_list"]).shape), target["mask_list_shape"])
+
+    def test_for_extracted_frames(self):  # extracted-frames path, single process
+        self._run_and_assert_for_extracted_frames(num_proc=1)
+
+    def test_mul_proc_for_extracted_frames(self):  # extracted-frames path, two processes
+        self._run_and_assert_for_extracted_frames(num_proc=2)
 
 
 if __name__ == '__main__':
diff --git a/tests/ops/mapper/test_video_camera_pose_mapper.py b/tests/ops/mapper/test_video_camera_pose_mapper.py
index 178a2f6b4bb..4f321232028 100644
--- a/tests/ops/mapper/test_video_camera_pose_mapper.py
+++ b/tests/ops/mapper/test_video_camera_pose_mapper.py
@@ -1,13 +1,14 @@
 import os
 import unittest
 import numpy as np
+import tempfile
+import shutil
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.video_camera_pose_mapper import VideoCameraPoseMapper
 from data_juicer.utils.mm_utils import SpecialTokens
 from data_juicer.utils.constant import Fields, MetaKeys
 from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
-from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
 
 
 
@@ -18,8 +19,17 @@ class VideoCameraPoseMapperTest(DataJuicerTestCaseBase):
     vid11_path = os.path.join(data_path, 'video11.mp4')
     vid12_path = os.path.join(data_path, 'video12.mp4')
 
+    def setUp(self):  # give every test its own scratch directory (removed in tearDown)
+        self.tmp_dir = tempfile.mkdtemp()  # TemporaryDirectory().name only left a path: the unreferenced object deletes its directory when finalized
+        super().setUp()
 
-    def _run_and_assert(self, num_proc):
+    def tearDown(self):  # delete the per-test scratch directory after the base-class teardown
+        super().tearDown()
+        if os.path.exists(self.tmp_dir):  # guard: skip cleanup when the path is absent
+            shutil.rmtree(self.tmp_dir)
+
+
+    def _run_and_assert(self, num_proc, output_frame_dir):
         ds_list = [{
             'videos': [self.vid3_path]
         },  {
@@ -45,11 +55,11 @@ def _run_and_assert(self, num_proc):
             moge_model_path="Ruicheng/moge-2-vitl",
             frame_num=1,
             duration=1,
-            frame_dir=DATA_JUICER_ASSETS_CACHE,
+            frame_dir=output_frame_dir,
             if_output_moge_info=False,
-            moge_output_info_dir=DATA_JUICER_ASSETS_CACHE,
+            moge_output_info_dir=os.path.join(self.tmp_dir, "moge_info"),
             if_save_info=True,
-            output_info_dir=DATA_JUICER_ASSETS_CACHE,
+            output_info_dir=os.path.join(self.tmp_dir, "camera_pose_info"),
             num_proc=num_proc,
         )
 
@@ -68,10 +78,10 @@ def _run_and_assert(self, num_proc):
 
 
     def test(self):
-        self._run_and_assert(num_proc=1)
+        self._run_and_assert(num_proc=1, output_frame_dir=os.path.join(self.tmp_dir, "camera_pose_test1"))
 
     def test_mul_proc(self):
-        self._run_and_assert(num_proc=2)
+        self._run_and_assert(num_proc=2, output_frame_dir=os.path.join(self.tmp_dir, "camera_pose_test2"))
 
 
 if __name__ == '__main__':
diff --git a/tests/ops/mapper/test_video_depth_estimation_mapper.py b/tests/ops/mapper/test_video_depth_estimation_mapper.py
index 8209f5339bd..53b8c17cd55 100644
--- a/tests/ops/mapper/test_video_depth_estimation_mapper.py
+++ b/tests/ops/mapper/test_video_depth_estimation_mapper.py
@@ -1,13 +1,15 @@
 import os
 import unittest
 import numpy as np
+import tempfile
+import shutil
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.video_depth_estimation_mapper import \
     VideoDepthEstimationMapper
 from data_juicer.utils.constant import Fields, MetaKeys
 from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
-from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
+
 
 @unittest.skip("sys.path.append works locally but fails in the unittest pipeline.")
 class VideoDepthEstimationMapperTest(DataJuicerTestCaseBase):
@@ -15,6 +17,14 @@ class VideoDepthEstimationMapperTest(DataJuicerTestCaseBase):
                              'data')
     vid3_path = os.path.join(data_path, 'video3.mp4')
     vid4_path = os.path.join(data_path, 'video4.mp4')
+    vid10_frames_dir = os.path.join(data_path, 'video10_frames')  # directories of pre-extracted frames
+    vid11_frames_dir = os.path.join(data_path, 'video11_frames')
+    vid10_frames_path = []
+    vid11_frames_path = []
+    for x in sorted(os.listdir(vid10_frames_dir)):  # os.listdir() order is arbitrary; sort (lexicographically) for a deterministic frame list
+        vid10_frames_path.append(os.path.join(vid10_frames_dir, x))
+    for x in sorted(os.listdir(vid11_frames_dir)):  # plain loops on purpose: a class-scope comprehension cannot see class variables
+        vid11_frames_path.append(os.path.join(vid11_frames_dir, x))
 
     tgt_list = [{
         "depth_data": [673, 360, 480],
@@ -24,6 +34,23 @@ class VideoDepthEstimationMapperTest(DataJuicerTestCaseBase):
         "fps": 24.0
     }]
 
+    tgt_list_for_frames_test = [{  # expected values, zipped in order with the results of test_from_extracted_frames
+        "depth_data": [19, 756, 1008],  # compared against the shape of the returned depth_data array
+        "fps": 30.0
+    }, {
+        "depth_data": [11, 360, 640],
+        "fps": 30.0
+    }]
+
+    def setUp(self):  # give every test its own scratch directory (removed in tearDown)
+        self.tmp_dir = tempfile.mkdtemp()  # TemporaryDirectory().name only left a path: the unreferenced object deletes its directory when finalized
+        super().setUp()
+
+    def tearDown(self):  # delete the per-test scratch directory after the base-class teardown
+        super().tearDown()
+        if os.path.exists(self.tmp_dir):  # guard: skip cleanup when the path is absent
+            shutil.rmtree(self.tmp_dir)
+
     def test(self):
         ds_list = [{
             'videos': [self.vid4_path]
@@ -33,11 +60,11 @@ def test(self):
 
         op = VideoDepthEstimationMapper(
             video_depth_model_path="video_depth_anything_vits.pth",
-            point_cloud_dir_for_metric=DATA_JUICER_ASSETS_CACHE,
+            point_cloud_dir_for_metric=self.tmp_dir,
             max_res=1280,
             torch_dtype="fp16",
             if_save_visualization=True,
-            save_visualization_dir=DATA_JUICER_ASSETS_CACHE,
+            save_visualization_dir=self.tmp_dir,
             grayscale=False,
         )
 
@@ -62,11 +89,11 @@ def test_metric(self):
 
         op = VideoDepthEstimationMapper(
             video_depth_model_path="metric_video_depth_anything_vits.pth",
-            point_cloud_dir_for_metric=DATA_JUICER_ASSETS_CACHE,
+            point_cloud_dir_for_metric=self.tmp_dir,
             max_res=1280,
             torch_dtype="fp16",
             if_save_visualization=True,
-            save_visualization_dir=DATA_JUICER_ASSETS_CACHE,
+            save_visualization_dir=self.tmp_dir,
             grayscale=False,
         )
 
@@ -91,11 +118,11 @@ def test_mul_proc(self):
 
         op = VideoDepthEstimationMapper(
             video_depth_model_path="video_depth_anything_vits.pth",
-            point_cloud_dir_for_metric=DATA_JUICER_ASSETS_CACHE,
+            point_cloud_dir_for_metric=self.tmp_dir,
             max_res=1280,
             torch_dtype="fp16",
             if_save_visualization=True,
-            save_visualization_dir=DATA_JUICER_ASSETS_CACHE,
+            save_visualization_dir=self.tmp_dir,
             grayscale=False,
         )
 
@@ -120,11 +147,11 @@ def test_metric_mul_proc(self):
 
         op = VideoDepthEstimationMapper(
             video_depth_model_path="metric_video_depth_anything_vits.pth",
-            point_cloud_dir_for_metric=DATA_JUICER_ASSETS_CACHE,
+            point_cloud_dir_for_metric=self.tmp_dir,
             max_res=1280,
             torch_dtype="fp16",
             if_save_visualization=True,
-            save_visualization_dir=DATA_JUICER_ASSETS_CACHE,
+            save_visualization_dir=self.tmp_dir,
             grayscale=False,
         )
 
@@ -140,5 +167,37 @@ def test_metric_mul_proc(self):
             self.assertEqual(sample[Fields.meta][MetaKeys.video_depth_tags]["fps"], target["fps"])
 
 
+    def test_from_extracted_frames(self):
+        """Run the mapper on two pre-extracted frame lists and check depth shape and fps."""
+        samples = [
+            {MetaKeys.video_frames: self.vid10_frames_path, "fps": 30},
+            {MetaKeys.video_frames: self.vid11_frames_path, "fps": 30},
+        ]
+
+        mapper = VideoDepthEstimationMapper(
+            video_depth_model_path="metric_video_depth_anything_vits.pth",
+            point_cloud_dir_for_metric=self.tmp_dir,
+            if_save_point_cloud=True,
+            max_res=1280,
+            torch_dtype="fp16",
+            if_save_visualization=True,
+            save_visualization_dir=self.tmp_dir,
+            grayscale=False,
+        )
+
+        ds = Dataset.from_list(samples)
+        # The assertions below read the mapper's tags from Fields.meta, so the column must exist.
+        if Fields.meta not in ds.features:
+            ds = ds.add_column(name=Fields.meta, column=[{}] * ds.num_rows)
+        ds = ds.map(mapper.process, num_proc=1, with_rank=True)
+
+        for result, expected in zip(ds.to_list(), self.tgt_list_for_frames_test):
+            depth_tags = result[Fields.meta][MetaKeys.video_depth_tags]
+            # Only the array shape of the depth output is compared, not its values.
+            depth_shape = list(np.array(depth_tags["depth_data"]).shape)
+            self.assertEqual(depth_shape, expected["depth_data"])
+            self.assertEqual(depth_tags["fps"], expected["fps"])
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/ops/mapper/test_video_face_keypoints_mapper.py b/tests/ops/mapper/test_video_face_keypoints_mapper.py
new file mode 100644
index 00000000000..3f760da2950
--- /dev/null
+++ b/tests/ops/mapper/test_video_face_keypoints_mapper.py
@@ -0,0 +1,100 @@
+import os
+import unittest
+import numpy as np
+import tempfile
+import shutil
+
+from data_juicer.core.data import NestedDataset as Dataset
+from data_juicer.ops.mapper.video_face_keypoints_mapper import \
+    VideoFaceKeypointsMapper
+from data_juicer.utils.constant import Fields, MetaKeys
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+
+
+class VideoFaceKeypointsMapperTest(DataJuicerTestCaseBase):
+    """Tests VideoFaceKeypointsMapper on raw videos and on pre-extracted frame lists."""
+    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
+                             'data')
+    vid3_path = os.path.join(data_path, 'video3.mp4')
+    vid4_path = os.path.join(data_path, 'video4.mp4')
+    vid3_frames_dir = os.path.join(data_path, 'video3_frames')
+    vid4_frames_dir = os.path.join(data_path, 'video4_frames')
+    vid3_frames_path = []
+    vid4_frames_path = []
+    # os.listdir() order is arbitrary; sort (lexicographically) so the frame lists are deterministic.
+    for x in sorted(os.listdir(vid3_frames_dir)):
+        vid3_frames_path.append(os.path.join(vid3_frames_dir, x))
+    for x in sorted(os.listdir(vid4_frames_dir)):
+        vid4_frames_path.append(os.path.join(vid4_frames_dir, x))
+
+    tgt_list = [{
+        "keypoints_list_shape": [98, 2],
+        "face_bboxes_shape": [4]
+    }, {
+        "keypoints_list_shape": [98, 2],
+        "face_bboxes_shape": [4]
+    }]
+
+    def setUp(self):
+        # mkdtemp() returns a directory that stays until tearDown removes it.
+        self.tmp_dir = tempfile.mkdtemp()
+        super().setUp()
+
+    def tearDown(self):
+        super().tearDown()
+        if os.path.exists(self.tmp_dir):
+            shutil.rmtree(self.tmp_dir)
+
+    def test(self):
+        """Process two videos with num_proc=2 and check keypoint and bbox shapes."""
+        ds_list = [{
+            'videos': [self.vid3_path]
+        },  {
+            'videos': [self.vid4_path]
+        }]
+
+        op = VideoFaceKeypointsMapper(
+            ldeq_model_path="final.pth.tar",
+            if_save_visualization=True,
+            save_visualization_dir=os.path.join(self.tmp_dir, "facekeypoints_vis"),
+            frame_num=1,
+            duration=3,
+            frame_dir=os.path.join(self.tmp_dir, "facekeypoints_test")
+        )
+
+        dataset = Dataset.from_list(ds_list)
+        if Fields.meta not in dataset.features:
+            dataset = dataset.add_column(name=Fields.meta, column=[{}] * dataset.num_rows)
+        dataset = dataset.map(op.process, num_proc=2, with_rank=True)
+        res_list = dataset.to_list()
+
+        for sample, target in zip(res_list, self.tgt_list):
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_face_keypoints_tags]["face_keypoints"][0]).shape[1:]), target["keypoints_list_shape"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_face_keypoints_tags]["face_bboxes"][0]).shape[1:]), target["face_bboxes_shape"])
+
+    def test_from_extracted_frames(self):
+        """Process two pre-extracted frame lists with num_proc=1 and check the same shapes."""
+        ds_list = [{
+            MetaKeys.video_frames: self.vid3_frames_path,
+        },  {
+            MetaKeys.video_frames: self.vid4_frames_path,
+        }]
+
+        op = VideoFaceKeypointsMapper(
+            ldeq_model_path="final.pth.tar",
+            if_save_visualization=True,
+            save_visualization_dir=os.path.join(self.tmp_dir, "facekeypoints_vis")
+        )
+
+        dataset = Dataset.from_list(ds_list)
+        if Fields.meta not in dataset.features:
+            dataset = dataset.add_column(name=Fields.meta, column=[{}] * dataset.num_rows)
+        dataset = dataset.map(op.process, num_proc=1, with_rank=True)
+        res_list = dataset.to_list()
+
+        for sample, target in zip(res_list, self.tgt_list):
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_face_keypoints_tags]["face_keypoints"][0]).shape[1:]), target["keypoints_list_shape"])
+            self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_face_keypoints_tags]["face_bboxes"][0]).shape[1:]), target["face_bboxes_shape"])
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/tests/ops/mapper/test_video_hand_reconstruction_hawor_mapper.py b/tests/ops/mapper/test_video_hand_reconstruction_hawor_mapper.py
index 4a9cd07890c..2ee829c7add 100644
--- a/tests/ops/mapper/test_video_hand_reconstruction_hawor_mapper.py
+++ b/tests/ops/mapper/test_video_hand_reconstruction_hawor_mapper.py
@@ -1,13 +1,14 @@
 import os
 import unittest
 import numpy as np
+import tempfile
+import shutil
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.video_hand_reconstruction_hawor_mapper import VideoHandReconstructionHaworMapper
 from data_juicer.utils.mm_utils import SpecialTokens
 from data_juicer.utils.constant import Fields, MetaKeys
 from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
-from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
 
 
 @unittest.skip('Users need to download MANO_RIGHT.pkl.')
@@ -16,6 +17,14 @@ class VideoHandReconstructionHaworMapperTest(DataJuicerTestCaseBase):
                              'data')
     vid3_path = os.path.join(data_path, 'video3.mp4')
     vid4_path = os.path.join(data_path, 'video4.mp4')
+    vid3_frames_dir = os.path.join(data_path, 'video3_frames')  # directories of pre-extracted frames
+    vid4_frames_dir = os.path.join(data_path, 'video4_frames')
+    vid3_frames_path = []
+    vid4_frames_path = []
+    for x in sorted(os.listdir(vid3_frames_dir)):  # os.listdir() order is arbitrary; sort (lexicographically) for a deterministic frame list
+        vid3_frames_path.append(os.path.join(vid3_frames_dir, x))
+    for x in sorted(os.listdir(vid4_frames_dir)):  # plain loops on purpose: a class-scope comprehension cannot see class variables
+        vid4_frames_path.append(os.path.join(vid4_frames_dir, x))
 
     ds_list = [{
         'videos': [vid3_path]
@@ -23,6 +32,12 @@ class VideoHandReconstructionHaworMapperTest(DataJuicerTestCaseBase):
         'videos': [vid4_path]
     }]
 
+    ds_for_extracted_frames_list = [{  # input samples that carry frame-path lists instead of a 'videos' entry
+        MetaKeys.video_frames: vid3_frames_path,
+    },  {
+        MetaKeys.video_frames: vid4_frames_path,
+    }]
+
     tgt_list = [{
         "fov_x": 0.7572688730116571,
         "left_frame_id_list": [2, 7, 8, 9, 10, 28, 33, 34, 36, 38, 39, 43, 44, 45, 46, 47, 48],
@@ -49,6 +64,38 @@ class VideoHandReconstructionHaworMapperTest(DataJuicerTestCaseBase):
         "right_transl_list_shape": (4, 3),
     }]
 
+    tgt_for_extracted_frames_list = [{  # expected values, zipped in order with the results of test_for_extracted_frames
+        "fov_x": 0.7623036428395666,  # the test accepts an absolute difference below 0.01
+        "left_beta_list_shape": (6, 10),  # the test compares shape[1:] only, so the leading size is not checked
+        "left_hand_pose_list_shape": (6, 15, 3, 3),
+        "left_global_orient_list_shape": (6, 3, 3),
+        "left_transl_list_shape": (6, 3),
+        "right_beta_list_shape": (6, 10),
+        "right_hand_pose_list_shape": (6, 15, 3, 3),
+        "right_global_orient_list_shape": (6, 3, 3),
+        "right_transl_list_shape": (6, 3),
+    }, {
+        "fov_x": 0.6412188912675015,
+        "left_beta_list_shape": (4, 10),
+        "left_hand_pose_list_shape": (4, 15, 3, 3),
+        "left_global_orient_list_shape": (4, 3, 3),
+        "left_transl_list_shape": (4, 3),
+        "right_beta_list_shape": (2, 10),
+        "right_hand_pose_list_shape": (2, 15, 3, 3),
+        "right_global_orient_list_shape": (2, 3, 3),
+        "right_transl_list_shape": (2, 3),
+    }]
+
+
+    def setUp(self):  # give every test its own scratch directory (removed in tearDown)
+        self.tmp_dir = tempfile.mkdtemp()  # TemporaryDirectory().name only left a path: the unreferenced object deletes its directory when finalized
+        super().setUp()
+
+    def tearDown(self):  # delete the per-test scratch directory after the base-class teardown
+        super().tearDown()
+        if os.path.exists(self.tmp_dir):  # guard: skip cleanup when the path is absent
+            shutil.rmtree(self.tmp_dir)
+
     def test(self):
 
         op = VideoHandReconstructionHaworMapper(
@@ -60,8 +107,8 @@ def test(self):
             frame_num=1,
             duration=1,
             thresh=0.2,
-            frame_dir=DATA_JUICER_ASSETS_CACHE,
-            moge_output_info_dir=DATA_JUICER_ASSETS_CACHE,
+            frame_dir=self.tmp_dir,
+            moge_output_info_dir=self.tmp_dir,
         )
         dataset = Dataset.from_list(self.ds_list)
         if Fields.meta not in dataset.features:
@@ -93,8 +140,8 @@ def test_mul_proc(self):
             frame_num=1,
             duration=1,
             thresh=0.2,
-            frame_dir=DATA_JUICER_ASSETS_CACHE,
-            moge_output_info_dir=DATA_JUICER_ASSETS_CACHE,
+            frame_dir=self.tmp_dir,
+            moge_output_info_dir=self.tmp_dir,
         )
         dataset = Dataset.from_list(self.ds_list)
         if Fields.meta not in dataset.features:
@@ -115,5 +162,35 @@ def test_mul_proc(self):
             self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["right_transl_list"]).shape[1:], target["right_transl_list_shape"][1:])
 
 
+    def test_for_extracted_frames(self):
+        """Run the mapper on pre-extracted frame lists; check fov_x and trailing array dims."""
+        op = VideoHandReconstructionHaworMapper(
+            hawor_model_path="hawor.ckpt",
+            hawor_config_path="model_config.yaml",
+            hawor_detector_path="detector.pt",
+            moge_model_path="Ruicheng/moge-2-vitl",
+            mano_right_path="path_to_mano_right_pkl",
+            thresh=0.2,
+            moge_output_info_dir=self.tmp_dir,
+        )
+        dataset = Dataset.from_list(self.ds_for_extracted_frames_list)
+        # The assertions below read the mapper's tags from Fields.meta, so the column must exist.
+        if Fields.meta not in dataset.features:
+            dataset = dataset.add_column(name=Fields.meta,
+                                         column=[{}] * dataset.num_rows)
+        dataset = dataset.map(op.process, num_proc=1, with_rank=True)
+        res_list = dataset.to_list()
+
+        for sample, target in zip(res_list, self.tgt_for_extracted_frames_list):
+            tags = sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]
+            # assertLess shows the actual difference on failure, unlike comparing a boolean with True.
+            self.assertLess(abs(tags["fov_x"] - target["fov_x"]), 0.01)
+            # Only shape[1:] is compared, so the leading size of each array is not checked.
+            for side in ("left", "right"):
+                for part in ("beta", "hand_pose", "global_orient", "transl"):
+                    key = f"{side}_{part}_list"
+                    self.assertEqual(np.array(tags[key]).shape[1:], target[f"{key}_shape"][1:])
+
+
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file
diff --git a/tests/ops/mapper/test_video_whole_body_pose_estimation_mapper.py b/tests/ops/mapper/test_video_whole_body_pose_estimation_mapper.py
index d82e704f75c..7f6e8982464 100644
--- a/tests/ops/mapper/test_video_whole_body_pose_estimation_mapper.py
+++ b/tests/ops/mapper/test_video_whole_body_pose_estimation_mapper.py
@@ -1,13 +1,14 @@
 import os
 import unittest
 import numpy as np
+import tempfile
+import shutil
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.video_whole_body_pose_estimation_mapper import VideoWholeBodyPoseEstimationMapper
 from data_juicer.utils.mm_utils import SpecialTokens
 from data_juicer.utils.constant import Fields, MetaKeys
 from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
-from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
 
 
 class VideoWholeBodyPoseEstimationMapperTest(DataJuicerTestCaseBase):
@@ -15,6 +16,14 @@ class VideoWholeBodyPoseEstimationMapperTest(DataJuicerTestCaseBase):
                              'data')
     vid3_path = os.path.join(data_path, 'video3.mp4')
     vid4_path = os.path.join(data_path, 'video4.mp4')
+    vid3_frames_dir = os.path.join(data_path, 'video3_frames')  # directories of pre-extracted frames
+    vid4_frames_dir = os.path.join(data_path, 'video4_frames')
+    vid3_frames_path = []
+    vid4_frames_path = []
+    for x in sorted(os.listdir(vid3_frames_dir)):  # os.listdir() order is arbitrary; sort (lexicographically) for a deterministic frame list
+        vid3_frames_path.append(os.path.join(vid3_frames_dir, x))
+    for x in sorted(os.listdir(vid4_frames_dir)):  # plain loops on purpose: a class-scope comprehension cannot see class variables
+        vid4_frames_path.append(os.path.join(vid4_frames_dir, x))
 
     ds_list = [{
         'videos': [vid3_path]
@@ -22,22 +31,49 @@ class VideoWholeBodyPoseEstimationMapperTest(DataJuicerTestCaseBase):
         'videos': [vid4_path]
     }]
 
+    ds_from_frames_list = [{  # input samples that carry frame-path lists instead of a 'videos' entry
+        MetaKeys.video_frames: vid3_frames_path,
+    },  {
+        MetaKeys.video_frames: vid4_frames_path,
+    }]
+
     tgt_list = [{
         "body_keypoints_shape": [2, 18, 2],
         "foot_keypoints_shape": [2, 6, 2],
         "faces_keypoints_shape": [2, 68, 2],
         "hands_keypoints_shape": [4, 21, 2],
-        "bbox_results_list_length": 49,
         "bbox_shape": [2, 4]
     }, {
         "body_keypoints_shape": [2, 18, 2],
         "foot_keypoints_shape": [2, 6, 2],
         "faces_keypoints_shape": [2, 68, 2],
         "hands_keypoints_shape": [4, 21, 2],
-        "bbox_results_list_length": 22,
         "bbox_shape": [2, 4]
     }]
 
+    tgt_from_frames_list = [{  # expected shapes, zipped in order with the results of test_from_extracted_frames
+        "body_keypoints_shape": [2, 18, 2],  # each value is compared with the shape of the entry at index 1
+        "foot_keypoints_shape": [2, 6, 2],
+        "faces_keypoints_shape": [2, 68, 2],
+        "hands_keypoints_shape": [4, 21, 2],
+        "bbox_shape": [2, 4]
+    }, {
+        "body_keypoints_shape": [2, 18, 2],
+        "foot_keypoints_shape": [2, 6, 2],
+        "faces_keypoints_shape": [2, 68, 2],
+        "hands_keypoints_shape": [4, 21, 2],
+        "bbox_shape": [2, 4]
+    }]
+
+    def setUp(self):  # give every test its own scratch directory (removed in tearDown)
+        self.tmp_dir = tempfile.mkdtemp()  # TemporaryDirectory().name only left a path: the unreferenced object deletes its directory when finalized
+        super().setUp()
+
+    def tearDown(self):  # delete the per-test scratch directory after the base-class teardown
+        super().tearDown()
+        if os.path.exists(self.tmp_dir):  # guard: skip cleanup when the path is absent
+            shutil.rmtree(self.tmp_dir)
+
     def test(self):
 
         op = VideoWholeBodyPoseEstimationMapper(
@@ -46,9 +82,9 @@ def test(self):
             frame_num=1,
             duration=1,
             tag_field_name=MetaKeys.pose_estimation_tags,
-            frame_dir=DATA_JUICER_ASSETS_CACHE,
+            frame_dir=os.path.join(self.tmp_dir, "dwpose_test1"),
             if_save_visualization=True,
-            save_visualization_dir=DATA_JUICER_ASSETS_CACHE
+            save_visualization_dir=os.path.join(self.tmp_dir, "dwpose_vis1")
         )
         dataset = Dataset.from_list(self.ds_list)
         if Fields.meta not in dataset.features:
@@ -62,7 +98,6 @@ def test(self):
             self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.pose_estimation_tags]["foot_keypoints"][2]).shape), target["foot_keypoints_shape"])
             self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.pose_estimation_tags]["faces_keypoints"][2]).shape), target["faces_keypoints_shape"])
             self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.pose_estimation_tags]["hands_keypoints"][2]).shape), target["hands_keypoints_shape"])
-            self.assertEqual(len(sample[Fields.meta][MetaKeys.pose_estimation_tags]["bbox_results_list"]), target["bbox_results_list_length"])
             self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.pose_estimation_tags]["bbox_results_list"][2]).shape), target["bbox_shape"])
 
 
@@ -74,9 +109,9 @@ def test_mul_proc(self):
             frame_num=1,
             duration=1,
             tag_field_name=MetaKeys.pose_estimation_tags,
-            frame_dir=DATA_JUICER_ASSETS_CACHE,
+            frame_dir=os.path.join(self.tmp_dir, "dwpose_test2"),
             if_save_visualization=True,
-            save_visualization_dir=DATA_JUICER_ASSETS_CACHE
+            save_visualization_dir=os.path.join(self.tmp_dir, "dwpose_vis2")
         )
         dataset = Dataset.from_list(self.ds_list)
         if Fields.meta not in dataset.features:
@@ -90,9 +125,32 @@ def test_mul_proc(self):
             self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.pose_estimation_tags]["foot_keypoints"][2]).shape), target["foot_keypoints_shape"])
             self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.pose_estimation_tags]["faces_keypoints"][2]).shape), target["faces_keypoints_shape"])
             self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.pose_estimation_tags]["hands_keypoints"][2]).shape), target["hands_keypoints_shape"])
-            self.assertEqual(len(sample[Fields.meta][MetaKeys.pose_estimation_tags]["bbox_results_list"]), target["bbox_results_list_length"])
             self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.pose_estimation_tags]["bbox_results_list"][2]).shape), target["bbox_shape"])
 
 
+    def test_from_extracted_frames(self):
+        """Run the mapper on pre-extracted frame lists and check result shapes at index 1."""
+        mapper = VideoWholeBodyPoseEstimationMapper(
+            onnx_det_model="yolox_l.onnx",
+            onnx_pose_model="dw-ll_ucoco_384.onnx",
+            tag_field_name=MetaKeys.pose_estimation_tags,
+            if_save_visualization=True,
+            save_visualization_dir=os.path.join(self.tmp_dir, "dwpose_vis3"),
+        )
+        ds = Dataset.from_list(self.ds_from_frames_list)
+        # The assertions below read the mapper's tags from Fields.meta, so the column must exist.
+        if Fields.meta not in ds.features:
+            ds = ds.add_column(name=Fields.meta, column=[{}] * ds.num_rows)
+        ds = ds.map(mapper.process, num_proc=1, with_rank=True)
+
+        for result, expected in zip(ds.to_list(), self.tgt_from_frames_list):
+            tags = result[Fields.meta][MetaKeys.pose_estimation_tags]
+            # Each keypoint list is checked on its entry at index 1; the target key is "<name>_shape".
+            for key in ("body_keypoints", "foot_keypoints", "faces_keypoints", "hands_keypoints"):
+                self.assertEqual(list(np.array(tags[key][1]).shape), expected[key + "_shape"])
+            # The bbox list uses a differently named target key, so it is checked separately.
+            self.assertEqual(list(np.array(tags["bbox_results_list"][1]).shape), expected["bbox_shape"])
+
+
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file