datajuicer · SYSUzhouting · Mar 13, 2026 · gemini-code-assist · Mar 13, 2026 · gemini-code-assist
diff --git a/README_this_pr.md b/README_this_pr.md
@@ -0,0 +1,30 @@
+# Data-Juicer-HumanVbench-ops
+
+This is the operator contribution page for the paper: **HumanVBench: Probing Human-Centric Video Understanding in MLLMs with Automatically Synthesized Benchmarks (CVPR'26)**.
+
+## Related Operator Documentation Locations
+
+* **Example Recipe:** `demos/video_humanvbench_simple/analyzer.yaml`
+* **Operator Definition:** `data_juicer/config/config_all.yaml`
+
+## Quick Start
+
+As HumanVBench operators involve modifications to external repositories, these adjusted repositories are currently stored in:
+`thirdparty/humanvbench_models`
+
+To use these operators, choose one of the following modes:
+
+1. **Manual Mode:** Follow the instructions in `thirdparty/humanvbench_models/README.md` to manually complete the `git clone` and apply the `.diff` patches, then run:
+
+   ```shell
+   dj-process --config demos/video_humanvbench_simple/analyzer.yaml
+   ```
+
+2. **Automatic Mode (Recommended):** Run the recipe directly:
+
+   ```shell
+   dj-process --config demos/video_humanvbench_simple/analyzer.yaml
+   ```
+
+   The relevant operators already implement the automatic `git clone` and `.diff` patch merging, so manual intervention is not required.
diff --git a/README_this_pr_CH.md b/README_this_pr_CH.md
@@ -0,0 +1,29 @@
+# Data-Juicer-HumanVbench-ops
+
+这是论文：**HumanVBench: Probing Human-Centric Video Understanding in MLLMs with Automatically Synthesized Benchmarks (CVPR'26)** 的算子贡献页。
+
+## 相关算子介绍文件位置
+
+* **范例 Recipe：** `demos/video_humanvbench_simple/analyzer.yaml`
+* **算子定义：** `data_juicer/config/config_all.yaml`
+
+## 快速开始
+
+由于 HumanVBench 算子涉及外部仓库的修改，这些经过调整的仓库目前存储在：
+`thirdparty/humanvbench_models`
+
+为了使用这些算子，你可以选择：
+
+1. **手动模式：** 按照 `thirdparty/humanvbench_models/README.md` 下的指引手动完成 `git clone` 和 `.diff` 补丁合并，然后运行：
+```shell
+dj-process --config demos/video_humanvbench_simple/analyzer.yaml
+
+```
+
+
+2. **自动模式（推荐）：** 直接开始运行：
+```shell
+dj-process --config demos/video_humanvbench_simple/analyzer.yaml
+
+```
+相关算子中已经涵盖了自动 `git clone` 和 `merge diff` 的逻辑，因此手动干预不是必须的。
diff --git a/data_juicer/config/config_all.yaml b/data_juicer/config/config_all.yaml
@@ -199,19 +199,6 @@ process:
       model_params: {}                                        # Parameters for initializing the API model.
       sampling_params: {}                                     # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
   - expand_macro_mapper:                                    # expand macro definitions in Latex text.
-  - latex_figure_context_extractor_mapper:                  # Extract figures and their citing context from LaTeX source.
-      citation_commands: ['\ref', '\cref', '\Cref', '\autoref']  # LaTeX reference commands to search for citing paragraphs.
-      paragraph_separator: '\n\n'                               # Pattern for splitting LaTeX text into paragraphs.
-      caption_key: 'caption'                                  # Output field name for the figure caption.
-      label_key: 'label'                                      # Output field name for the LaTeX label.
-      context_key: 'citing_paragraphs'                        # Output field name for citing paragraphs.
-      parent_caption_key: 'parent_caption'                    # Output field name for the parent figure's caption (subfigures only).
-      parent_label_key: 'parent_label'                        # Output field name for the parent figure's label (for grouping subfigures).
-  - latex_merge_tex_mapper:                                 # Extract and concatenate all .tex files from a compressed LaTeX project archive.
-      compressed_file_key: 'compressed_file'                   # Field storing the archive path.
-      separator: '\n\n'                                       # Separator between concatenated .tex files.
-      max_file_size: 52428800                                 # 50 MB; skip .tex entries larger than this (zip bomb protection).
-      max_total_size: 104857600                                 # 100 MB; cumulative limit for all extracted .tex content.
   - extract_entity_attribute_mapper:                        # Extract attributes for given entities from the text.
       api_model: 'gpt-4o'                                     # API model name.
       query_entities: ["孙悟空", "猪八戒"]                      # Entity list to be queried.
@@ -698,7 +685,61 @@ process:
       save_visualization_dir: None                            # The path for saving visualization results.
   - whitespace_normalization_mapper:                        # normalize different kinds of whitespaces to English whitespace.
 
+
+# When using the HumanVBench mappers, keep_stats_in_res_ds should be set to true.
+
+  - video_human_tracks_extraction_mapper:                   # Get the body and face trajectory bounding box of people in one shot of the video. To ensure correctness, it should be applied after video_split_by_scene_mapper
+      face_track_bbox_path: /your_path/bounding_box_track       # The storage location of the bounding box tracks of the characters in the video
+      mem_required: '10GB'
+
+  # video_human_tracks_face_demographic_mapper should be operated after video_human_tracks_extraction_mapper.
+  - video_human_tracks_face_demographic_mapper:             # Get the facial demographics of each person based on the results of video_human_tracks_extraction_mapper
+      original_data_save_path: your_path/bounding_box_track    # The location where the specific results of each frame's detection are stored
+      detect_interval: 5
+
+  # video_audio_detect_age_gender_mapper should be operated after video_tagging_from_audio_mapper.
+  - video_audio_detect_age_gender_mapper:                   # If the audio is speech, classify the gender and age of the speech
+      hf_audio_mapper: 'audeering/wav2vec2-large-robust-24-ft-age-gender'   # Huggingface model name for speech age and gender classification
+      mem_required: '7GB' 
+
+  # video_captioning_from_human_tracks_mapper should be operated after video_human_tracks_extraction_mapper.
+  - video_captioning_from_human_tracks_mapper:              # Based on the results of video_human_tracks_extraction_mapper, focus on the single person in the video for captioning
+      video_describe_model_path: DAMO-NLP-SG/VideoLLaMA3-7B     # model path to VideoLLaMA3-7B
+      trust_remote_code: true
+      temp_video_path: ./temp_video_path                        # Used to store temporary videos that will be removed finally.
+      mem_required: '25GB'     
+
+  # video_captioning_face_attribute_emotion_mapper should be operated after video_human_tracks_extraction_mapper.
+  - video_captioning_face_attribute_emotion_mapper:         # Based on the results of video_human_tracks_extraction_mapper, describe the appearance and facial emotions of a single person in the video
+      face_track_query: Please only describe the appearance and facial emotions of the person in the video in detail. Don't mention the background. Less than 80 words.
+      trust_remote_code: true
+      cropping_face_video_temp_path: ./temp_video_path          # Used to store temporary videos
+      video_describe_model_path: DAMO-NLP-SG/VideoLLaMA3-7B     # Huggingface model DAMO-NLP-SG/VideoLLaMA3-7B
+      mem_required: '25GB' 
+
+  # video_active_speaker_detect_mapper must be operated after video_tagging_from_audio_mapper and video_human_tracks_extraction_mapper.
+  - video_active_speaker_detect_mapper:                     # Based on the results of video_human_tracks_extraction_mapper, determine whether each person is an active speaker
+      temp_save_path: ./temp_path                               # Used to store temporary videos
+      active_threshold: 15                                      # Higher values are stricter, reducing false positives from noise but potentially increasing missed detections
+      mem_required: '10GB' 
+
+  - video_audio_ASR_mapper:                                 # Automatic speech recognition from video speech
+      model_dir_ASR: 'FunAudioLLM/SenseVoiceSmall'              # Huggingface model FunAudioLLM/SenseVoiceSmall
+      mem_required: '20GB' 
+
+  - video_audio_speech_emotion_mapper:                      # Speech emotion recognition from video speech
+      model_dir_emo: 'FunAudioLLM/SenseVoiceSmall'              # Huggingface model FunAudioLLM/SenseVoiceSmall
+      mem_required: '20GB' 
+
+
+
+
   # Filter ops
+  - video_face_ratio_filter:                                # Filter to retain human-centric videos
+      threshold: 0.65                                           # The lower limit of the ratio of frames with faces to the total number of video frames
+      detect_interval: 4
+      any_or_all: any 
+
   - alphanumeric_filter:                                    # filter text with alphabet/numeric ratio out of specific range.
       tokenization: false                                     # whether to count the ratio of alphanumeric to the total number of tokens.
       min_ratio: 0.0                                          # the min ratio of filter range

diff --git a/data_juicer/core/data/load_strategy.py b/data_juicer/core/data/load_strategy.py
@@ -631,7 +631,7 @@ def load_data(self, **kwargs):
 
             # Use ray.data functions directly with PyArrow filesystem support
             # Ray's read functions support filesystem parameter via PyArrow
-            if data_format in {"json", "jsonl", "json.gz", "jsonl.gz", "json.zst", "jsonl.zst"}:
+            if data_format in {"json", "jsonl"}:
                 # For JSON, we need to use read_json_stream with filesystem
                 from data_juicer.core.data.ray_dataset import read_json_stream
 

diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py
@@ -355,7 +355,7 @@ def count(self) -> int:
 
     @classmethod
     def read(cls, data_format: str, paths: Union[str, List[str]]) -> RayDataset:
-        if data_format in {"json", "jsonl", "json.gz", "jsonl.gz", "json.zst", "jsonl.zst"}:
+        if data_format in {"json", "jsonl"}:
             return RayDataset.read_json(paths)
         elif data_format == "webdataset":
             return RayDataset.read_webdataset(paths)
@@ -453,7 +453,7 @@ def read_json_stream(
     include_paths: bool = False,
     ignore_missing_paths: bool = False,
     shuffle: Union[Literal["files"], None] = None,
-    file_extensions: Optional[List[str]] = ["json", "jsonl", "json.gz", "jsonl.gz", "json.zst", "jsonl.zst"],
+    file_extensions: Optional[List[str]] = ["json", "jsonl"],
     concurrency: Optional[int] = None,
     override_num_blocks: Optional[int] = None,
     **arrow_json_args,

diff --git a/data_juicer/format/json_formatter.py b/data_juicer/format/json_formatter.py
@@ -6,10 +6,10 @@ class JsonFormatter(LocalFormatter):
     """
     The class is used to load and format json-type files.
 
-    Default suffixes is `['.json', '.jsonl', '.json.gz', '.jsonl.gz', '.json.zst', '.jsonl.zst']`
+    Default suffixes is `['.json', '.jsonl', '.jsonl.zst']`
     """
 
-    SUFFIXES = [".json", ".jsonl", ".json.gz", ".jsonl.gz", ".json.zst", ".jsonl.zst"]
+    SUFFIXES = [".json", ".jsonl", ".jsonl.zst"]
 
     def __init__(self, dataset_path, suffixes=None, **kwargs):
         """

diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py
@@ -56,6 +56,7 @@
 from .video_watermark_filter import VideoWatermarkFilter
 from .word_repetition_filter import WordRepetitionFilter
 from .words_num_filter import WordsNumFilter
+from .video_face_ratio_filter import VideoFaceRatioFilter
 
 __all__ = [
     "AlphanumericFilter",
@@ -114,6 +115,7 @@
     "WordRepetitionFilter",
     "WordsNumFilter",
     "GeneralFieldFilter",
+    "VideoFaceRatioFilter"
 ]
 
 NON_STATS_FILTERS = [

diff --git a/data_juicer/ops/filter/image_face_count_filter.py b/data_juicer/ops/filter/image_face_count_filter.py
@@ -67,8 +67,10 @@ def __init__(
         self.min_face_count = min_face_count
         self.max_face_count = max_face_count
 
-        self.extra_kwargs = self._default_kwargs.copy()
-        self.extra_kwargs.update((k, v) for k, v in kwargs.items() if k in self.extra_kwargs)
+        self.extra_kwargs = self._default_kwargs
+        for key in kwargs:
+            if key in self.extra_kwargs:
+                self.extra_kwargs[key] = kwargs[key]
-        self.extra_kwargs = self._default_kwargs
-        for key in kwargs:
-            if key in self.extra_kwargs:
-                self.extra_kwargs[key] = kwargs[key]
+        self.extra_kwargs = self._default_kwargs.copy()
+        for key in kwargs:
+            if key in self.extra_kwargs:
+                self.extra_kwargs[key] = kwargs[key]
-        self.extra_kwargs = self._default_kwargs
-        for key in kwargs:
-            if key in self.extra_kwargs:
-                self.extra_kwargs[key] = kwargs[key]
+        self.extra_kwargs = self._default_kwargs.copy()
+        for key in kwargs:
+            if key in self.extra_kwargs:
+                self.extra_kwargs[key] = kwargs[key]
 
         if any_or_all not in ["any", "all"]:
             raise ValueError(f"Keep strategy [{any_or_all}] is not supported. " f'Can only be one of ["any", "all"].')
@@ -96,10 +98,13 @@ def compute_stats_single(self, sample, context=False):
 
         # count the number of detected faces in each image
         face_counts = {}
-        for key, image in images.items():
-            dets = detect_faces(image, model, **self.extra_kwargs)
-            face_counts[key] = len(dets)
-        logger.debug(f"face counts: {face_counts}")
+        try:
+            for key, image in images.items():
+                dets = detect_faces(image, model, **self.extra_kwargs)
+                face_counts[key] = len(dets)
+            logger.debug(f"face counts: {face_counts}")
+        except Exception as e:
+            logger.exception(e)
 
         sample[Fields.stats][StatsKeys.face_counts] = [face_counts[key] for key in loaded_image_keys]
         return sample

diff --git a/data_juicer/ops/filter/image_face_ratio_filter.py b/data_juicer/ops/filter/image_face_ratio_filter.py
@@ -67,8 +67,10 @@ def __init__(
         self.min_ratio = min_ratio
         self.max_ratio = max_ratio
 
-        self.extra_kwargs = self._default_kwargs.copy()
-        self.extra_kwargs.update((k, v) for k, v in kwargs.items() if k in self.extra_kwargs)
+        self.extra_kwargs = self._default_kwargs
+        for key in kwargs:
+            if key in self.extra_kwargs:
+                self.extra_kwargs[key] = kwargs[key]
 
         if any_or_all not in ["any", "all"]:
             raise ValueError(f"Keep strategy [{any_or_all}] is not supported. " f'Can only be one of ["any", "all"].')

diff --git a/data_juicer/ops/filter/video_face_ratio_filter.py b/data_juicer/ops/filter/video_face_ratio_filter.py
@@ -0,0 +1,143 @@
+import av
+import numpy as np
+from data_juicer.utils.constant import Fields, StatsKeys
+from data_juicer.utils.mm_utils import (load_data_with_context, load_video,
+                                        pil_to_opencv, pil_to_opencv, process_each_frame)
+from ..base_op import OPERATORS, Filter
+from ..op_fusion import LOADED_VIDEOS
+from ..op_fusion import INTER_SAMPLED_FRAMES
+
+import psutil
+import gc,os
+
+
+import cv2,dlib
+from PIL import ImageFilter
+
+OP_NAME = 'video_face_ratio_filter'
+@OPERATORS.register_module(OP_NAME)
+@LOADED_VIDEOS.register_module(OP_NAME)
+
+class VideoFaceRatioFilter(Filter):
+    """
+    Keep data samples whose videos' durations are within a specified range.
-    Keep data samples whose videos' durations are within a specified range.
+    Keep data samples whose videos' face-to-frame ratios are within a specified range.
-    Keep data samples whose videos' durations are within a specified range.
+    Keep data samples whose videos' face-to-frame ratios are within a specified range.
+
+    Source: This operator is a part of HumanVBench (CVPR 2026).
+    """
+
+    def __init__(self,
+                 threshold: float = 0.8,
+                 detect_interval: int = 1,
+                 any_or_all: str = 'all',
+                 *args,
+                 **kwargs):
+        """
+        Initialization method.
+
+        :param any_or_all: keep this sample with 'any' or 'all' strategy of
+            all videos. 'any': keep this sample if any videos meet the
+            condition. 'all': keep this sample only if all videos meet the
+            condition.
+        :param args: extra args
+        :param kwargs: extra args
+        """
+        super().__init__(*args, **kwargs)
+        self.threshold = threshold
+
+        if any_or_all not in ['any', 'all']:
+            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
+                             f'Can only be one of ["any", "all"].')
+        self.any = (any_or_all == 'any')
+
+        # Initialize face detector
+        self.detector = dlib.get_frontal_face_detector()
+
+
+        self.detect_interval = detect_interval
+
+
+    def compute_stats_single(self, sample, rank=None, context=False):
+        # check if it's computed already
+        if StatsKeys.video_face_exist in sample[Fields.stats]:
+            return sample
+
+        # load videos
+        loaded_video_keys = sample[self.video_key]
+        video_faces_ratio = {}
+
+        # face_detect_S3FD = get_model(self.detector_key, rank=rank)
+
+        process = psutil.Process(os.getpid())
+        # memory_before = process.memory_info().rss / 1024 ** 2  # MB
+
+
+        for video_key in loaded_video_keys:
+            try:
+                with av.open(video_key) as container:
+                    # getting video stream
+                    video_stream = next(s for s in container.streams if s.type == 'video')
+                    # iterate over the video frame and detect faces
+                    frame_counter = 0  
+                    total_frames = 0
+                    frames_with_face = 0
+                    detect_num = 0
+                    for packet in container.demux(video_stream):
+                        try:
+                            for frame in packet.decode():
+                                total_frames += 1
+                                frame_counter += 1  
+
+                                if frame_counter % self.detect_interval == 0:
+                                    detect_num = detect_num + 1
+                                    img = frame.to_image()
+                                    image = pil_to_opencv(img)
+                                    # imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+                                    # faces = face_detect_S3FD.detect_faces(imageNumpy, conf_th=0.9, scales=[0.25])
+                                    faces = self.detector(image)
+                                    if len(faces) > 0:
+                                        frames_with_face += 1
+                        except Exception as e:
+                            print(f"Frame decoding error in video {video_key}: {e}")
+                            frames_with_face = 0
+                            detect_num = 0
+
+                    # calculate the proportion of the number of face frames
+                    if detect_num > 0:
+                        face_ratio = frames_with_face / detect_num
+                    else:
+                        face_ratio = 0.0
+                    video_faces_ratio[video_key] = face_ratio
+            except av.AVError as e:
+                print(f"Error opening video {video_key}: {e}")
+                video_faces_ratio[video_key] = 0.0
+            finally:
+                container.close()
+
+            video_faces_ratio[video_key] = face_ratio
+
+        # get video faces ratio
+        sample[Fields.stats][StatsKeys.video_face_exist] = [
+            video_faces_ratio[video_key] for video_key in sample[self.video_key]
+        ]
+
+        memory_after = process.memory_info().rss / 1024 ** 2  # MB
+        print(f"Memory Usage: {memory_after:.2f} MB")
+
+        gc.collect()
+
+        return sample
+
+    def process_single(self, sample):
+        video_faces_ratio = sample[Fields.stats][StatsKeys.video_face_exist]
+        keep_bools = np.array([
+            duration >= self.threshold
+            for duration in video_faces_ratio
+        ])
+        if len(keep_bools) <= 0:
+            return True
+
+        # different strategies
+        if self.any:
+            return keep_bools.any()
+        else:
+            return keep_bools.all()
diff --git a/data_juicer/ops/filter/video_motion_score_filter.py b/data_juicer/ops/filter/video_motion_score_filter.py
@@ -115,8 +115,10 @@ def __init__(
         self.divisible = divisible
         self.relative = relative
 
-        self.extra_kwargs = self._default_kwargs.copy()
-        self.extra_kwargs.update((k, v) for k, v in kwargs.items() if k in self.extra_kwargs)
+        self.extra_kwargs = self._default_kwargs
+        for key in kwargs:
+            if key in self.extra_kwargs:
+                self.extra_kwargs[key] = kwargs[key]
 
         if any_or_all not in ["any", "all"]:
             raise ValueError(f"Keep strategy [{any_or_all}] is not supported. " f'Can only be one of ["any", "all"].')