diff --git a/data_juicer/core/data/ray_dataset.py b/data_juicer/core/data/ray_dataset.py
index 2d8b198565..acca1a8b8f 100644
--- a/data_juicer/core/data/ray_dataset.py
+++ b/data_juicer/core/data/ray_dataset.py
@@ -237,7 +237,8 @@ def process_batch_arrow(table: pyarrow.Table):
 
                 try:
                     if op.use_ray_actor():
-                        compute = get_compute_strategy(op.__class__, concurrency=op.num_proc)
+                        # Use concurrency= directly for better GPU utilization
+                        # (get_compute_strategy may limit parallelism)
                         self.data = self.data.map_batches(
                             op.__class__,
                             fn_args=None,
@@ -247,7 +248,7 @@ def process_batch_arrow(table: pyarrow.Table):
                             batch_size=batch_size,
                             num_cpus=op.num_cpus,
                             num_gpus=op.num_gpus,
-                            compute=compute,
+                            concurrency=op.num_proc,
                             batch_format="pyarrow",
                             runtime_env=op.runtime_env,
                         )
@@ -280,7 +281,7 @@ def process_batch_arrow(table: pyarrow.Table):
                     )
                     cached_columns.add(Fields.stats)
                 if op.use_ray_actor():
-                    compute = get_compute_strategy(op.__class__, concurrency=op.num_proc)
+                    # Use concurrency= directly for better GPU utilization
                     self.data = self.data.map_batches(
                         op.__class__,
                         fn_args=None,
@@ -290,7 +291,7 @@ def process_batch_arrow(table: pyarrow.Table):
                         batch_size=batch_size,
                         num_cpus=op.num_cpus,
                         num_gpus=op.num_gpus,
-                        compute=compute,
+                        concurrency=op.num_proc,
                         batch_format="pyarrow",
                         runtime_env=op.runtime_env,
                     )
diff --git a/data_juicer/core/executor/concurrency_scoping.py b/data_juicer/core/executor/concurrency_scoping.py
new file mode 100644
index 0000000000..906f8cbe37
--- /dev/null
+++ b/data_juicer/core/executor/concurrency_scoping.py
@@ -0,0 +1,20 @@
+"""Utility for scoping op concurrency when running partitions concurrently."""
+
+
+def scope_op_concurrency(op, max_concurrent_partitions: int) -> int:
+    """Returns the concurrency a single partition should use for this op.
+
+    When multiple partitions run concurrently, each partition should use a
+    fraction of the total GPU/actor resources to avoid over-subscription.
+
+    Args:
+        op: An operator instance with ``use_ray_actor()`` and ``num_proc``.
+        max_concurrent_partitions: How many partitions will run in parallel.
+
+    Returns:
+        The concurrency value the partition should pass through to
+        ``map_batches``.
+    """
+    if not op.use_ray_actor() or not op.num_proc or op.num_proc <= 0:
+        return op.num_proc  # CPU ops or auto-mode unchanged
+    return max(1, op.num_proc // max_concurrent_partitions)
diff --git a/data_juicer/core/executor/ray_executor_partitioned.py b/data_juicer/core/executor/ray_executor_partitioned.py
index ddcb1fc442..ff63f01e00 100644
--- a/data_juicer/core/executor/ray_executor_partitioned.py
+++ b/data_juicer/core/executor/ray_executor_partitioned.py
@@ -273,16 +273,61 @@ def _configure_partitioning(self):
                 logger.warning("Legacy num_partitions detected, overriding partition configuration")
 
         self.partition_mode = mode
-        self.num_partitions = num_of_partitions
         self.partition_size = partition_size
         self.max_size_mb = max_size_mb
 
+        # Resolve max_concurrent_partitions.
+        # "auto" (default) → detect from Ray cluster GPU count, fall back to 1.
+        # Explicit int → use as-is.
+        raw_max_conc = ConfigAccessor.get(partition_cfg, "max_concurrent_partitions", "auto")
+        self.max_concurrent_partitions = self._resolve_max_concurrent(raw_max_conc)
+
+        # Ensure we have at least as many partitions as concurrent slots,
+        # otherwise some GPUs would sit idle.
+        if self.max_concurrent_partitions > num_of_partitions:
+            logger.info(
+                f"num_of_partitions ({num_of_partitions}) < "
+                f"max_concurrent_partitions ({self.max_concurrent_partitions}), "
+                f"raising num_of_partitions to {self.max_concurrent_partitions}"
+            )
+            num_of_partitions = self.max_concurrent_partitions
+
+        self.num_partitions = num_of_partitions
+
         if mode == "manual":
             logger.info(f"Manual partition mode: using {self.num_partitions} partitions")
         else:  # auto mode
             logger.info(f"Auto partition mode: will determine optimal partitioning based on data characteristics")
             logger.info(f"Fallback partition size: {self.partition_size} samples, max {self.max_size_mb} MB")
 
+        if self.max_concurrent_partitions > 1:
+            logger.info(
+                f"Concurrent partition processing enabled: "
+                f"max_concurrent_partitions={self.max_concurrent_partitions}"
+            )
+
+    @staticmethod
+    def _resolve_max_concurrent(raw_value) -> int:
+        """Resolve max_concurrent_partitions from config value.
+
+        * ``"auto"`` → number of GPUs visible to Ray (falls back to 1).
+        * An explicit int is returned as-is (minimum 1).
+        """
+        if isinstance(raw_value, str) and raw_value.lower() == "auto":
+            try:
+                num_gpus = int(ray.cluster_resources().get("GPU", 0))
+            except Exception as e:
+                logger.warning(f"Could not get GPU resources from Ray cluster, defaulting to 0. Error: {e}")
+                num_gpus = 0
+            if num_gpus > 1:
+                logger.info(
+                    f"Auto-detected {num_gpus} GPUs in Ray cluster, " f"setting max_concurrent_partitions={num_gpus}"
+                )
+                return num_gpus
+            # No GPUs or single GPU → sequential
+            return 1
+        return max(1, int(raw_value))
+
     def _configure_auto_partitioning(self, dataset, ops):
         """Configure partitioning using the partition size optimizer for auto mode."""
         try:
@@ -498,6 +543,10 @@ def _process_with_simple_partitioning(self, dataset: RayDataset, ops: List):
             f"{partitioning_info.total_rows} total rows"
         )
 
+        # Branch: concurrent vs sequential partition processing
+        if self.max_concurrent_partitions > 1:
+            return self._process_partitions_concurrent(partitions, ops, partitioning_info)
+
         # Process each partition separately with checkpointing
         logger.info("Processing partitions with checkpointing support...")
         processed_partitions = []
@@ -541,6 +590,197 @@ def _process_with_simple_partitioning(self, dataset: RayDataset, ops: List):
         # Return as RayDataset wrapper
         return RayDataset(merged_dataset, cfg=self.cfg)
 
+    def _process_partitions_concurrent(self, partitions, ops, partitioning_info):
+        """Process partitions concurrently as Ray remote tasks.
+
+        Each partition is submitted as a Ray remote task that independently
+        loads ops from config, scopes concurrency, and processes data with
+        its own checkpoint manager.  Results are collected and unioned.
+        """
+        max_conc = min(self.max_concurrent_partitions, len(partitions))
+        logger.info(f"Processing {len(partitions)} partitions concurrently " f"(max_concurrent_partitions={max_conc})")
+
+        # Serialisable values extracted from self (avoid serialising the executor)
+        cfg = self.cfg
+        ckpt_enabled = self.ckpt_manager.checkpoint_enabled
+        ckpt_strategy = self.ckpt_manager.checkpoint_strategy
+        ckpt_dir = self.ckpt_manager.ckpt_dir
+        ckpt_n_ops = getattr(self.ckpt_manager, "checkpoint_n_ops", 1)
+        ckpt_op_names = getattr(self.ckpt_manager, "checkpoint_op_names", [])
+        op_fusion_enabled = getattr(cfg, "op_fusion", False)
+
+        @ray.remote(num_cpus=0)
+        def _process_single_partition_task(
+            partition_data,
+            partition_id,
+            cfg,
+            max_concurrent_partitions,
+            ckpt_enabled,
+            ckpt_strategy,
+            ckpt_dir,
+            ckpt_n_ops,
+            ckpt_op_names,
+            op_fusion_enabled,
+        ):
+            """Ray remote task that processes one partition end-to-end."""
+            from loguru import logger as task_logger
+
+            from data_juicer.core.data.ray_dataset import RayDataset
+            from data_juicer.core.executor.concurrency_scoping import (
+                scope_op_concurrency,
+            )
+            from data_juicer.ops import load_ops
+            from data_juicer.ops.op_fusion import fuse_operators
+            from data_juicer.utils.ckpt_utils import RayCheckpointManager
+
+            task_logger.info(f"[Partition {partition_id}] Starting remote processing")
+
+            # Re-create ops from config to avoid serialisation issues
+            task_ops = load_ops(cfg.process)
+            if op_fusion_enabled:
+                task_ops = fuse_operators(task_ops)
+
+            # Scope concurrency and fix actor mode for each op.
+            # The remote task has no GPU, so use_cuda() returns False and
+            # ops default to task mode (model reloads per batch). Force
+            # actor mode for GPU ops so the model loads once per actor.
+            for op in task_ops:
+                if getattr(op, "num_gpus", 0) and op.num_gpus > 0:
+                    op.ray_execution_mode = "actor"
+                op.num_proc = scope_op_concurrency(op, max_concurrent_partitions)
+
+            # Create local checkpoint manager
+            ckpt_manager = RayCheckpointManager(
+                ckpt_dir=ckpt_dir,
+                checkpoint_enabled=ckpt_enabled,
+                checkpoint_strategy=ckpt_strategy,
+                checkpoint_n_ops=ckpt_n_ops,
+                checkpoint_op_names=ckpt_op_names,
+            )
+
+            # Check for existing checkpoint
+            latest_checkpoint = ckpt_manager.find_latest_checkpoint(partition_id)
+
+            # If all ops are already checkpointed, load from checkpoint
+            if latest_checkpoint and latest_checkpoint[0] >= len(task_ops) - 1:
+                task_logger.info(f"[Partition {partition_id}] All ops checkpointed, " f"loading from checkpoint")
+                loaded = ckpt_manager.load_checkpoint(
+                    latest_checkpoint[0],
+                    latest_checkpoint[1],
+                    partition_id,
+                    cfg=cfg,
+                )
+                if loaded is not None:
+                    return loaded.data.materialize()
+
+            # Determine resume point
+            start_op_idx = 0
+            partition_dataset = RayDataset(partition_data, cfg=cfg)
+
+            if latest_checkpoint:
+                loaded = ckpt_manager.load_checkpoint(
+                    latest_checkpoint[0],
+                    latest_checkpoint[1],
+                    partition_id,
+                    cfg=cfg,
+                )
+                if loaded is not None:
+                    partition_dataset = loaded
+                    start_op_idx = latest_checkpoint[0] + 1
+                    task_logger.info(f"[Partition {partition_id}] Resuming from op " f"{start_op_idx}")
+
+            # Process ops one-by-one with checkpointing
+            remaining_ops = task_ops[start_op_idx:]
+            for rel_idx, op in enumerate(remaining_ops):
+                abs_idx = start_op_idx + rel_idx
+                task_logger.info(f"[Partition {partition_id}] Processing op {abs_idx}: " f"{op._name}")
+                partition_dataset = partition_dataset.process([op])
+
+                # Checkpoint if needed
+                if ckpt_manager.should_checkpoint(abs_idx, op._name):
+                    partition_dataset.data = partition_dataset.data.materialize()
+                    ckpt_manager.save_checkpoint(
+                        partition_dataset.data,
+                        abs_idx,
+                        partition_id,
+                    )
+
+            # Final materialize
+            partition_dataset.data = partition_dataset.data.materialize()
+            return partition_dataset.data
+
+        # Submit tasks (skip empty partitions)
+        futures = {}
+        for i, partition in enumerate(partitions):
+            # Skip empty partitions to avoid wasting GPU resources
+            try:
+                row_count = partition.count()
+            except Exception:
+                row_count = -1  # can't determine, submit anyway
+            if row_count == 0:
+                logger.info(f"Partition {i}: empty (0 rows), skipping")
+                continue
+
+            # Check if partition is fully checkpointed before submitting
+            latest_ckpt = self.ckpt_manager.find_latest_checkpoint(i)
+            if latest_ckpt and latest_ckpt[0] >= len(ops) - 1:
+                logger.info(f"Partition {i}: already fully checkpointed, " f"loading from checkpoint")
+                loaded = self.ckpt_manager.load_checkpoint(latest_ckpt[0], latest_ckpt[1], i, cfg=self.cfg)
+                if loaded is not None:
+                    futures[i] = loaded.data.materialize()
+                    continue
+
+            self._log_event(
+                event_type=EventType.PARTITION_START,
+                message=f"Starting concurrent processing of partition " f"{i + 1}/{len(partitions)}",
+                partition_id=i,
+            )
+            futures[i] = _process_single_partition_task.remote(
+                partition,
+                i,
+                cfg,
+                max_conc,
+                ckpt_enabled,
+                ckpt_strategy,
+                ckpt_dir,
+                ckpt_n_ops,
+                ckpt_op_names,
+                op_fusion_enabled,
+            )
+
+        # Collect results
+        processed_partitions = []
+        for i in sorted(futures.keys()):
+            result = futures[i]
+            if isinstance(result, ray.ObjectRef):
+                try:
+                    result = ray.get(result)
+                    logger.info(f"Partition {i}: completed successfully")
+                except Exception as e:
+                    logger.error(f"Partition {i}: failed with error: {e}")
+                    raise
+            processed_partitions.append(result)
+            self._log_event(
+                event_type=EventType.PARTITION_COMPLETE,
+                message=f"Completed concurrent processing of partition " f"{i + 1}/{len(partitions)}",
+                partition_id=i,
+            )
+
+        # Union results
+        logger.info("Merging concurrently processed partitions...")
+        if not processed_partitions:
+            logger.warning("All partitions were empty or skipped. Returning an empty dataset.")
+            return RayDataset(ray.data.from_items([]), cfg=self.cfg)
+
+        if len(processed_partitions) == 1:
+            merged_dataset = processed_partitions[0]
+        else:
+            merged_dataset = processed_partitions[0]
+            for partition in processed_partitions[1:]:
+                merged_dataset = merged_dataset.union(partition)
+
+        return RayDataset(merged_dataset, cfg=self.cfg)
+
     def _process_with_convergence(self, dataset: RayDataset, ops: List, convergence_points: List[int]):
         """
         Process dataset with convergence support for global operations.
@@ -954,7 +1194,14 @@ def _split_dataset_deterministic(self, dataset: RayDataset) -> tuple:
         # Check for existing partitioning info (resumption case)
         saved_info = self._load_partitioning_info()
 
-        # Split the dataset
+        # Split using the dataset's natural block structure.  split()
+        # distributes existing blocks round-robin, so partitions inherit
+        # multiple blocks and Ray Data's streaming executor can pipeline
+        # stages within each partition.  Avoid repartition() here — it
+        # adds a costly shuffle and may reduce block count (e.g. 96 source
+        # blocks repartitioned to 32 loses parallelism).  If there are
+        # fewer blocks than partitions, some partitions will be empty —
+        # that's handled downstream (empty partitions are skipped).
         logger.info(f"Splitting dataset into {self.num_partitions} partitions (deterministic mode)...")
         partitions = dataset.data.split(self.num_partitions)
         logger.info(f"Created {len(partitions)} partitions")
@@ -974,24 +1221,16 @@ def _split_dataset_deterministic(self, dataset: RayDataset) -> tuple:
                 self._clear_invalid_checkpoints()
                 saved_info = None
 
-        # Collect metadata for new partitions
-        logger.info("Collecting partition metadata for checkpoint validation...")
-        total_rows = sum(p.count() for p in partitions)
-        partition_metadata = []
-
-        for i, partition in enumerate(partitions):
-            meta = self._collect_partition_metadata(partition, i)
-            partition_metadata.append(meta)
-            logger.debug(f"Partition {i}: {meta.row_count} rows, hash={meta.first_row_hash[:8]}...")
-
+        # On first run, skip expensive metadata collection (count(), take())
+        # which triggers redundant pipeline executions on lazy datasets.
+        # Save only the partition count; full metadata is not needed until
+        # resume validation.
         partitioning_info = PartitioningInfo(
             num_partitions=self.num_partitions,
-            total_rows=total_rows,
-            partitions=partition_metadata,
+            total_rows=-1,  # unknown until processing completes
+            partitions=[],
             deterministic=True,
         )
-
-        # Save partitioning info
         self._save_partitioning_info(partitioning_info)
 
         return partitions, partitioning_info
diff --git a/docs/design/parallel_partition_actor_reuse.md b/docs/design/parallel_partition_actor_reuse.md
new file mode 100644
index 0000000000..7284922467
--- /dev/null
+++ b/docs/design/parallel_partition_actor_reuse.md
@@ -0,0 +1,363 @@
+# Design Doc: Concurrent Partition Processing with GPU Scoping
+
+**Author:** Data-Juicer Team
+**Created:** 2026-03-09
+**Updated:** 2026-03-17
+**Status:** Implemented
+**Branch:** `feat/cyrusz/parallel-partition-actor-reuse`
+
+---
+
+## 1. Problem Statement
+
+### Current Behavior (Before This Change)
+
+The `PartitionedRayExecutor` processes partitions **sequentially**, creating new GPU actors for each partition:
+
+```
+Partition 1 → [Create Actors] → [Load Models] → [Process] → [Actors GC'd]
+Partition 2 → [Create Actors] → [Load Models] → [Process] → [Actors GC'd]
+Partition 3 → [Create Actors] → [Load Models] → [Process] → [Actors GC'd]
+```
+
+### Problems
+
+1. **Repeated Model Loading**: Heavy GPU models (e.g., VideoBLIP ~20GB) are loaded N times for N partitions
+2. **GPU Idle Time**: GPUs sit idle between partitions during actor teardown/creation
+3. **Poor Scalability**: Processing time scales linearly with partition count due to model loading overhead
+
+### Impact
+
+For a typical video processing pipeline with 3 GPU operators and 10 partitions:
+- Model loading time: ~60s per operator × 3 operators × 10 partitions = **30 minutes of pure overhead**
+- This overhead can exceed actual processing time for smaller datasets
+
+---
+
+## 2. Implemented Solution: Concurrent Partition Processing
+
+### Overview
+
+Instead of sequential processing with shared actor pools (originally proposed), we implemented **concurrent partition processing** where all partitions run in parallel as independent Ray remote tasks, each with its own scoped GPU actors:
+
+```
+┌──────────────────────────────────────────────────────────────────┐
+│                  Concurrent Partition Processing                   │
+│                                                                    │
+│  ┌──────────┐  ┌──────────┐  ┌──────────┐       ┌──────────┐    │
+│  │ Task P0  │  │ Task P1  │  │ Task P2  │  ...  │ Task P7  │    │
+│  │ 1 GPU    │  │ 1 GPU    │  │ 1 GPU    │       │ 1 GPU    │    │
+│  │ Actor    │  │ Actor    │  │ Actor    │       │ Actor    │    │
+│  └──────────┘  └──────────┘  └──────────┘       └──────────┘    │
+│       ↕              ↕              ↕                  ↕          │
+│    GPU 0          GPU 1          GPU 2             GPU 7          │
+└──────────────────────────────────────────────────────────────────┘
+
+All partitions processed concurrently, each with its own scoped actor
+```
+
+### Why Concurrent Instead of Sequential + Actor Reuse
+
+The original design proposed sequential processing with detached shared actor pools. During implementation, we chose concurrent processing because:
+
+1. **Simpler architecture**: No need for detached actor lifecycle management, pool coordination, or cross-partition actor sharing
+2. **Better GPU utilization**: All GPUs are busy simultaneously instead of sequentially
+3. **Natural Ray fit**: Each partition is a self-contained Ray remote task — no complex orchestration
+4. **Same model loading cost**: Each GPU loads the model once per partition, but all load concurrently (~60s wall time vs. N × 60s sequential)
+5. **Maintained benefits**: Checkpointing, resume, and memory control per partition are all preserved
+
+### Key Design Principles
+
+1. **Concurrent partition processing**: All partitions run in parallel (up to `max_concurrent_partitions`)
+2. **Concurrency scoping**: Each partition's GPU ops get `num_proc = total_gpus // max_concurrent_partitions` actors
+3. **Forced actor mode**: GPU ops are set to `ray_execution_mode = "actor"` inside the remote task (where CUDA is not visible)
+4. **Per-partition checkpointing**: Each remote task manages its own checkpoint state
+5. **Resume support**: Skip completed partitions on restart
+
+---
+
+## 3. Detailed Design
+
+### 3.1 Architecture
+
+```
+┌─────────────────────────────────────────────────────────────────────────┐
+│                    PartitionedRayExecutor                                │
+├─────────────────────────────────────────────────────────────────────────┤
+│                                                                         │
+│  ┌─────────────────────────────────────────────────────────────────┐   │
+│  │                 _process_partitions_concurrent()                  │   │
+│  │                                                                   │   │
+│  │  1. Extract serializable config values                           │   │
+│  │  2. Submit Ray remote tasks (one per partition)                  │   │
+│  │  3. Collect results, union partitions                            │   │
+│  └─────────────────────────────────────────────────────────────────┘   │
+│                              │                                          │
+│              ┌───────────────┼───────────────┐                          │
+│              ▼               ▼               ▼                          │
+│  ┌─────────────────┐ ┌─────────────┐ ┌─────────────┐                  │
+│  │ Remote Task P0  │ │ Remote Task │ │ Remote Task │ ...              │
+│  │                 │ │ P1          │ │ P2          │                  │
+│  │ - load_ops()    │ │             │ │             │                  │
+│  │ - force actor   │ │  (same)     │ │  (same)     │                  │
+│  │   mode for GPU  │ │             │ │             │                  │
+│  │ - scope conc.   │ │             │ │             │                  │
+│  │ - process data  │ │             │ │             │                  │
+│  │ - checkpoint    │ │             │ │             │                  │
+│  └─────────────────┘ └─────────────┘ └─────────────┘                  │
+│                                                                         │
+└─────────────────────────────────────────────────────────────────────────┘
+```
+
+### 3.2 Execution Flow
+
+```
+Phase 1: Dataset Splitting
+──────────────────────────
+
+Job Start
+    │
+    ▼
+┌──────────────────────────┐
+│ Repartition to N blocks  │  Ensure enough blocks for N partitions
+└───────────┬──────────────┘
+            │
+            ▼
+┌──────────────────────────┐
+│ Split into N partitions  │  Each partition gets ~equal rows
+└───────────┬──────────────┘
+
+
+Phase 2: Concurrent Processing
+──────────────────────────────
+            │
+    ┌───────┼───────┬───────┬───── ... ─────┐
+    ▼       ▼       ▼       ▼               ▼
+┌──────┐┌──────┐┌──────┐┌──────┐       ┌──────┐
+│ P0   ││ P1   ││ P2   ││ P3   │       │ P7   │
+│1 GPU ││1 GPU ││1 GPU ││1 GPU │       │1 GPU │
+└──┬───┘└──┬───┘└──┬───┘└──┬───┘       └──┬───┘
+   │       │       │       │               │
+   ▼       ▼       ▼       ▼               ▼
+ [Load]  [Load]  [Load]  [Load]  ...    [Load]   ← Models load concurrently
+   │       │       │       │               │
+   ▼       ▼       ▼       ▼               ▼
+[Process][Process][Process][Process]    [Process] ← All GPUs busy
+   │       │       │       │               │
+   ▼       ▼       ▼       ▼               ▼
+ [Ckpt]  [Ckpt]  [Ckpt]  [Ckpt]  ...  [Ckpt]    ← Per-partition checkpoint
+
+
+Phase 3: Merge Results
+──────────────────────
+    └───────┴───────┴───────┴───── ... ─────┘
+                        │
+                        ▼
+              ┌──────────────────┐
+              │  Union partitions │
+              └──────────────────┘
+                        │
+                        ▼
+                    Job End
+```
+
+### 3.3 Concurrency Scoping
+
+The critical mechanism that prevents GPU over-allocation:
+
+```python
+# Inside each remote task:
+for op in task_ops:
+    # Step 1: Force actor mode (MUST be before scope_op_concurrency)
+    if getattr(op, "num_gpus", 0) and op.num_gpus > 0:
+        op.ray_execution_mode = "actor"
+
+    # Step 2: Scope concurrency — divides num_proc by max_concurrent_partitions
+    op.num_proc = scope_op_concurrency(op, max_concurrent_partitions)
+```
+
+**Why order matters:**
+- The remote task runs on a CPU-only node (no GPU assigned to the task itself)
+- `torch.cuda.is_available()` returns `False` in the remote task
+- Without explicitly setting `ray_execution_mode = "actor"`, `use_ray_actor()` returns `False`
+- `scope_op_concurrency()` only divides `num_proc` for actor-mode ops
+- If actor mode is not set first, `num_proc` stays at the full value (e.g., 8), causing each partition to request all 8 GPUs → deadlock
+
+**Example with 8 GPUs, 8 partitions:**
+- `num_proc` original = 8 (wants 8 GPU actors)
+- `scope_op_concurrency(op, 8)` → `8 // 8 = 1` (1 GPU actor per partition)
+- 8 partitions × 1 GPU = 8 GPUs total → fits exactly
+
+### 3.4 Remote Task Design
+
+Each partition is processed by an independent `@ray.remote(num_cpus=0)` task that:
+
+1. **Re-creates ops from config** — avoids serialization issues with GPU operator state
+2. **Forces actor mode** — sets `ray_execution_mode = "actor"` for GPU ops
+3. **Scopes concurrency** — divides `num_proc` by `max_concurrent_partitions`
+4. **Manages its own checkpoints** — creates a local `RayCheckpointManager`
+5. **Handles resume** — checks for existing checkpoints before processing
+
+The task requests `num_cpus=0` because the actual compute is done by Ray Data actors/tasks spawned within.
+
+### 3.5 Dataset Splitting
+
+```python
+# Repartition to ensure enough blocks, then split
+dataset.data = dataset.data.repartition(self.num_partitions)
+partitions = dataset.data.split(self.num_partitions)
+```
+
+- `repartition(N)` ensures at least N blocks exist (lazy, adds a shuffle stage)
+- `split(N)` distributes blocks across N independent `Dataset` objects
+- Without repartition, split may produce empty partitions if there are fewer blocks than partitions
+
+---
+
+## 4. Configuration
+
+```yaml
+partition:
+  mode: 'auto'                           # 'auto' | 'manual'
+  num_of_partitions: 8                   # Number of partitions
+  max_concurrent_partitions: 8           # Max partitions running in parallel
+
+checkpoint:
+  enabled: true
+  dir: './checkpoints'
+  strategy: 'per_op'                     # Checkpoint after each operator
+```
+
+The `max_concurrent_partitions` parameter controls how many partitions run simultaneously and how GPU resources are divided. Setting it equal to the number of GPUs (one partition per GPU) is typical for GPU-bound workloads.
+
+---
+
+## 5. Performance Comparison
+
+### Timeline: Sequential vs Concurrent
+
+**Before (Sequential, no actor reuse):**
+```
+Time ────────────────────────────────────────────────────────────────────▶
+
+P0: [Load 60s][Process 120s][GC]
+P1:                              [Load 60s][Process 120s][GC]
+P2:                                                           [Load 60s][Process 120s]
+
+Total: 3 × (60 + 120) = 540s
+GPU idle: ~67% of total time
+```
+
+**After (Concurrent, 8 partitions on 8 GPUs):**
+```
+Time ────────────────────────────────────────────────────────────────────▶
+
+P0: [Load 60s][Process 120s]
+P1: [Load 60s][Process 120s]     ← All load concurrently
+P2: [Load 60s][Process 120s]
+...
+P7: [Load 60s][Process 120s]
+
+Total: 60 + 120 = 180s (wall time)
+GPU idle: ~0% during processing
+```
+
+### Observed Results
+
+**Setup:** 8× A100 80GB, 6000 video samples, VideoAestheticsFilter
+
+| Mode | Time | GPU Utilization |
+|------|------|-----------------|
+| Pure GPU (no partitioning) | ~1100s | 100% on all 8 GPUs |
+| Concurrent partitions (8) | ~1100-1300s | 100% on all 8 GPUs |
+| Sequential (old, deadlocked) | ∞ (deadlock) | 8/8 GPU allocated, 14+ pending |
+
+The concurrent approach matches pure GPU mode performance while adding partition-level checkpointing and resume capability.
+
+---
+
+## 6. Checkpointing and Resume
+
+### Checkpoint Structure
+
+Each remote task manages its own checkpoints:
+
+```
+checkpoints/
+├── partitioning_info.json        # Partition metadata for validation
+├── partition_0/
+│   ├── op_0_video_aesthetics_filter/
+│   │   ├── data.parquet
+│   │   └── _SUCCESS
+│   └── ...
+├── partition_1/
+│   └── ...
+└── ...
+```
+
+### Resume Flow
+
+```
+Resume from Crash (Partition 2 was in progress)
+──────────────────────────────────────────────
+
+1. Load partitioning_info.json
+2. Validate current partitions match saved metadata
+3. Submit all partition tasks concurrently
+4. Each task independently:
+   - Checks its own checkpoint state
+   - Skips completed ops (loads from checkpoint)
+   - Resumes from last incomplete op
+5. Collect results and union
+```
+
+---
+
+## 7. Error Handling
+
+### Partition Task Failure
+
+If a remote task fails:
+- Other partitions continue processing independently
+- Failed partition's actors are cleaned up by Ray
+- On retry/resume, the failed partition restarts from its last checkpoint
+
+### GPU Resource Deadlock Prevention
+
+The concurrency scoping mechanism prevents deadlock by ensuring:
+- Total GPU requests across all concurrent partitions ≤ available GPUs
+- `num_proc` is divided by `max_concurrent_partitions` for actor-mode ops
+- Actor mode is set before scoping (critical ordering requirement)
+
+---
+
+## 8. Known Limitations and Future Work
+
+1. **No actor reuse across partitions**: Each partition loads models independently. For workloads dominated by model loading time, a shared actor pool approach (the original design) could reduce overhead.
+
+2. **Repartition cost**: `repartition()` adds a shuffle stage. For large datasets this is cheap relative to processing, but for small datasets it adds overhead.
+
+3. **Single block per partition**: After split, each partition typically has one block, which means the entire partition is processed as a single batch by the actor. This prevents streaming output — no progress is visible until the whole partition completes.
+
+4. **`max_concurrent_partitions` tuning**: Must be ≤ available GPUs for GPU-bound workloads. Auto-detection sets it to the GPU count, but mixed CPU/GPU pipelines may benefit from different values.
+
+---
+
+## 9. Design Decision Log
+
+| Decision | Choice | Rationale |
+|----------|--------|-----------|
+| Sequential vs concurrent | Concurrent | Better GPU utilization, simpler architecture |
+| Shared actors vs per-partition | Per-partition | Avoids detached actor lifecycle complexity |
+| Repartition before split | Always repartition | Avoids materializing dataset to check num_blocks |
+| Actor mode + scoping order | Actor mode first | Required for scope_op_concurrency to work correctly |
+| Remote task num_cpus | 0 | Task is just an orchestrator; actual compute uses Ray Data actors |
+
+---
+
+## References
+
+- [Ray Actors Documentation](https://docs.ray.io/en/latest/ray-core/actors.html)
+- [Ray Data User Guide](https://docs.ray.io/en/latest/data/data.html)
+- Source: `data_juicer/core/executor/ray_executor_partitioned.py`
+- Source: `data_juicer/core/executor/concurrency_scoping.py`
diff --git a/perf-test.py b/perf-test.py
new file mode 100644
index 0000000000..81a81fb93c
--- /dev/null
+++ b/perf-test.py
@@ -0,0 +1,742 @@
+#!/usr/bin/env python3
+"""
+Simple single-operator benchmark to test data loading and Ray Data parallelism.
+Enhanced for debugging Ray/DataJuicer GPU actor initialization issues.
+"""
+
+import argparse
+import importlib
+import json
+import os
+import subprocess
+import sys
+import time
+from datetime import datetime
+
+from loguru import logger
+
+# ── Paths ─────────────────────────────────────────────────────────────────────
+DJ_CODE_PATH = "/mnt/workspace/yileiz/data-juicer"
+OUTPUT_DIR = "/mnt/workspace/yileiz/outputs/partitioned_ray/simple_workdir"
+MODEL_PATH = "/mnt/workspace/miaoxiang.zfr/models/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE"
+DEFAULT_CAPTION_JSONL = "/mnt/workspace/miaoxiang.zfr/data/Youku-AliceMind/caption_val_abs_6k.jsonl"
+DEFAULT_VIDEO_DIR = "/mnt/workspace/shurui.ksr/Project/data/modelscope/Youku-AliceMind/videos/caption"
+# ──────────────────────────────────────────────────────────────────────────────
+
+if os.path.exists(DJ_CODE_PATH):
+    sys.path.insert(0, DJ_CODE_PATH)
+
+
+def setup_logging(log_dir=None):
+    """Setup logging to file and console."""
+    if log_dir is None:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        log_dir = os.path.join(OUTPUT_DIR, f"run_{timestamp}")
+
+    os.makedirs(log_dir, exist_ok=True)
+    log_file = os.path.join(log_dir, "benchmark.log")
+
+    logger.remove()
+
+    logger.add(
+        sys.stdout,
+        level="INFO",
+        format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
+        colorize=True,
+    )
+
+    logger.add(
+        log_file,
+        level="DEBUG",
+        format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}",
+        rotation="100 MB",
+    )
+
+    logger.info(f"Log file: {log_file}")
+    return log_dir, log_file
+
+
+def monitor_gpu():
+    """Print GPU utilization."""
+    try:
+        result = subprocess.run(
+            ["nvidia-smi", "--query-gpu=index,name,utilization.gpu,memory.used,memory.total", "--format=csv,noheader"],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        logger.info(f"GPU Status:\n{result.stdout}")
+    except Exception as e:
+        logger.warning(f"Failed to query GPU status: {e}")
+
+
+def log_ray_paths():
+    """Print likely Ray log locations for easier debugging."""
+    ray_tmp = "/tmp/ray"
+    if os.path.exists(ray_tmp):
+        logger.info(f"Ray temp dir exists: {ray_tmp}")
+        logger.info("Check Ray logs under: /tmp/ray/session_latest/logs/")
+        logger.info("Ray Data logs often under: /tmp/ray/session_latest/logs/ray-data/")
+    else:
+        logger.warning("Ray temp dir /tmp/ray not found yet")
+
+
+def prepare_jsonl_from_caption(jsonl_path, video_base_dir, num_samples=None, output_path=None):
+    """Prepare JSONL with absolute video paths."""
+    if output_path is None:
+        output_path = jsonl_path.replace(".jsonl", "_abs.jsonl")
+
+    if os.path.exists(output_path):
+        logger.info(f"Output already exists: {output_path}")
+        return output_path
+
+    count = 0
+    missing = 0
+    with open(jsonl_path, "r") as f_in, open(output_path, "w") as f_out:
+        for line in f_in:
+            if num_samples and count >= num_samples:
+                break
+            sample = json.loads(line)
+            videos = sample.get("videos", [])
+            abs_videos = [os.path.join(video_base_dir, os.path.basename(v)) for v in videos]
+            if all(os.path.exists(v) for v in abs_videos):
+                out_sample = {"videos": abs_videos, "text": sample.get("caption", "")}
+                f_out.write(json.dumps(out_sample, ensure_ascii=False) + "\n")
+                count += 1
+            else:
+                missing += 1
+
+    logger.info(f"Created {output_path} with {count} samples, skipped {missing} missing-video samples")
+    return output_path
+
+
+def split_jsonl(jsonl_path, num_shards=96):
+    """Split JSONL into shards."""
+    shard_dir = jsonl_path.replace(".jsonl", f"_sharded_{num_shards}")
+    marker = os.path.join(shard_dir, "_DONE")
+
+    if os.path.exists(marker):
+        logger.info(f"Sharded data exists: {shard_dir}")
+        return shard_dir
+
+    os.makedirs(shard_dir, exist_ok=True)
+
+    writers = [open(os.path.join(shard_dir, f"shard_{i:04d}.jsonl"), "w") for i in range(num_shards)]
+
+    count = 0
+    try:
+        with open(jsonl_path, "r") as f_in:
+            for line in f_in:
+                writers[count % num_shards].write(line)
+                count += 1
+    finally:
+        for w in writers:
+            w.close()
+
+    with open(marker, "w") as f:
+        f.write(f"{count} samples\n")
+
+    logger.info(f"Split {count} samples into {num_shards} shards")
+    return shard_dir
+
+
+def require_module(module_name, pip_hint=None):
+    """Fail fast if module is missing."""
+    try:
+        return importlib.import_module(module_name)
+    except Exception as e:
+        hint = f" Please install it first: {pip_hint}" if pip_hint else ""
+        raise RuntimeError(f"Missing required module [{module_name}].{hint}\nOriginal error: {e}") from e
+
+
+def precheck_environment(fail_fast=True):
+    """
+    Precheck environment in driver process to avoid hanging inside Ray actors.
+    """
+    logger.info("=" * 80)
+    logger.info("Prechecking environment before starting Ray actors")
+    logger.info("=" * 80)
+
+    # Basic env
+    logger.info(f"Python executable: {sys.executable}")
+    logger.info(f"Python version: {sys.version}")
+    logger.info(f'HF_ENDPOINT={os.environ.get("HF_ENDPOINT")}')
+
+    # Model path
+    if not os.path.exists(MODEL_PATH):
+        msg = f"Model path does not exist: {MODEL_PATH}"
+        if fail_fast:
+            raise FileNotFoundError(msg)
+        logger.warning(msg)
+    else:
+        logger.info(f"Model path exists: {MODEL_PATH}")
+
+    # Required modules
+    require_module("torch", "pip install torch")
+    require_module("transformers", "pip install transformers")
+    require_module("ray", "pip install ray")
+    require_module("pyarrow", "pip install pyarrow")
+
+    # Torch / CUDA visibility
+    import torch
+
+    logger.info(f"torch version: {torch.__version__}")
+    logger.info(f"torch.cuda.is_available(): {torch.cuda.is_available()}")
+    logger.info(f"torch.cuda.device_count(): {torch.cuda.device_count()}")
+    if torch.cuda.is_available():
+        for i in range(torch.cuda.device_count()):
+            try:
+                logger.info(f"CUDA device {i}: {torch.cuda.get_device_name(i)}")
+            except Exception:
+                pass
+
+    logger.info("Environment precheck passed.")
+
+
+def init_ray(object_store_gb=300, num_gpus=8):
+    """Initialize Ray with better defaults."""
+    # Pre-import to avoid circular import issues in Ray workers
+    logger.info("Pre-importing modules to avoid fsspec issues in Ray workers...")
+    import fsspec
+    import fsspec.spec
+    import fsspec.utils  # noqa: F401
+
+    try:
+        from huggingface_hub import HfFileSystem  # noqa: F401
+    except ImportError:
+        pass  # OK if not available
+
+    import ray
+
+    if ray.is_initialized():
+        logger.info("Ray already initialized")
+        return
+
+    # Check if there's a running Ray cluster
+    ray_address = os.environ.get("RAY_ADDRESS")
+
+    if ray_address:
+        # Connect to specified cluster
+        logger.info(f"Connecting to Ray cluster at {ray_address}...")
+        ray.init(address=ray_address)
+        logger.info("Connected to existing Ray cluster")
+    else:
+        # Start a new local Ray instance
+        logger.info(f"Starting new Ray instance with {num_gpus} GPUs, {object_store_gb}GB object store...")
+        ray.init(
+            num_gpus=num_gpus,
+            object_store_memory=object_store_gb * 1024**3,
+        )
+        logger.info(f"Ray initialized successfully")
+
+    log_ray_paths()
+
+
+def run_simple_benchmark(
+    data_path,
+    num_shards=96,
+    num_partitions=8,
+    fail_fast=True,
+    executor_type="ray",
+):
+    """Run benchmark with DataJuicer + video_aesthetics_filter.
+
+    Args:
+        executor_type: 'ray' (standard, uses all GPUs) or 'ray_partitioned' (partitioned).
+            ray_partitioned auto-detects GPU count and runs partitions concurrently.
+    """
+    import ray  # noqa: F401
+    import yaml
+
+    from data_juicer.config import init_configs
+    from data_juicer.core.executor.ray_executor import RayExecutor
+    from data_juicer.core.executor.ray_executor_partitioned import (
+        PartitionedRayExecutor,
+    )
+
+    # Environment
+    os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+    os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")
+
+    # Fail fast before actors
+    precheck_environment(fail_fast=fail_fast)
+
+    # Initialize Ray
+    init_ray(object_store_gb=300)
+
+    # Shard data
+    if os.path.isfile(data_path):
+        data_path = split_jsonl(data_path, num_shards)
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    work_dir = os.path.join(OUTPUT_DIR, f"dj_run_{timestamp}")
+    os.makedirs(work_dir, exist_ok=True)
+
+    logger.info(f"Using executor type: {executor_type}")
+
+    # Detect available GPUs from Ray cluster
+    import ray as _ray
+
+    num_gpus = int(_ray.cluster_resources().get("GPU", 0))
+    if num_gpus <= 0:
+        raise RuntimeError("No GPUs available in Ray cluster")
+    logger.info(f"Detected {num_gpus} GPUs in Ray cluster")
+
+    # Base config
+    cfg_dict = {
+        "project_name": "simple-benchmark",
+        "executor_type": executor_type,
+        "dataset_path": data_path,
+        "export_path": os.path.join(work_dir, "result.jsonl"),
+        "work_dir": work_dir,
+        "video_key": "videos",
+        "skip_op_error": False,  # fail loudly
+        "use_cache": False,
+        "open_monitor": True,
+        "debug": False,
+        "auto_op_parallelism": False,  # Disable auto calculation to use explicit num_proc
+        "process": [
+            {
+                "video_aesthetics_filter": {
+                    "hf_scorer_model": MODEL_PATH,
+                    "trust_remote_code": True,
+                    "min_score": 0.4,
+                    "max_score": 1.0,
+                    "frame_num": 9223372036854775807,  # sys.maxsize - use all frames
+                    "reduce_mode": "avg",
+                    "skip_op_error": False,  # fail loudly during debugging
+                    "batch_mode": True,
+                    "num_gpus": 1,
+                    "num_proc": num_gpus,
+                },
+            },
+        ],
+    }
+
+    # Add partition config only for ray_partitioned executor
+    if executor_type == "ray_partitioned":
+        cfg_dict["partition"] = {
+            "mode": "manual",
+            "num_of_partitions": num_partitions,
+        }
+        cfg_dict["checkpoint"] = {
+            "enabled": False,
+        }
+
+    config_path = os.path.join(work_dir, "config.yaml")
+    with open(config_path, "w") as f:
+        yaml.dump(cfg_dict, f, allow_unicode=True, sort_keys=False)
+
+    logger.info(f"Config saved to {config_path}")
+    logger.info(f"Work dir: {work_dir}")
+    logger.info(f"Data path: {data_path}")
+    if executor_type == "ray_partitioned":
+        logger.info(f"Num partitions: {num_partitions}")
+
+    monitor_gpu()
+
+    cfg = init_configs(args=["--config", config_path])
+
+    t0 = time.time()
+    if executor_type == "ray":
+        executor = RayExecutor(cfg)
+    else:
+        executor = PartitionedRayExecutor(cfg)
+    logger.info(f"Executor init ({executor_type}): {time.time() - t0:.2f}s")
+
+    t1 = time.time()
+    try:
+        executor.run()
+    except Exception:
+        logger.exception("DataJuicer execution failed")
+        logger.error(f"Please inspect Ray logs under /tmp/ray/session_latest/logs/")
+        raise
+
+    logger.info(f"Processing: {time.time() - t1:.2f}s")
+    monitor_gpu()
+    logger.info(f"Total: {time.time() - t0:.2f}s")
+    logger.info(f"Output dir: {work_dir}")
+
+
+def run_ray_data_test(data_path, num_shards=96):
+    """Test raw Ray Data parallelism without DataJuicer."""
+    import ray
+
+    if os.path.isfile(data_path):
+        data_path = split_jsonl(data_path, num_shards)
+
+    init_ray(object_store_gb=100)
+
+    logger.info(f"Reading data from {data_path}")
+
+    t0 = time.time()
+    ds = ray.data.read_json(data_path)
+    count = ds.count()
+    try:
+        num_blocks = ds.num_blocks()
+    except Exception:
+        num_blocks = "unknown_before_materialize"
+    logger.info(f"Loaded dataset: {count} rows, {num_blocks} blocks")
+
+    def count_videos(row):
+        return {"video_count": len(row.get("videos", [])), "text_len": len(row.get("text", ""))}
+
+    t1 = time.time()
+    ds = ds.map(count_videos)
+    result = ds.take(5)
+    logger.info(f"Map result: {result}")
+    logger.info(f"Map time: {time.time() - t1:.2f}s")
+
+    t2 = time.time()
+    total = ds.count()
+    logger.info(f"Total rows: {total}, count time: {time.time() - t2:.2f}s")
+
+    logger.info(f"Total time: {time.time() - t0:.2f}s")
+
+
+def run_direct_gpu_test(
+    data_path,
+    num_shards=96,
+    batch_size=8,
+    gpu_concurrency=8,
+    fail_fast=True,
+):
+    """
+    Direct GPU test bypassing PartitionedRayExecutor.
+    This tests if Ray Data GPU actors work correctly.
+    """
+    import pyarrow
+    import ray
+
+    os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+    os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")
+
+    # Precheck before actor creation
+    precheck_environment(fail_fast=fail_fast)
+
+    init_ray(object_store_gb=300)
+
+    logger.info("Direct GPU Test - bypassing PartitionedRayExecutor")
+    monitor_gpu()
+
+    t0 = time.time()
+    if os.path.isfile(data_path):
+        data_path = split_jsonl(data_path, num_shards)
+
+    ds = ray.data.read_json(data_path)
+    row_count = ds.count()
+    logger.info(f"Loaded {row_count} rows in {time.time() - t0:.2f}s")
+
+    def add_stats_column(table: pyarrow.Table):
+        new_column_data = [{} for _ in range(len(table))]
+        return table.append_column("__dj__stats__", [new_column_data])
+
+    ds = ds.map_batches(add_stats_column, batch_format="pyarrow")
+    logger.info("Added __dj__stats__ column")
+
+    from data_juicer.ops.filter.video_aesthetics_filter import VideoAestheticsFilter
+
+    # Create operator on driver for validation only
+    op_t0 = time.time()
+    op = VideoAestheticsFilter(
+        hf_scorer_model=MODEL_PATH,
+        trust_remote_code=True,
+        min_score=0.4,
+        max_score=1.0,
+        frame_num=9223372036854775807,  # sys.maxsize - use all frames
+        reduce_mode="avg",
+        num_gpus=1,
+    )
+    logger.info(f"Operator init on driver: {time.time() - op_t0:.2f}s")
+    logger.info(f"Operator: {op._name}")
+    logger.info(f"  use_cuda: {op.use_cuda()}")
+    logger.info(f"  use_ray_actor: {op.use_ray_actor()}")
+    logger.info(f"  num_gpus: {op.num_gpus}")
+    logger.info(f"  num_proc: {op.num_proc}")
+
+    # Restrict concurrency to available GPUs
+    import torch
+
+    available_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
+    if available_gpus <= 0:
+        raise RuntimeError("No CUDA GPUs visible, cannot run direct GPU test")
+
+    gpu_concurrency = min(gpu_concurrency, available_gpus)
+    logger.info(f"Using gpu_concurrency={gpu_concurrency}, batch_size={batch_size}")
+
+    # Prefer new API style: concurrency=
+    t1 = time.time()
+    logger.info("Creating Ray Data GPU actor pipeline...")
+
+    try:
+        ds = ds.map_batches(
+            VideoAestheticsFilter,
+            fn_constructor_args=op._init_args,
+            fn_constructor_kwargs=op._init_kwargs,
+            batch_size=batch_size,
+            num_cpus=1,
+            num_gpus=1,
+            concurrency=gpu_concurrency,
+            batch_format="pyarrow",
+        )
+        logger.info("Using map_batches(..., concurrency=...)")
+    except TypeError:
+        # Fallback for older Ray versions
+        from ray.data import ActorPoolStrategy
+
+        logger.warning("Ray version does not support concurrency= here, fallback to ActorPoolStrategy")
+        ds = ds.map_batches(
+            VideoAestheticsFilter,
+            fn_constructor_args=op._init_args,
+            fn_constructor_kwargs=op._init_kwargs,
+            batch_size=batch_size,
+            num_cpus=1,
+            num_gpus=1,
+            compute=ActorPoolStrategy(size=gpu_concurrency),
+            batch_format="pyarrow",
+        )
+
+    logger.info("Executing pipeline...")
+    t2 = time.time()
+    try:
+        result = ds.materialize()
+    except Exception:
+        logger.exception("Direct GPU pipeline execution failed")
+        logger.error("Please inspect /tmp/ray/session_latest/logs/")
+        raise
+
+    logger.info(f"Pipeline execution: {time.time() - t2:.2f}s")
+
+    count = result.count()
+    logger.info(f"Result: {count} rows")
+
+    monitor_gpu()
+    logger.info(f"Total time: {time.time() - t0:.2f}s")
+    logger.info(f"Pipeline setup time: {time.time() - t1:.2f}s")
+
+
+def run_direct_gpu_test_dj_match(
+    data_path,
+    num_shards=96,
+    batch_size=10,  # DJ CUDA default
+    gpu_concurrency=8,
+    fail_fast=True,
+):
+    """
+    Direct GPU test that matches the DJ pipeline as closely as possible.
+    Adds: convert_to_absolute_paths, count(), columns(), filter step.
+    """
+    from functools import partial
+
+    import pyarrow
+    import ray
+
+    os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+    os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")
+
+    precheck_environment(fail_fast=fail_fast)
+    init_ray(object_store_gb=300)
+
+    logger.info("Direct GPU Test (DJ-matched pipeline)")
+    monitor_gpu()
+
+    t0 = time.time()
+    if os.path.isfile(data_path):
+        data_path = split_jsonl(data_path, num_shards)
+
+    ds = ray.data.read_json(data_path)
+
+    # --- Match DJ: count() before processing ---
+    t_count = time.time()
+    row_count = ds.count()
+    logger.info(f"count(): {row_count} rows in {time.time() - t_count:.2f}s")
+
+    # --- Match DJ: columns() ---
+    t_cols = time.time()
+    cols = ds.columns()
+    logger.info(f"columns(): {cols} in {time.time() - t_cols:.2f}s")
+
+    # --- Match DJ: convert_to_absolute_paths ---
+    dataset_dir = os.path.dirname(data_path)
+
+    def convert_to_absolute_paths(batch, dataset_dir, path_keys):
+        for key in path_keys:
+            if key in batch.column_names:
+                col = batch.column(key)
+                new_col = []
+                for val in col.to_pylist():
+                    if isinstance(val, list):
+                        new_col.append([os.path.join(dataset_dir, p) if not os.path.isabs(p) else p for p in val])
+                    elif isinstance(val, str):
+                        new_col.append(os.path.join(dataset_dir, val) if not os.path.isabs(val) else val)
+                    else:
+                        new_col.append(val)
+                idx = batch.column_names.index(key)
+                batch = batch.set_column(idx, key, [new_col])
+        return batch
+
+    path_keys = [k for k in ["videos", "images", "audios"] if k in cols]
+    if path_keys:
+        ds = ds.map_batches(
+            partial(convert_to_absolute_paths, dataset_dir=dataset_dir, path_keys=path_keys),
+            batch_format="pyarrow",
+            zero_copy_batch=True,
+            batch_size=1000,
+        )
+        logger.info(f"Added convert_to_absolute_paths for keys: {path_keys}")
+
+    # --- Match DJ: add __dj__stats__ column ---
+    def add_stats_column(table: pyarrow.Table):
+        new_column_data = [{} for _ in range(len(table))]
+        return table.append_column("__dj__stats__", [new_column_data])
+
+    ds = ds.map_batches(add_stats_column, batch_format="pyarrow", batch_size=1000)
+    logger.info("Added __dj__stats__ column")
+
+    # --- Match DJ: compute_stats via actor ---
+    from data_juicer.ops.filter.video_aesthetics_filter import VideoAestheticsFilter
+
+    op = VideoAestheticsFilter(
+        hf_scorer_model=MODEL_PATH,
+        trust_remote_code=True,
+        min_score=0.4,
+        max_score=1.0,
+        frame_num=9223372036854775807,
+        reduce_mode="avg",
+        num_gpus=1,
+        batch_mode=True,
+    )
+    logger.info(f"Op: {op._name}, batch_size={batch_size}, is_batched={op.is_batched_op()}")
+
+    import torch
+
+    available_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
+    if available_gpus <= 0:
+        raise RuntimeError("No CUDA GPUs visible")
+    gpu_concurrency = min(gpu_concurrency, available_gpus)
+    logger.info(f"gpu_concurrency={gpu_concurrency}, batch_size={batch_size}")
+
+    t1 = time.time()
+    ds = ds.map_batches(
+        VideoAestheticsFilter,
+        fn_constructor_args=op._init_args,
+        fn_constructor_kwargs=op._init_kwargs,
+        batch_size=batch_size,
+        num_gpus=1,
+        concurrency=gpu_concurrency,
+        batch_format="pyarrow",
+    )
+    logger.info("Added compute_stats map_batches (actor mode)")
+
+    # --- Match DJ: filter step ---
+    def filter_batch(batch, filter_func):
+        mask = pyarrow.array(filter_func(batch.to_pydict()))
+        return batch.filter(mask)
+
+    ds = ds.map_batches(
+        partial(filter_batch, filter_func=op.process),
+        batch_format="pyarrow",
+        zero_copy_batch=True,
+        batch_size=1000,
+    )
+    logger.info("Added filter_batch step")
+
+    # --- Execute ---
+    logger.info("Executing full DJ-matched pipeline...")
+    t2 = time.time()
+    try:
+        result = ds.materialize()
+    except Exception:
+        logger.exception("Pipeline execution failed")
+        raise
+
+    logger.info(f"Pipeline execution: {time.time() - t2:.2f}s")
+    count = result.count()
+    logger.info(f"Result: {count} rows (filtered from {row_count})")
+    monitor_gpu()
+    logger.info(f"Total time: {time.time() - t0:.2f}s")
+    logger.info(f"Pipeline time (from first map_batches): {time.time() - t1:.2f}s")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Simple benchmark")
+    parser.add_argument(
+        "--caption-jsonl",
+        type=str,
+        default=DEFAULT_CAPTION_JSONL,
+    )
+    parser.add_argument(
+        "--video-dir",
+        type=str,
+        default=DEFAULT_VIDEO_DIR,
+    )
+    parser.add_argument("--num-samples", type=int, default=6000)
+    parser.add_argument("--num-shards", type=int, default=96)
+    parser.add_argument("--partitions", type=int, default=8)
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument("--gpu-concurrency", type=int, default=8)
+    parser.add_argument("--fail-fast", action="store_true", default=True)
+    parser.add_argument("--no-fail-fast", dest="fail_fast", action="store_false")
+    parser.add_argument("--mode", type=str, choices=["ray", "dj", "gpu", "gpu-dj", "both"], default="gpu")
+    parser.add_argument(
+        "--executor",
+        type=str,
+        choices=["ray", "ray_partitioned"],
+        default="ray",
+        help='Executor type: "ray" (standard, parallel GPUs) or "ray_partitioned" (partitioned)',
+    )
+    args = parser.parse_args()
+
+    log_dir, log_file = setup_logging()
+    logger.info(f"Arguments: {args}")
+
+    jsonl_path = prepare_jsonl_from_caption(args.caption_jsonl, args.video_dir, args.num_samples)
+
+    if args.mode in ["ray", "both"]:
+        logger.info("\n" + "=" * 60)
+        logger.info("Testing Ray Data parallelism")
+        logger.info("=" * 60)
+        run_ray_data_test(jsonl_path, args.num_shards)
+
+    if args.mode in ["dj", "both"]:
+        logger.info("\n" + "=" * 60)
+        logger.info(f"Testing DataJuicer with single operator (executor={args.executor})")
+        logger.info("=" * 60)
+        run_simple_benchmark(
+            jsonl_path,
+            num_shards=args.num_shards,
+            num_partitions=args.partitions,
+            fail_fast=args.fail_fast,
+            executor_type=args.executor,
+        )
+
+    if args.mode == "gpu":
+        logger.info("\n" + "=" * 60)
+        logger.info("Testing Direct GPU (bypass PartitionedRayExecutor)")
+        logger.info("=" * 60)
+        run_direct_gpu_test(
+            jsonl_path,
+            num_shards=args.num_shards,
+            batch_size=args.batch_size,
+            gpu_concurrency=args.gpu_concurrency,
+            fail_fast=args.fail_fast,
+        )
+
+    if args.mode == "gpu-dj":
+        logger.info("\n" + "=" * 60)
+        logger.info("Testing Direct GPU (DJ-matched pipeline)")
+        logger.info("=" * 60)
+        run_direct_gpu_test_dj_match(
+            jsonl_path,
+            num_shards=args.num_shards,
+            batch_size=10,  # DJ CUDA default
+            gpu_concurrency=args.gpu_concurrency,
+            fail_fast=args.fail_fast,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 26519cbbc7..c88badaab3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -234,6 +234,7 @@ filterwarnings = [
 ]
 
 [tool.uv]
+constraint-dependencies = ["kaleido==0.2.1"]
 override-dependencies = [
     "opencv-python; sys_platform == 'never'",
     "opencv-python-headless; sys_platform == 'never'",
diff --git a/tests/core/executor/test_ray_executor_partitioned.py b/tests/core/executor/test_ray_executor_partitioned.py
index 9c6fec206b..2054f0c2a7 100644
--- a/tests/core/executor/test_ray_executor_partitioned.py
+++ b/tests/core/executor/test_ray_executor_partitioned.py
@@ -679,5 +679,221 @@ def test_dag_node_status_transitions(self):
             self.assertEqual(executor.pipeline_dag.nodes[node_id]["status"], "completed")
 
 
+class ConcurrencyScopingTest(DataJuicerTestCaseBase):
+    """Unit tests for scope_op_concurrency utility."""
+
+    def test_gpu_op_scoping(self):
+        """GPU op concurrency is divided by max_concurrent_partitions."""
+        from unittest.mock import MagicMock
+        from data_juicer.core.executor.concurrency_scoping import scope_op_concurrency
+
+        op = MagicMock()
+        op.use_ray_actor.return_value = True
+        op.num_proc = 4
+        self.assertEqual(scope_op_concurrency(op, 4), 1)
+        self.assertEqual(scope_op_concurrency(op, 2), 2)
+        self.assertEqual(scope_op_concurrency(op, 1), 4)
+
+    def test_gpu_op_scoping_floor_min_one(self):
+        """Scoped concurrency never goes below 1."""
+        from unittest.mock import MagicMock
+        from data_juicer.core.executor.concurrency_scoping import scope_op_concurrency
+
+        op = MagicMock()
+        op.use_ray_actor.return_value = True
+        op.num_proc = 2
+        self.assertEqual(scope_op_concurrency(op, 8), 1)
+
+    def test_cpu_op_unchanged(self):
+        """CPU ops (use_ray_actor=False) are not scoped."""
+        from unittest.mock import MagicMock
+        from data_juicer.core.executor.concurrency_scoping import scope_op_concurrency
+
+        op = MagicMock()
+        op.use_ray_actor.return_value = False
+        op.num_proc = 4
+        self.assertEqual(scope_op_concurrency(op, 4), 4)
+
+    def test_auto_mode_unchanged(self):
+        """Auto-mode (num_proc <= 0) is not scoped."""
+        from unittest.mock import MagicMock
+        from data_juicer.core.executor.concurrency_scoping import scope_op_concurrency
+
+        op = MagicMock()
+        op.use_ray_actor.return_value = True
+        op.num_proc = -1
+        self.assertEqual(scope_op_concurrency(op, 4), -1)
+
+    def test_none_num_proc_unchanged(self):
+        """None num_proc is not scoped."""
+        from unittest.mock import MagicMock
+        from data_juicer.core.executor.concurrency_scoping import scope_op_concurrency
+
+        op = MagicMock()
+        op.use_ray_actor.return_value = True
+        op.num_proc = None
+        self.assertIsNone(scope_op_concurrency(op, 4))
+
+    def test_resolve_max_concurrent_explicit_int(self):
+        """Explicit int values are passed through."""
+        from data_juicer.core.executor.ray_executor_partitioned import PartitionedRayExecutor
+        self.assertEqual(PartitionedRayExecutor._resolve_max_concurrent(4), 4)
+        self.assertEqual(PartitionedRayExecutor._resolve_max_concurrent(1), 1)
+        # Minimum clamp to 1
+        self.assertEqual(PartitionedRayExecutor._resolve_max_concurrent(0), 1)
+
+    def test_resolve_max_concurrent_auto(self):
+        """'auto' resolves to GPU count or 1."""
+        from data_juicer.core.executor.ray_executor_partitioned import PartitionedRayExecutor
+        result = PartitionedRayExecutor._resolve_max_concurrent("auto")
+        self.assertIsInstance(result, int)
+        self.assertGreaterEqual(result, 1)
+
+
+class ConcurrentPartitionConfigTest(DataJuicerTestCaseBase):
+    """Tests for max_concurrent_partitions config parsing."""
+
+    root_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', '..')
+
+    def setUp(self) -> None:
+        super().setUp()
+        unique_name = f'test_concurrent_cfg_{uuid.uuid4().hex[:8]}'
+        self.tmp_dir = os.path.join(self.root_path, 'tmp', unique_name)
+        os.makedirs(self.tmp_dir, exist_ok=True)
+
+    def tearDown(self) -> None:
+        super().tearDown()
+        if os.path.exists(self.tmp_dir):
+            shutil.rmtree(self.tmp_dir)
+
+    @TEST_TAG('ray')
+    def test_default_max_concurrent_partitions_auto(self):
+        """Default max_concurrent_partitions is 'auto', resolved from GPU count."""
+        cfg = init_configs([
+            '--config', os.path.join(self.root_path, 'demos/process_on_ray/configs/demo-new-config.yaml'),
+            '--partition.mode', 'manual',
+            '--partition.num_of_partitions', '4'
+        ])
+        cfg.export_path = os.path.join(self.tmp_dir, 'test_default_conc', 'res.jsonl')
+        cfg.work_dir = os.path.join(self.tmp_dir, 'test_default_conc')
+
+        executor = PartitionedRayExecutor(cfg)
+        # Auto-resolved: matches GPU count, or 1 if no GPUs
+        import ray as _ray
+        num_gpus = int(_ray.cluster_resources().get("GPU", 0))
+        if num_gpus > 1:
+            self.assertEqual(executor.max_concurrent_partitions, num_gpus)
+        else:
+            self.assertEqual(executor.max_concurrent_partitions, 1)
+
+    @TEST_TAG('ray')
+    def test_explicit_max_concurrent_partitions(self):
+        """Explicit max_concurrent_partitions is parsed correctly."""
+        cfg = init_configs([
+            '--config', os.path.join(self.root_path, 'demos/process_on_ray/configs/demo-new-config.yaml'),
+            '--partition.mode', 'manual',
+            '--partition.num_of_partitions', '8',
+            '--partition.max_concurrent_partitions', '4'
+        ])
+        cfg.export_path = os.path.join(self.tmp_dir, 'test_explicit_conc', 'res.jsonl')
+        cfg.work_dir = os.path.join(self.tmp_dir, 'test_explicit_conc')
+
+        executor = PartitionedRayExecutor(cfg)
+        self.assertEqual(executor.max_concurrent_partitions, 4)
+
+    @TEST_TAG('ray')
+    def test_num_partitions_inferred_from_max_concurrent(self):
+        """num_of_partitions is raised to max_concurrent_partitions when too low."""
+        cfg = init_configs([
+            '--config', os.path.join(self.root_path, 'demos/process_on_ray/configs/demo-new-config.yaml'),
+            '--partition.mode', 'manual',
+            '--partition.num_of_partitions', '2',
+            '--partition.max_concurrent_partitions', '8'
+        ])
+        cfg.export_path = os.path.join(self.tmp_dir, 'test_infer_partitions', 'res.jsonl')
+        cfg.work_dir = os.path.join(self.tmp_dir, 'test_infer_partitions')
+
+        executor = PartitionedRayExecutor(cfg)
+        # num_partitions should be raised to 8
+        self.assertEqual(executor.num_partitions, 8)
+        self.assertEqual(executor.max_concurrent_partitions, 8)
+
+    @TEST_TAG('ray')
+    def test_num_partitions_not_lowered(self):
+        """num_of_partitions is NOT lowered when already >= max_concurrent."""
+        cfg = init_configs([
+            '--config', os.path.join(self.root_path, 'demos/process_on_ray/configs/demo-new-config.yaml'),
+            '--partition.mode', 'manual',
+            '--partition.num_of_partitions', '16',
+            '--partition.max_concurrent_partitions', '8'
+        ])
+        cfg.export_path = os.path.join(self.tmp_dir, 'test_no_lower', 'res.jsonl')
+        cfg.work_dir = os.path.join(self.tmp_dir, 'test_no_lower')
+
+        executor = PartitionedRayExecutor(cfg)
+        self.assertEqual(executor.num_partitions, 16)
+        self.assertEqual(executor.max_concurrent_partitions, 8)
+
+    @TEST_TAG('ray')
+    def test_concurrent_execution_end2end(self):
+        """End-to-end test: concurrent partitions produce output."""
+        cfg = init_configs([
+            '--config', os.path.join(self.root_path, 'demos/process_on_ray/configs/demo-new-config.yaml'),
+            '--partition.mode', 'manual',
+            '--partition.num_of_partitions', '2',
+            '--partition.max_concurrent_partitions', '2'
+        ])
+        cfg.export_path = os.path.join(self.tmp_dir, 'test_conc_e2e', 'res.jsonl')
+        cfg.work_dir = os.path.join(self.tmp_dir, 'test_conc_e2e')
+
+        executor = PartitionedRayExecutor(cfg)
+        executor.run()
+
+        self.assertTrue(os.path.exists(cfg.export_path))
+
+    @TEST_TAG('ray')
+    def test_concurrent_with_checkpointing(self):
+        """Concurrent execution with checkpointing enabled."""
+        cfg = init_configs([
+            '--config', os.path.join(self.root_path, 'demos/process_on_ray/configs/demo-new-config.yaml'),
+            '--partition.mode', 'manual',
+            '--partition.num_of_partitions', '2',
+            '--partition.max_concurrent_partitions', '2',
+            '--checkpoint.enabled', 'true',
+            '--checkpoint.strategy', 'every_op'
+        ])
+        cfg.export_path = os.path.join(self.tmp_dir, 'test_conc_ckpt', 'res.jsonl')
+        cfg.work_dir = os.path.join(self.tmp_dir, 'test_conc_ckpt')
+
+        executor = PartitionedRayExecutor(cfg)
+        executor.run()
+
+        self.assertTrue(os.path.exists(cfg.export_path))
+
+        # Verify checkpoint files were created
+        checkpoint_dir = cfg.checkpoint_dir
+        if os.path.exists(checkpoint_dir):
+            checkpoint_files = [f for f in os.listdir(checkpoint_dir) if f.endswith('.parquet')]
+            self.assertGreater(len(checkpoint_files), 0, "No checkpoint files were created")
+
+    @TEST_TAG('ray')
+    def test_backward_compat_sequential(self):
+        """max_concurrent_partitions=1 uses sequential path (same as before)."""
+        cfg = init_configs([
+            '--config', os.path.join(self.root_path, 'demos/process_on_ray/configs/demo-new-config.yaml'),
+            '--partition.mode', 'manual',
+            '--partition.num_of_partitions', '2',
+            '--partition.max_concurrent_partitions', '1'
+        ])
+        cfg.export_path = os.path.join(self.tmp_dir, 'test_seq_compat', 'res.jsonl')
+        cfg.work_dir = os.path.join(self.tmp_dir, 'test_seq_compat')
+
+        executor = PartitionedRayExecutor(cfg)
+        self.assertEqual(executor.max_concurrent_partitions, 1)
+        executor.run()
+
+        self.assertTrue(os.path.exists(cfg.export_path))
+
+
 if __name__ == '__main__':
     unittest.main()