@@ -241,7 +241,10 @@ def _map_items_to_workers_sequentially(num_workers: int, user_items: List[Any])


 def _map_items_to_workers_weighted(
-    num_workers: int, user_items: List[Any], weights: Optional[List[int]] = None
+    num_workers: int,
+    user_items: List[Any],
+    weights: Optional[List[int]] = None,
+    file_size: bool = True,
 ) -> List[List[Any]]:
     # Associate the items to the workers based on number of nodes and node rank.
     weights = [1] * len(user_items) if weights is None else weights
@@ -255,7 +258,11 @@ def _map_items_to_workers_weighted(
     for worker_id, size in worker_weights.items():
         if worker_id not in worker_ids_this_node:
             continue
-        print(f"Worker {worker_id} gets {size / 1e6:.1f} MB ({len(worker_items[worker_id])} files)")
+
+        if file_size:
+            print(f"Worker {worker_id} gets {size / 1e6:.1f} MB ({len(worker_items[worker_id])} files)")
+        else:
+            print(f"Worker {worker_id} gets {len(worker_items[worker_id])} items for a total weight of {size}.")

     return [worker_items[worker_id] for worker_id in worker_ids_this_node]

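Note: a minimal sketch of what the new `file_size` flag changes, using hypothetical items and weights; the exact item-to-worker assignment below is illustrative, since the splitting logic itself is outside this hunk.

    # Hypothetical call: four items with user-provided weights, two workers.
    items = ["a", "b", "c", "d"]
    weights = [10, 1, 1, 8]
    worker_items = _map_items_to_workers_weighted(
        num_workers=2, user_items=items, weights=weights, file_size=False
    )
    # With file_size=False, the per-worker summary reports item counts and
    # total weight, e.g. "Worker 0 gets 2 items for a total weight of 11."
    # With file_size=True (the default), it reports megabytes, as before.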
@@ -769,6 +776,7 @@ def __init__(
         fast_dev_run: Optional[Union[bool, int]] = None,
         random_seed: Optional[int] = 42,
         reorder_files: bool = True,
+        weights: Optional[List[int]] = None,
     ):
         """The `DatasetOptimiser` provides an efficient way to process data across multiple machines into chunks to make
         training faster.
@@ -784,6 +792,8 @@ def __init__(
             random_seed: The random seed to be set before shuffling the data.
             reorder_files: By default, reorders the files by file size to distribute work equally among all workers.
                 Set this to ``False`` if the order in which samples are processed should be preserved.
+            weights: Provide a list of weights associated with the inputs.
+                This is used to evenly split the work among the workers.

         """
         self.input_dir = _resolve_dir(input_dir)
@@ -799,6 +809,7 @@ def __init__(
         self.error_queue: Queue = Queue()
         self.stop_queues: List[Queue] = []
         self.reorder_files = reorder_files
+        self.weights = weights

         # Ensure the input dir is the same across all nodes
         self.input_dir = broadcast_object("input_dir", self.input_dir)
@@ -827,7 +838,14 @@ def run(self, data_recipe: DataRecipe) -> None:
         if not isinstance(user_items, list):
             raise ValueError("The `prepare_structure` should return a list of item metadata.")

-        if self.reorder_files and self.input_dir.path:
+        if self.weights is not None:
+            if len(self.weights) != len(user_items):
+                raise ValueError("The provided weights length should match the inputs' length.")
+            workers_user_items = _map_items_to_workers_weighted(
+                num_workers=self.num_workers, user_items=user_items, weights=self.weights, file_size=False
+            )
+
+        elif self.reorder_files and self.input_dir.path:
             # TODO: Only do this on node 0, and broadcast the item sizes to the other nodes.
             item_sizes = _get_item_filesizes(user_items, base_path=self.input_dir.path)
             workers_user_items = _map_items_to_workers_weighted(
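Note: a minimal usage sketch of the new code path; the recipe class and the `input_dir` value are assumptions for illustration, not part of this diff. Passing `weights` bypasses the file-size lookup and reordering entirely and splits the items by the provided weights instead.

    # Hypothetical usage; MyRecipe and the input_dir value are assumed.
    optimiser = DatasetOptimiser(input_dir="data/", weights=[5, 1, 1, 3])
    optimiser.run(MyRecipe())
    # run() raises the ValueError added above if len(weights) does not match
    # the number of items returned by prepare_structure().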