Commit 19d9eab

Enable map over inputs without files input (#19285)
1 parent 4996965 commit 19d9eab

4 files changed: +80 −13 lines changed
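
This commit lets `map` run over items that contain no filepaths at all (for example a plain list of integers): no input directory is resolved and the download/remove machinery is skipped entirely. A minimal local sketch modelled on the new test added below; the output path, cache folder, and writer function are illustrative choices, not part of the changed API:

import os
import tempfile

from lightning.data.streaming.functions import map  # same import path the tests use


def write_index(output_dir: str, index: int) -> None:
    # The user function receives the output directory plus one element of `inputs`.
    with open(os.path.join(output_dir, f"{index}.txt"), "w") as f:
        f.write(str(index))


if __name__ == "__main__":
    cache_dir = os.path.join(tempfile.gettempdir(), "map_cache")
    output_dir = os.path.join(tempfile.gettempdir(), "map_output")
    os.makedirs(output_dir, exist_ok=True)

    # The new test points the optimizer cache folders at a temporary directory; same idea here.
    os.environ["DATA_OPTIMIZER_CACHE_FOLDER"] = cache_dir
    os.environ["DATA_OPTIMIZER_DATA_CACHE_FOLDER"] = cache_dir

    # Items are plain integers: no filepaths, hence no input_dir and nothing to download.
    map(write_index, list(range(5)), output_dir=output_dir, num_workers=1)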

Diff for: src/lightning/data/streaming/constants.py

+1 −1

@@ -26,7 +26,7 @@
 # This is required for full pytree serialization / deserialization support
 _TORCH_GREATER_EQUAL_2_1_0 = RequirementCache("torch>=2.1.0")
 _VIZ_TRACKER_AVAILABLE = RequirementCache("viztracer")
-_LIGHTNING_CLOUD_LATEST = RequirementCache("lightning-cloud>=0.5.57")
+_LIGHTNING_CLOUD_LATEST = RequirementCache("lightning-cloud>=0.5.58")
 _BOTO3_AVAILABLE = RequirementCache("boto3")
 
 # DON'T CHANGE ORDER

Diff for: src/lightning/data/streaming/data_processor.py

+15 −8

@@ -261,7 +261,7 @@ def _get_item_filesizes(items: List[Any], base_path: str = "") -> List[int]:
         flattened_item, _ = tree_flatten(item)
 
         num_bytes = 0
-        for index, element in enumerate(flattened_item):
+        for element in flattened_item:
             if isinstance(element, str) and element.startswith(base_path) and os.path.exists(element):
                 file_bytes = os.path.getsize(element)
                 if file_bytes == 0:
@@ -358,7 +358,7 @@ def _loop(self) -> None:
                         for uploader in self.uploaders:
                             uploader.join()
 
-                    if self.remove:
+                    if self.remove and self.input_dir.path is not None:
                         assert self.remover
                         self.remove_queue.put(None)
                         self.remover.join()
@@ -380,7 +380,7 @@ def _loop(self) -> None:
                 self.progress_queue.put((self.worker_index, self._counter))
                 self._last_time = time()
 
-            if self.remove:
+            if self.remove and self.input_dir.path is not None:
                 self.remove_queue.put(self.paths[index])
 
             try:
@@ -420,6 +420,13 @@ def _try_upload(self, filepath: Optional[str]) -> None:
         self.to_upload_queues[self._counter % self.num_uploaders].put(filepath)
 
     def _collect_paths(self) -> None:
+        if self.input_dir.path is None:
+            for index in range(len(self.items)):
+                self.ready_to_process_queue.put(index)
+            for _ in range(self.num_downloaders):
+                self.ready_to_process_queue.put(None)
+            return
+
         items = []
         for item in self.items:
             flattened_item, spec = tree_flatten(item)
@@ -456,6 +463,8 @@ def _collect_paths(self) -> None:
         self.items = items
 
     def _start_downloaders(self) -> None:
+        if self.input_dir.path is None:
+            return
         for _ in range(self.num_downloaders):
             to_download_queue: Queue = Queue()
             p = Process(
@@ -478,8 +487,9 @@ def _start_downloaders(self) -> None:
             self.to_download_queues[downloader_index].put(None)
 
     def _start_remover(self) -> None:
-        if not self.remove:
+        if not self.remove or self.input_dir.path is None:
             return
+
         self.remover = Process(
             target=_remove_target,
             args=(
@@ -548,9 +558,6 @@ def _handle_data_transform_recipe(self, index: int) -> None:
             for filename in filenames:
                 filepaths.append(os.path.join(directory, filename))
 
-        if len(filepaths) == 0:
-            raise RuntimeError("You haven't saved any files under the `output_dir`.")
-
         for filepath in filepaths:
             self._try_upload(filepath)
 
@@ -804,7 +811,7 @@ def run(self, data_recipe: DataRecipe) -> None:
         if not isinstance(user_items, list):
             raise ValueError("The `prepare_structure` should return a list of item metadata.")
 
-        if self.reorder_files:
+        if self.reorder_files and self.input_dir.path:
             # TODO: Only do this on node 0, and broadcast the item sizes to the other nodes.
             item_sizes = _get_item_filesizes(user_items, base_path=self.input_dir.path)
             workers_user_items = _map_items_to_workers_weighted(

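The heart of the change above: when `input_dir.path` is `None`, a worker skips path collection, downloaders, and the remover, and instead feeds the raw item indices straight into the processing queue, followed by one `None` sentinel per downloader slot so `_loop` still terminates. A simplified, standalone sketch of that dispatch step, not the actual `BaseWorker` method:

from multiprocessing import Queue
from typing import Any, List, Optional


def fill_ready_queue(
    items: List[Any],
    input_dir_path: Optional[str],
    num_downloaders: int,
    ready_to_process_queue: Queue,
) -> None:
    # Mirrors the new early-return branch in `_collect_paths`: with no input
    # directory there is nothing to download, so indices go straight to the
    # processing queue, terminated by one sentinel per downloader.
    if input_dir_path is None:
        for index in range(len(items)):
            ready_to_process_queue.put(index)
        for _ in range(num_downloaders):
            ready_to_process_queue.put(None)
        return
    # Otherwise the original path-collection / download pipeline runs (omitted here).
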
Diff for: src/lightning/data/streaming/functions.py

+19 −4

@@ -16,7 +16,7 @@
 from datetime import datetime
 from pathlib import Path
 from types import FunctionType
-from typing import Any, Callable, Optional, Sequence, Union
+from typing import Any, Callable, Dict, Optional, Sequence, Union
 
 import torch
 
@@ -30,16 +30,28 @@
 from torch.utils._pytree import tree_flatten
 
 
-def _get_input_dir(inputs: Sequence[Any]) -> str:
-    flattened_item, _ = tree_flatten(inputs[0])
+def _get_indexed_paths(data: Any) -> Dict[int, str]:
+    flattened_item, _ = tree_flatten(data)
 
     indexed_paths = {
         index: element
         for index, element in enumerate(flattened_item)
         if isinstance(element, str) and os.path.exists(element)
     }
 
+    return indexed_paths
+
+
+def _get_input_dir(inputs: Sequence[Any]) -> Optional[str]:
+    indexed_paths = _get_indexed_paths(inputs[0])
+
     if len(indexed_paths) == 0:
+        # Check whether the second element has any input_path
+        indexed_paths = _get_indexed_paths(inputs[1])
+        if len(indexed_paths) == 0:
+            return None
+
+        # Every element should have filepaths if any contains one.
         raise ValueError(f"The provided item {inputs[0]} didn't contain any filepaths.")
 
     absolute_path = str(Path(list(indexed_paths.values())[0]).resolve())
@@ -129,6 +141,7 @@ def map(
     machine: Optional[str] = None,
     num_downloaders: Optional[int] = None,
     reorder_files: bool = True,
+    error_when_not_empty: bool = False,
 ) -> None:
     """This function map a callbable over a collection of files possibly in a distributed way.
 
@@ -144,6 +157,7 @@ def map(
         num_downloaders: The number of downloaders per worker.
         reorder_files: By default, reorders the files by file size to distribute work equally among all workers.
             Set this to ``False`` if the order in which samples are processed should be preserved.
+        error_when_not_empty: Whether we should error if the output folder isn't empty.
 
     """
     if not isinstance(inputs, Sequence):
@@ -161,7 +175,8 @@ def map(
             " HINT: You can either use `/teamspace/s3_connections/...` or `/teamspace/datasets/...`."
         )
 
-    _assert_dir_is_empty(output_dir)
+    if error_when_not_empty:
+        _assert_dir_is_empty(output_dir)
 
     input_dir = _resolve_dir(_get_input_dir(inputs))
 

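With this refactor `_get_input_dir` can return `None`: the first item (and, as a fallback, the second) is scanned for existing filepaths via `_get_indexed_paths`, and only when neither contains one does `map` proceed without an input directory. A rough illustration of that detection rule, using the same `tree_flatten` + `os.path.exists` logic; the sample items are made up and the result of the first call depends on what exists on your machine:

import os

from torch.utils._pytree import tree_flatten


def contains_existing_paths(item) -> bool:
    # Same rule as `_get_indexed_paths`: flatten the item and look for strings
    # that point at files which actually exist on disk.
    flattened, _ = tree_flatten(item)
    return any(isinstance(el, str) and os.path.exists(el) for el in flattened)


print(contains_existing_paths({"image": "/etc/hostname", "label": 3}))  # True if that file exists
print(contains_existing_paths(42))                                      # False -> no input_dir resolved

The new `error_when_not_empty` flag makes the previously unconditional `_assert_dir_is_empty(output_dir)` check opt-in: by default, `map` no longer refuses to write into a non-empty output directory.
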
Diff for: tests/tests_data/streaming/test_data_processor.py

+45 −0

@@ -4,6 +4,7 @@
 from typing import Any, List
 from unittest import mock
 
+import lightning_cloud
 import numpy as np
 import pytest
 import torch
@@ -25,6 +26,7 @@
     _wait_for_file_to_exist,
 )
 from lightning.data.streaming.functions import LambdaDataTransformRecipe, map, optimize
+from lightning_cloud import resolver
 from lightning_utilities.core.imports import RequirementCache
 
 _PIL_AVAILABLE = RequirementCache("PIL")
@@ -872,3 +874,46 @@ def test_get_item_filesizes(tmp_path):
     assert os.path.getsize(tmp_path / "empty_file") == 0
     with pytest.raises(RuntimeError, match="has 0 bytes!"):
         _get_item_filesizes([str(tmp_path / "empty_file")])
+
+
+def map_fn_index(output_dir, index):
+    with open(os.path.join(output_dir, f"{index}.JPEG"), "w") as f:
+        f.write("Hello")
+
+
+@pytest.mark.skipif(condition=not _PIL_AVAILABLE or sys.platform == "win32", reason="Requires: ['pil']")
+def test_data_processing_map_without_input_dir(monkeypatch, tmpdir):
+    cache_dir = os.path.join(tmpdir, "cache")
+    output_dir = os.path.join(tmpdir, "target_dir")
+    os.makedirs(output_dir, exist_ok=True)
+    monkeypatch.setenv("DATA_OPTIMIZER_CACHE_FOLDER", cache_dir)
+    monkeypatch.setenv("DATA_OPTIMIZER_DATA_CACHE_FOLDER", cache_dir)
+
+    map(map_fn_index, list(range(5)), output_dir=output_dir, num_workers=1, reorder_files=True)
+
+    assert sorted(os.listdir(output_dir)) == ["0.JPEG", "1.JPEG", "2.JPEG", "3.JPEG", "4.JPEG"]
+
+
+@pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
+def test_map_error_when_not_empty(monkeypatch, tmpdir):
+    boto3 = mock.MagicMock()
+    client_s3_mock = mock.MagicMock()
+    client_s3_mock.list_objects_v2.return_value = {"KeyCount": 1, "Contents": []}
+    boto3.client.return_value = client_s3_mock
+    monkeypatch.setattr(resolver, "boto3", boto3)
+
+    with pytest.raises(RuntimeError, match="data and datasets are meant to be immutable"):
+        map(
+            map_fn,
+            [0, 1],
+            output_dir=lightning_cloud.resolver.Dir(path=None, url="s3://bucket"),
+            error_when_not_empty=True,
+        )
+
+    with pytest.raises(OSError, match="cache"):
+        map(
+            map_fn,
+            [0, 1],
+            output_dir=lightning_cloud.resolver.Dir(path=None, url="s3://bucket"),
+            error_when_not_empty=False,
+        )
