Skip to content

Commit 7194311

Browse files
authored
fix: Add error handling for empty Parquet files while indexing and corresponding tests (#601)
* fix: Add error handling for empty Parquet files in ParquetDir iterator
* add test
* update
1 parent 2402f45 commit 7194311

File tree

2 files changed

+28
-0
lines changed

2 files changed

+28
-0
lines changed

src/litdata/utilities/parquet.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ def __iter__(self) -> Generator[Tuple[Dict[str, Any], int], None, None]:
3535
Yields:
3636
Generator[Tuple[str, int], None, None]: A generator yielding tuples of file name, file path, and order.
3737
"""
38+
if not self.files:
39+
raise RuntimeError(
40+
f"No Parquet files were found at '{self.dir.url or self.dir.path}'. "
41+
"Please verify that the provided path is correct and that it contains at least one .parquet file. "
42+
"If the files are located in a subdirectory, please specify the correct path."
43+
)
44+
3845
with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
3946
futures = {executor.submit(self.task, _file): (order, _file) for order, _file in enumerate(self.files)}
4047
for future in futures:

tests/streaming/test_parquet.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,3 +203,24 @@ def test_cache_dir_option(monkeypatch, huggingface_hub_fs_mock, default):
203203
pass
204204
# check chunk cache dir was filled
205205
assert len([f for f in os.listdir(ds.input_dir.path) if f.endswith(".parquet")]) == 5 # 5 chunks
206+
207+
208+
@pytest.mark.parametrize(
209+
("pq_url"),
210+
[
211+
"s3://some_bucket/some_path",
212+
"gs://some_bucket/some_path",
213+
"hf://datasets/some_org/some_repo/some_path",
214+
],
215+
)
216+
@patch("litdata.utilities.parquet._HF_HUB_AVAILABLE", True)
217+
@patch("litdata.streaming.downloader._HF_HUB_AVAILABLE", True)
218+
@patch("litdata.utilities.parquet._FSSPEC_AVAILABLE", True)
219+
def test_no_parquet_files(pq_url, tmpdir, huggingface_hub_fs_mock, fsspec_pq_mock):
220+
ls_mock = Mock()
221+
ls_mock.ls = Mock(side_effect=lambda *args, **kwargs: [])
222+
huggingface_hub_fs_mock.HfFileSystem = Mock(return_value=ls_mock)
223+
fsspec_pq_mock.filesystem = Mock(return_value=ls_mock)
224+
225+
with pytest.raises(RuntimeError, match="No Parquet files were found"):
226+
index_parquet_dataset(pq_url, cache_dir=tmpdir)

0 commit comments

Comments
 (0)