Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def __init__(
file_root_path: Optional[str] = None,
file_extensions: Optional[List[str]] = None,
file_name_meta_key: str = "file_name",
s3_key_meta_key: str = "file_name",
max_workers: int = 32,
max_cache_size: int = 100,
) -> None:
Expand Down Expand Up @@ -94,6 +95,7 @@ def __init__(
self.max_workers = max_workers
self.max_cache_size = max_cache_size
self.file_name_meta_key = file_name_meta_key
self.s3_key_meta_key = s3_key_meta_key

self._storage: Optional[S3Storage] = None

Expand Down Expand Up @@ -178,16 +180,24 @@ def _download_file(self, document: Document) -> Optional[Document]:
f"Document missing required file name metadata key '{self.file_name_meta_key}'. Skipping download."
)
return None

s3_key = document.meta.get(self.s3_key_meta_key, file_name)
if not s3_key:
logger.warning(
f"Document missing required S3 key metadata key '{self.s3_key_meta_key}'. Skipping download."
)
return None

file_path = self.file_root_path / Path(file_name)
file_path.parent.mkdir(parents=True, exist_ok=True)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just added this in case `file_name` contains a `/`, so that this does not fail.


if file_path.is_file():
# set access and modification time to now without redownloading the file
file_path.touch()

else:
# we know that _storage is not None after warm_up() is called, but mypy does not know that
self._storage.download(key=file_name, local_file_path=file_path) # type: ignore[union-attr]
self._storage.download(key=s3_key, local_file_path=file_path) # type: ignore[union-attr]

document.meta["file_path"] = str(file_path)
return document
Expand Down
Loading