From 37a1f974324dd86afb8fe25070ef7d0988230428 Mon Sep 17 00:00:00 2001 From: tstadel <60758086+tstadel@users.noreply.github.com> Date: Fri, 26 Sep 2025 14:40:13 +0200 Subject: [PATCH] feat: add s3_key_meta_field --- .../components/downloaders/s3/s3_downloader.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/integrations/amazon_bedrock/src/haystack_integrations/components/downloaders/s3/s3_downloader.py b/integrations/amazon_bedrock/src/haystack_integrations/components/downloaders/s3/s3_downloader.py index d23ef371a..da53b8c04 100644 --- a/integrations/amazon_bedrock/src/haystack_integrations/components/downloaders/s3/s3_downloader.py +++ b/integrations/amazon_bedrock/src/haystack_integrations/components/downloaders/s3/s3_downloader.py @@ -39,6 +39,7 @@ def __init__( file_root_path: Optional[str] = None, file_extensions: Optional[List[str]] = None, file_name_meta_key: str = "file_name", + s3_key_meta_key: str = "file_name", max_workers: int = 32, max_cache_size: int = 100, ) -> None: @@ -94,6 +95,7 @@ def __init__( self.max_workers = max_workers self.max_cache_size = max_cache_size self.file_name_meta_key = file_name_meta_key + self.s3_key_meta_key = s3_key_meta_key self._storage: Optional[S3Storage] = None @@ -178,8 +180,16 @@ def _download_file(self, document: Document) -> Optional[Document]: f"Document missing required file name metadata key '{self.file_name_meta_key}'. Skipping download." ) return None + + s3_key = document.meta.get(self.s3_key_meta_key, file_name) + if not s3_key: + logger.warning( + f"Document missing required S3 key metadata key '{self.s3_key_meta_key}'. Skipping download." + ) + return None file_path = self.file_root_path / Path(file_name) + file_path.parent.mkdir(parents=True, exist_ok=True) if file_path.is_file(): # set access and modification time to now without redownloading the file @@ -187,7 +197,7 @@ def _download_file(self, document: Document) -> Optional[Document]: else: # we know that _storage is not None after warm_up() is called, but mypy does not know that - self._storage.download(key=file_name, local_file_path=file_path) # type: ignore[union-attr] + self._storage.download(key=s3_key, local_file_path=file_path) # type: ignore[union-attr] document.meta["file_path"] = str(file_path) return document