From 344b0d3da64c3b374d085d0c5597a035871243e2 Mon Sep 17 00:00:00 2001 From: Dominic Kempf Date: Tue, 1 Oct 2024 16:50:07 +0200 Subject: [PATCH 1/5] Check all available checksum algorithm in DataVerse registry population --- pooch/downloaders.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pooch/downloaders.py b/pooch/downloaders.py index 9500389b..ffa8b31e 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -1158,6 +1158,9 @@ def populate_registry(self, pooch): """ for filedata in self.api_response.json()["data"]["latestVersion"]["files"]: - pooch.registry[filedata["dataFile"]["filename"]] = ( - f"md5:{filedata['dataFile']['md5']}" - ) + for algorithm in ["md5", "sha1", "sha256", "sha512"]: + if algorithm in filedata["dataFile"]: + pooch.registry[filedata["dataFile"]["filename"]] = ( + f"{algorithm}:{filedata['dataFile'][algorithm]}" + ) + break From 6d7d6670140e680469bf463c23847b2d22706fbb Mon Sep 17 00:00:00 2001 From: Dominic Kempf Date: Wed, 13 Nov 2024 14:24:28 +0100 Subject: [PATCH 2/5] Also check for checksums in a checksum subdictionary --- pooch/downloaders.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pooch/downloaders.py b/pooch/downloaders.py index ffa8b31e..570795e3 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -1159,8 +1159,20 @@ def populate_registry(self, pooch): for filedata in self.api_response.json()["data"]["latestVersion"]["files"]: for algorithm in ["md5", "sha1", "sha256", "sha512"]: + # Look for the checksum directly in the file dictionary if algorithm in filedata["dataFile"]: pooch.registry[filedata["dataFile"]["filename"]] = ( f"{algorithm}:{filedata['dataFile'][algorithm]}" ) break + # Look for the checksum in the checksum sub-dictionary + if algorithm in filedata["dataFile"].get("checksum", {}): + pooch.registry[filedata["dataFile"]["filename"]] = ( + f"{algorithm}:{filedata['dataFile']['checksum'][algorithm]}" + ) + break + + raise ValueError( + f"Checksum for file '{filedata['dataFile']['filename']}'" + " not found in the DataVerse API response." + ) From 57459bdef9b0dfe009b6b2b5476df507ed23e81c Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Mon, 25 Nov 2024 15:20:40 -0800 Subject: [PATCH 3/5] Fix previous implementation Restructure how the two APIs were being supported: the previous implementation would raise an error when "md5" and "checksum" were not available in the response, and not check for any other algorithms. Correctly parse the checksum algorithm that is available in the new Dataverse API: we need to convert "SHA-1" to "sha1" to name one. --- pooch/downloaders.py | 52 +++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/pooch/downloaders.py b/pooch/downloaders.py index 570795e3..0a78f247 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -1157,22 +1157,44 @@ def populate_registry(self, pooch): The pooch instance that the registry will be added to. """ - for filedata in self.api_response.json()["data"]["latestVersion"]["files"]: - for algorithm in ["md5", "sha1", "sha256", "sha512"]: - # Look for the checksum directly in the file dictionary - if algorithm in filedata["dataFile"]: - pooch.registry[filedata["dataFile"]["filename"]] = ( - f"{algorithm}:{filedata['dataFile'][algorithm]}" - ) - break - # Look for the checksum in the checksum sub-dictionary - if algorithm in filedata["dataFile"].get("checksum", {}): - pooch.registry[filedata["dataFile"]["filename"]] = ( - f"{algorithm}:{filedata['dataFile']['checksum'][algorithm]}" - ) - break + for file in self.api_response.json()["data"]["latestVersion"]["files"]: + filedata = file["dataFile"] + # Support old API: algorithm listed in the dataFile key + algorithms = [ + k for k in {"md5", "sha1", "sha256", "sha512"} if k in filedata.keys() + ] + if algorithms: + (algorithm,) = algorithms + pooch.registry[filedata["filename"]] = ( + f"{algorithm}:{filedata[algorithm]}" + ) + + # Support new API + elif "checksum" in filedata.keys(): + algorithm = self._parse_hashing_algorithm(filedata["checksum"]["type"]) + pooch.registry[filedata["filename"]] = ( + f"{algorithm}:{filedata['checksum']['value']}" + ) + else: raise ValueError( - f"Checksum for file '{filedata['dataFile']['filename']}'" + f"Checksum for file '{filedata['filename']}'" " not found in the DataVerse API response." ) + + def _parse_hashing_algorithm(self, algorithm): + """ + Parse hashing algorithm in Dataverse API responses. + + Parse the algorithms (MD5, SHA-1, SHA-256, SHA-512, etc.) present under + the "checksum" key in Dataverse API responses to the corresponding ones + supported by Pooch. + """ + algorithm = algorithm.lower() + if algorithm == "sha-1": + return "sha1" + if algorithm == "sha-256": + return "sha256" + if algorithm == "sha-512": + return "sha512" + return algorithm From c03dba3b2d3a91a725f430606f1ee1ef04f4d2d3 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Mon, 25 Nov 2024 15:22:43 -0800 Subject: [PATCH 4/5] Test the new implementation for populate_registry --- pooch/tests/test_downloaders.py | 147 ++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/pooch/tests/test_downloaders.py b/pooch/tests/test_downloaders.py index 14e32f92..f3bebbde 100644 --- a/pooch/tests/test_downloaders.py +++ b/pooch/tests/test_downloaders.py @@ -543,3 +543,150 @@ def test_populate_registry(self, httpserver, tmp_path, api_response): # Populate registry downloader.populate_registry(puppy) assert puppy.registry == {self.file_name: f"md5:{self.file_checksum}"} + + +class MockOldResponse: + """ + Mock request response to test checksum algorithm in DataverseRepository. + + Use the old API where the checksum algorithm was listed as a key in the + file information. + """ + + status_code = 200 + + def __init__(self, algorithm: str, checksum_key=False): + """ + Parameters + ---------- + algorithm : str + Hashing algorithm that will be used in the fake repsonse. + checksum_key : bool, optional + Whether to add the 'checksum' key in the response (new API), or + use the hashing algorithm type as key (old API). + + Notes + ----- + + Old API response: + + .. code:: + + { + ... + "data": { + "files": [ + { + "label": "foobar.txt", + "dataFile": { + ... + "id": 12345, + "filename": "foobar.txt", + "md5": "0123456789abcdef", + } + } + ... + ] + } + } + + + New API response: + + .. code:: + + { + ... + "data": { + "files": [ + { + "label": "foobar.txt", + "dataFile": { + ... + "id": 12345, + "filename": "foobar.txt", + "checksum": { + "type": "MD5", + "value": "0123456789abcdef" + } + } + } + ... + ] + } + } + """ + + self._algorithm = algorithm + self._checksum_key = checksum_key + + @property + def _file(self) -> dict: + file_dict = { + "label": "foobar.txt", + "dataFile": { + "id": 12345, + "filename": "foobar.txt", + }, + } + checksum = "0123456789abcdef" + if self._checksum_key: + file_dict = { + "label": "foobar.txt", + "dataFile": { + "id": 12345, + "filename": "foobar.txt", + "checksum": { + "type": self._algorithm.upper(), + "value": checksum, + }, + }, + } + else: + file_dict = { + "label": "foobar.txt", + "dataFile": { + "id": 12345, + "filename": "foobar.txt", + self._algorithm: checksum, + }, + } + return file_dict + + def json(self) -> dict: + """ + Return a dictionary with a fake response. + """ + files = [self._file] + response = {"data": {"latestVersion": {"files": files}}} + return response + + +class MockDataverse(DataverseRepository): + """ + Mock class to test checksum algorithms in DataverseRepository. + """ + + def __init__(self, response): + super().__init__(doi=None, archive_url=None) + self._api_response = response + + +class TestDataversePopulateRegistry: + """ + Test checksum algorithms in Dataverse downloaders. + """ + + @pytest.mark.parametrize("checksum_key", (False, True), ids=["old_api", "new_api"]) + @pytest.mark.parametrize("algorithm", ["md5", "sha1", "sha256", "sha512"]) + def test_populate_registry(self, tmp_path, algorithm, checksum_key): + """ + Test populating registry of DataverseRepository. + + Test if the new and old APIs are supported. + """ + response = MockOldResponse(algorithm, checksum_key) + downloader = MockDataverse(response) + puppy = Pooch(path=tmp_path, base_url="https://foo.bar/") + downloader.populate_registry(puppy) + assert puppy.registry == {"foobar.txt": f"{algorithm}:0123456789abcdef"} From 87655d6c408cafcb7f24beab08fa1a333bc0763d Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Mon, 25 Nov 2024 15:34:40 -0800 Subject: [PATCH 5/5] Remove old docstring line --- pooch/tests/test_downloaders.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pooch/tests/test_downloaders.py b/pooch/tests/test_downloaders.py index f3bebbde..f0fd7cb7 100644 --- a/pooch/tests/test_downloaders.py +++ b/pooch/tests/test_downloaders.py @@ -548,9 +548,6 @@ def test_populate_registry(self, httpserver, tmp_path, api_response): class MockOldResponse: """ Mock request response to test checksum algorithm in DataverseRepository. - - Use the old API where the checksum algorithm was listed as a key in the - file information. """ status_code = 200