From 2430868e2a583a177fbd27b56f4b10dd0e66e820 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 18:17:04 +0100 Subject: [PATCH 01/48] Refactor: Simplify Path handling --- src/nplinker/genomics/antismash/antismash_downloader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index a02728f4..a95483ed 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -42,8 +42,7 @@ def download_and_extract_antismash_data( >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted") """ download_root = Path(download_root) - extract_root = Path(extract_root) - extract_path = extract_root / "antismash" / antismash_id + extract_path = Path(extract_root) / "antismash" / antismash_id try: if extract_path.exists(): From 1f072467054a02defa9dfb5c767f861fba712aac Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 18:17:42 +0100 Subject: [PATCH 02/48] Refactor: Improve extract path handling to ensure that non-empty dirs are not deleted --- .../genomics/antismash/antismash_downloader.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index a95483ed..8f4fe5b7 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -44,12 +44,12 @@ def download_and_extract_antismash_data( download_root = Path(download_root) extract_path = Path(extract_root) / "antismash" / antismash_id - try: - if extract_path.exists(): - _check_extract_path(extract_path) - else: - extract_path.mkdir(parents=True, exist_ok=True) + if extract_path.exists(): + _check_extract_path(extract_path) + else: + 
extract_path.mkdir(parents=True, exist_ok=True) + try: for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: url = base_url.format(antismash_id, antismash_id + ".zip") download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip") From 8d7238c9cbd4007ea96d7f376fb7f283570c5d97 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 18:18:16 +0100 Subject: [PATCH 03/48] Refactor: Extract file cleanup logic into a separate function for better readability --- .../antismash/antismash_downloader.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 8f4fe5b7..93f22fc3 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -55,15 +55,7 @@ def download_and_extract_antismash_data( download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip") break - # delete subdirs - for subdir_path in list_dirs(extract_path): - shutil.rmtree(subdir_path) - - # delete unnecessary files - files_to_keep = list_files(extract_path, suffix=(".json", ".gbk")) - for file in list_files(extract_path): - if file not in files_to_keep: - os.remove(file) + _cleanup_extracted_files(extract_path) logger.info("antiSMASH BGC data of %s is downloaded and extracted.", antismash_id) @@ -77,3 +69,15 @@ def _check_extract_path(extract_path: Path): # check if extract_path is empty if any(extract_path.iterdir()): raise ValueError(f'Nonempty directory: "{extract_path}"') + + +def _cleanup_extracted_files(extract_path: str | PathLike) -> None: + # delete subdirs + for subdir_path in list_dirs(extract_path): + shutil.rmtree(subdir_path) + + # delete unnecessary files + files_to_keep = list_files(extract_path, suffix=(".json", ".gbk")) + for file in list_files(extract_path): + if file not in files_to_keep: + 
os.remove(file) From d5e64eb57bb6948ce07315438fd838f38eb84454 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 18:19:35 +0100 Subject: [PATCH 04/48] Refactor: Move extract_path preparation logic into a seperate function for better readabiliy --- .../genomics/antismash/antismash_downloader.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 93f22fc3..f2397ea8 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -44,10 +44,7 @@ def download_and_extract_antismash_data( download_root = Path(download_root) extract_path = Path(extract_root) / "antismash" / antismash_id - if extract_path.exists(): - _check_extract_path(extract_path) - else: - extract_path.mkdir(parents=True, exist_ok=True) + _prepare_extract_path(extract_path) try: for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: @@ -81,3 +78,10 @@ def _cleanup_extracted_files(extract_path: str | PathLike) -> None: for file in list_files(extract_path): if file not in files_to_keep: os.remove(file) + + +def _prepare_extract_path(extract_path: str | PathLike) -> None: + if extract_path.exists(): + _check_extract_path(extract_path) + else: + extract_path.mkdir(parents=True, exist_ok=True) From 66f13d6d57639e87036b684a59be6e6d183b7ad3 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 19:17:41 +0100 Subject: [PATCH 05/48] Refactor: Separate genome assembly resolution and antiSMASH data retrieval into distinct functions for improved clarity --- .../antismash/podp_antismash_downloader.py | 92 ++++++++++++++----- 1 file changed, 69 insertions(+), 23 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 84e3cee4..5b9ded97 100644 --- 
a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -179,33 +179,18 @@ def podp_download_and_extract_antismash_data( logger.info(f"Genome ID {raw_genome_id} skipped due to previous failed attempt") continue - # if not downloaded or lookup attempted, then try to resolve the ID - # and download - logger.info(f"Start lookup process for genome ID {raw_genome_id}") - gs_obj.resolved_refseq_id = _resolve_refseq_id(genome_id_data) - gs_obj.resolve_attempted = True - - if gs_obj.resolved_refseq_id == "": - # give up on this one - logger.warning(f"Failed lookup for genome ID {raw_genome_id}") + # resolve genome ID + try: + get_genome_assembly_accession(gs, genome_record["genome_ID"]) + except Exception as e: + logger.warning(f"Failed to resolve genome ID {gs.original_id}. Error: {e}") continue - # if resolved id is valid, try to download and extract antismash data + # retrieve antismash BGC data from antiSMASH-DB try: - download_and_extract_antismash_data( - gs_obj.resolved_refseq_id, project_download_root, project_extract_root - ) - - gs_obj.bgc_path = str( - Path(project_download_root, gs_obj.resolved_refseq_id + ".zip").absolute() - ) - - output_path = Path(project_extract_root, "antismash", gs_obj.resolved_refseq_id) - if output_path.exists(): - Path.touch(output_path / "completed", exist_ok=True) - + retrieve_antismash_db_data(gs, project_download_root, project_extract_root) except Exception: - gs_obj.bgc_path = "" + continue # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] @@ -247,6 +232,67 @@ def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | Non return best_id +def get_genome_assembly_accession( + genome_status: GenomeStatus, genome_id_data: Mapping[str, str] +) -> None: + """Resolve and update the genome assembly accession for a given genome status. 
+ + This function attempts to resolve the RefSeq ID for the provided genome record + and updates the `genome_status` object with the resolved ID. It also sets the + `resolve_attempted` flag to `True` to indicate that an attempt to resolve the + RefSeq ID has been made. If the resolution fails, raises a RuntimeError and leaves + the `resolved_refseq_id` empty. + + Args: + genome_status (GenomeStatus): An object representing the status of the genome, + which will be updated with the resolved RefSeq ID. + genome_id_data (Mapping[str, str]): A dictionary containing genome + information, where keys like "RefSeq_accession", "GenBank_accession", + or "JGI_Genome_ID" are used to resolve the RefSeq ID. + + Warnings: + Logs a warning if the RefSeq ID cannot be resolved. + """ + genome_status.resolved_refseq_id = _resolve_refseq_id(genome_id_data) + genome_status.resolve_attempted = True + + if genome_status.resolved_refseq_id == "": + raise RuntimeError("Failed to get genome assembly accession") + + +def retrieve_antismash_db_data( + genome_status: GenomeStatus, download_root: str | PathLike, extract_root: str | PathLike +) -> None: + """Retrieve antiSMASH database data for a given genome and update its status. + + This function downloads and extracts antiSMASH data for a genome identified + by its resolved RefSeq ID. It updates the `genome_status` object with the + path to the downloaded data or sets it to an empty string if an error occurs. + + Args: + genome_status (GenomeStatus): An object representing the genome's status, + including its resolved RefSeq ID and BGC path. + download_root (str | PathLike): The root directory where the antiSMASH + data will be downloaded. + extract_root (str | PathLike): The root directory where the antiSMASH + data will be extracted. + + Raises: + Exception: If an error occurs during the download or extraction process. 
+ """ + antismash_id = genome_status.resolved_refseq_id + extract_path = Path(extract_root, "antismash", antismash_id) + download_path = Path(download_root, f"{antismash_id}.zip").absolute() + + try: + download_and_extract_antismash_data(antismash_id, download_root, extract_root) + Path.touch(extract_path / "completed", exist_ok=True) + genome_status.bgc_path = str(download_path) + except Exception as e: + genome_status.bgc_path = "" + raise e + + def _resolve_genbank_accession(genbank_id: str) -> str: """Try to get RefSeq assembly id through given GenBank assembly id. From 706f8419b32c0929f721b8b0c59feec8fa3bba64 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 19:18:06 +0100 Subject: [PATCH 06/48] Refactor: Improve genome ID handling and logging --- .../antismash/podp_antismash_downloader.py | 36 +++++++++---------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 5b9ded97..744025a2 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -153,30 +153,26 @@ def podp_download_and_extract_antismash_data( gs_dict = GenomeStatus.read_json(gs_file) for i, genome_record in enumerate(genome_records): - # get the best available ID from the dict - genome_id_data = genome_record["genome_ID"] - raw_genome_id = get_best_available_genome_id(genome_id_data) - if raw_genome_id is None or len(raw_genome_id) == 0: - logger.warning(f'Invalid input genome record "{genome_record}"') - continue + logger.info( + f"Getting antismash BGC data for genome record {i + 1} of {len(genome_records)}." 
+ ) - # check if genome ID exist in the genome status file - if raw_genome_id not in gs_dict: - gs_dict[raw_genome_id] = GenomeStatus(raw_genome_id) + # get the best available genome ID from the dict + original_genome_id = get_best_available_genome_id(genome_record["genome_ID"]) + if not original_genome_id: + logger.warning(f"Skipping invalid genome record: {genome_record}") + continue - gs_obj = gs_dict[raw_genome_id] + # Retrieve or initialize the GenomeStatus object for the genome ID + gs = gs_dict.setdefault(original_genome_id, GenomeStatus(original_genome_id)) - logger.info( - f"Checking for antismash data {i + 1}/{len(genome_records)}, " - f"current genome ID={raw_genome_id}" - ) - # first, check if BGC data is downloaded - if gs_obj.bgc_path and Path(gs_obj.bgc_path).exists(): - logger.info(f"Genome ID {raw_genome_id} already downloaded to {gs_obj.bgc_path}") + # Skip genome if BGC data is downloaded + if gs.bgc_path and Path(gs.bgc_path).exists(): + logger.info(f"Genome ID {original_genome_id} already downloaded to {gs.bgc_path}") continue - # second, check if lookup attempted previously - if gs_obj.resolve_attempted: - logger.info(f"Genome ID {raw_genome_id} skipped due to previous failed attempt") + # Skip genome if lookup attempted previously + if gs.resolve_attempted: + logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt") continue # resolve genome ID From b25506e7d9501df7245a152b6094411364ef4213 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 19:20:18 +0100 Subject: [PATCH 07/48] Refactor: Move logging for antiSMASH data retrieval errors and success messages --- src/nplinker/genomics/antismash/antismash_downloader.py | 3 --- .../genomics/antismash/podp_antismash_downloader.py | 7 +++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index f2397ea8..07662107 100644 --- 
a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -54,11 +54,8 @@ def download_and_extract_antismash_data( _cleanup_extracted_files(extract_path) - logger.info("antiSMASH BGC data of %s is downloaded and extracted.", antismash_id) - except Exception as e: shutil.rmtree(extract_path) - logger.warning(e) raise e diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 744025a2..04189f87 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -185,8 +185,11 @@ def podp_download_and_extract_antismash_data( # retrieve antismash BGC data from antiSMASH-DB try: retrieve_antismash_db_data(gs, project_download_root, project_extract_root) - except Exception: - continue + logger.info(f"antiSMASH BGC data for {gs.original_id} is downloaded and extracted") + except Exception as e: + logger.warning( + f"Failed to retrieve BGC data from antiSMASH-DB for {gs.original_id}. 
Error: {e}" + ) # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] From 2ed8ac6cf158cc2721c0caa7fb01e70914abe3ad Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 19:29:05 +0100 Subject: [PATCH 08/48] simplify comment --- src/nplinker/genomics/antismash/podp_antismash_downloader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 04189f87..0cdea2fe 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -166,11 +166,10 @@ def podp_download_and_extract_antismash_data( # Retrieve or initialize the GenomeStatus object for the genome ID gs = gs_dict.setdefault(original_genome_id, GenomeStatus(original_genome_id)) - # Skip genome if BGC data is downloaded + # Skip genomes if gs.bgc_path and Path(gs.bgc_path).exists(): logger.info(f"Genome ID {original_genome_id} already downloaded to {gs.bgc_path}") continue - # Skip genome if lookup attempted previously if gs.resolve_attempted: logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt") continue From a1e49620a2da8ab7cd00c3864b8e77e3c54e7337 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 19:30:48 +0100 Subject: [PATCH 09/48] Enhance logging messages in antiSMASH data retrieval --- .../genomics/antismash/podp_antismash_downloader.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 0cdea2fe..aadf1eb4 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -162,13 +162,15 @@ def 
podp_download_and_extract_antismash_data( if not original_genome_id: logger.warning(f"Skipping invalid genome record: {genome_record}") continue - # Retrieve or initialize the GenomeStatus object for the genome ID gs = gs_dict.setdefault(original_genome_id, GenomeStatus(original_genome_id)) # Skip genomes if gs.bgc_path and Path(gs.bgc_path).exists(): - logger.info(f"Genome ID {original_genome_id} already downloaded to {gs.bgc_path}") + logger.info( + f"antiSMASH BGC data for genome ID {original_genome_id} already downloaded to " + f"{gs.bgc_path}" + ) continue if gs.resolve_attempted: logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt") @@ -184,7 +186,9 @@ def podp_download_and_extract_antismash_data( # retrieve antismash BGC data from antiSMASH-DB try: retrieve_antismash_db_data(gs, project_download_root, project_extract_root) - logger.info(f"antiSMASH BGC data for {gs.original_id} is downloaded and extracted") + logger.info( + f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" + ) except Exception as e: logger.warning( f"Failed to retrieve BGC data from antiSMASH-DB for {gs.original_id}. 
Error: {e}" From e8dd391e56ea97b138648e755247823925b3f61e Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 18 Mar 2025 19:38:55 +0100 Subject: [PATCH 10/48] test: adapt test to changed logging info message --- tests/unit/genomics/test_podp_antismash_downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/genomics/test_podp_antismash_downloader.py b/tests/unit/genomics/test_podp_antismash_downloader.py index 7fb9ec0c..302d354d 100644 --- a/tests/unit/genomics/test_podp_antismash_downloader.py +++ b/tests/unit/genomics/test_podp_antismash_downloader.py @@ -218,7 +218,7 @@ def test_caching(download_root, extract_root, genome_status_file, caplog): assert genome_obj.resolve_attempted podp_download_and_extract_antismash_data(genome_records, download_root, extract_root) assert ( - f"Genome ID {genome_obj.original_id} already downloaded to {genome_obj.bgc_path}" + f"antiSMASH BGC data for genome ID {genome_obj.original_id} already downloaded to {genome_obj.bgc_path}" in caplog.text ) assert ( From ea159ca824a5d8bf701c2fb9ab03c3ededff0a59 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 12:06:02 +0100 Subject: [PATCH 11/48] Feat: Add antiSMASH API functionality --- src/nplinker/genomics/antismash/__init__.py | 12 +- .../antismash/antismash_api_client.py | 97 ++++++++++++++++ .../antismash/antismash_downloader.py | 97 ++++++++++++---- .../genomics/antismash/ncbi_downloader.py | 107 ++++++++++++++++++ .../antismash/podp_antismash_downloader.py | 72 ++++++++++-- .../genomics/test_antismash_downloader.py | 10 +- 6 files changed, 362 insertions(+), 33 deletions(-) create mode 100644 src/nplinker/genomics/antismash/antismash_api_client.py create mode 100644 src/nplinker/genomics/antismash/ncbi_downloader.py diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index e126f548..4817e806 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ 
b/src/nplinker/genomics/antismash/__init__.py @@ -1,16 +1,24 @@ -from .antismash_downloader import download_and_extract_antismash_data +from .antismash_api_client import antismash_job_is_done +from .antismash_api_client import submit_antismash_job +from .antismash_downloader import download_and_extract_from_antismash_api +from .antismash_downloader import download_and_extract_from_antismash_db from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank +from .ncbi_downloader import download_and_extract_ncbi_genome from .podp_antismash_downloader import GenomeStatus from .podp_antismash_downloader import get_best_available_genome_id from .podp_antismash_downloader import podp_download_and_extract_antismash_data __all__ = [ - "download_and_extract_antismash_data", + "download_and_extract_from_antismash_api", + "download_and_extract_from_antismash_db", "AntismashBGCLoader", "parse_bgc_genbank", "GenomeStatus", "get_best_available_genome_id", "podp_download_and_extract_antismash_data", + "download_and_extract_ncbi_genome", + "submit_antismash_job", + "antismash_job_is_done", ] diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py new file mode 100644 index 00000000..32b932c0 --- /dev/null +++ b/src/nplinker/genomics/antismash/antismash_api_client.py @@ -0,0 +1,97 @@ +from __future__ import annotations +import logging +from os import PathLike +from pathlib import Path +from typing import Optional +import requests + + +logger = logging.getLogger(__name__) + + +def submit_antismash_job(genbank_filepath: str | PathLike) -> Optional[str]: + """Submits an antiSMASH job using the provided GenBank file. + + This function sends a GenBank file to the antiSMASH API + and retrieves the job ID if the submission is successful. + + Args: + genbank_filepath (str | PathLike): The path to the GenBank file to be submitted. 
+ + Returns: + Optional[str]: The job ID if the submission is successful, or None if it fails. + + Raises: + requests.exceptions.RequestException: If there is an issue with the HTTP request. + RuntimeError: If the API response does not contain a job ID. + """ + url = "https://antismash.secondarymetabolites.org/api/v1.0/submit" + genbank_filepath = Path(genbank_filepath) + + with open(genbank_filepath, "rb") as file: + files = {"seq": file} + response = requests.post(url, files=files) + response.raise_for_status() # Raise an exception for HTTP errors + + data = response.json() + if "id" not in data: + raise RuntimeError("No antiSMASH job ID returned") + return data["id"] + + +def query_antismash_job(job_id: str) -> Optional[dict]: + """Gets the status of an antiSMASH job. + + Args: + job_id (str): The job ID to query. + + Returns: + dict: The response JSON if successful, otherwise None. + """ + url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}" + + try: + response = requests.get(url, timeout=10) + response.raise_for_status() # Raise an exception for HTTP errors + return response.json() + + except requests.exceptions.RequestException as req_err: + logger.error(f"Request failed for job_id {job_id}: {req_err}") + except ValueError as json_err: # Handles JSON decoding errors + logger.error(f"Invalid JSON response for job_id {job_id}: {json_err}") + except Exception as err: + logger.error(f"Unexpected error while getting job state for job_id {job_id}: {err}") + + +def antismash_job_is_done(job_id: str) -> bool: + """Checks if the antiSMASH job is complete by polling the job status. + + Args: + job_id (str): The job ID to query. + + Returns: + bool: True if the job is done, False if the job is still running. + + Raises: + RuntimeError: If the job status could not be retrieved or if the job failed. + ValueError: If the job state is missing or unexpected in the response. 
+ """ + response = query_antismash_job(job_id) + + if response is None: + raise RuntimeError(f"Failed to retrieve job status for job_id {job_id}") + if "state" not in response: + raise ValueError(f"Job state missing in response for job_id: {job_id}") + + job_state = response["state"] + if job_state in ("running", "queued"): + return False + if job_state == "done": + return True + if job_state == "failed": + job_status = response.get("status", "No error message provided") + raise RuntimeError(f"AntiSMASH job {job_id} failed with an error: {job_status}") + else: + raise ValueError( + f"Unexpected job state for antismash job ID {job_id}. Job state: {job_state}" + ) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 07662107..928001c9 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -4,6 +4,7 @@ import shutil from os import PathLike from pathlib import Path +import requests from nplinker.utils import download_and_extract_archive from nplinker.utils import list_dirs from nplinker.utils import list_files @@ -15,50 +16,108 @@ ANTISMASH_DB_DOWNLOAD_URL = "https://antismash-db.secondarymetabolites.org/output/{}/{}" # The antiSMASH DBV2 is for the availability of the old version, better to keep it. ANTISMASH_DBV2_DOWNLOAD_URL = "https://antismash-dbv2.secondarymetabolites.org/output/{}/{}" +# antismash api to download results from submitted jobs +ANTISMASH_API_DOWNLOAD_URL = "https://antismash.secondarymetabolites.org/upload/{}/{}" def download_and_extract_antismash_data( - antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike + url: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike ) -> None: """Download and extract antiSMASH BGC archive for a specified genome. 
- The antiSMASH database (https://antismash-db.secondarymetabolites.org/) - is used to download the BGC archive. And antiSMASH use RefSeq assembly id - of a genome as the id of the archive. + This function downloads a BGC archive from the specified URL, extracts its contents, + and organizes the extracted files into a structured directory under the given `extract_root`. Args: - antismash_id: The id used to download BGC archive from antiSMASH database. - If the id is versioned (e.g., "GCF_004339725.1") please be sure to - specify the version as well. - download_root: Path to the directory to place downloaded archive in. - extract_root: Path to the directory data files will be extracted to. + url (str): The URL to download the BGC archive from. + antismash_id (str): The identifier for the antiSMASH genome, used to name the extraction directory. + download_root: Path to the directory where the downloaded archive will be stored. + extract_root: Path to the directory where the data files will be extracted. Note that an `antismash` directory will be created in the specified `extract_root` if it doesn't exist. The files will be extracted to `/antismash/` directory. Raises: - ValueError: if `/antismash/` dir is not empty. + ValueError: if `/antismash/` dir is not empty. + Exception: If any error occurs during the download or extraction process, the partially extracted + directory will be cleaned up, and the exception will be re-raised. 
Examples: - >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted") + >>> download_and_extract_antismash_data( + "https://antismash-db.secondarymetabolites.org/output/GCF_001.1/GCF_001.1.zip", + "GCF_001.1", + "/data/download", + "/data/extracted" + ) """ - download_root = Path(download_root) extract_path = Path(extract_root) / "antismash" / antismash_id _prepare_extract_path(extract_path) - try: - for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: - url = base_url.format(antismash_id, antismash_id + ".zip") - download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip") - break - + download_and_extract_archive(url, download_root, extract_path, f"{antismash_id}.zip") _cleanup_extracted_files(extract_path) - except Exception as e: shutil.rmtree(extract_path) raise e +def download_and_extract_from_antismash_api( + job_id: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike +) -> None: + """Downloads and extracts results from an antiSMASH API job. + + This function constructs the download URL using the provided job ID then + downloads the results as a ZIP file and extracts its contents to the specified directories. + + Args: + antismash_id (str): The unique identifier for the antiSMASH dataset. + job_id (str): The job ID for the antiSMASH API job. + download_root (str or PathLike): The root directory where the ZIP file will be downloaded. + extract_root (str or PathLike): The root directory where the contents of the ZIP file will be extracted. + + Raises: + requests.exceptions.RequestException: If there is an issue with the HTTP request. + zipfile.BadZipFile: If the downloaded file is not a valid ZIP file. + OSError: If there is an issue with file operations such as writing or extracting. 
+ """ + url = ANTISMASH_API_DOWNLOAD_URL.format(job_id, antismash_id + ".zip") + download_and_extract_antismash_data(url, antismash_id, download_root, extract_root) + + +def download_and_extract_from_antismash_db( + refseq_acc: str, download_root: str | PathLike, extract_root: str | PathLike +) -> None: + """Download and extract antiSMASH BGC archive for a specified genome. + + The antiSMASH database (https://antismash-db.secondarymetabolites.org/) + is used to download the BGC archive. And antiSMASH use RefSeq assembly id + of a genome as the id of the archive. + + Args: + refseq_acc: The id used to download BGC archive from antiSMASH database. + If the id is versioned (e.g., "GCF_004339725.1") please be sure to + specify the version as well. + download_root: Path to the directory to place downloaded archive in. + extract_root: Path to the directory data files will be extracted to. + Note that an `antismash` directory will be created in the specified `extract_root` if + it doesn't exist. The files will be extracted to `/antismash/` directory. + + Raises: + ValueError: if `/antismash/` dir is not empty. 
+ + Examples: + >>> download_and_extract_from_antismash_db("GCF_004339725.1", "/data/download", "/data/extracted") + """ + for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: + url = base_url.format(refseq_acc, f"{refseq_acc}.zip") + if requests.head(url).status_code == 404: # not found + continue + download_and_extract_antismash_data(url, refseq_acc, download_root, extract_root) + return # Exit the loop once a valid URL is processed + + # if both urls give 404 not found + raise RuntimeError(f"No results in antiSMASH DB for {refseq_acc}") + + def _check_extract_path(extract_path: Path): # check if extract_path is empty if any(extract_path.iterdir()): diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py new file mode 100644 index 00000000..b0b333d5 --- /dev/null +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -0,0 +1,107 @@ +import logging +import os +import shutil +import time +from os import PathLike +from pathlib import Path +from typing import Optional +from nplinker.utils import check_md5 +from nplinker.utils import download_url +from nplinker.utils import extract_archive + + +logger = logging.getLogger(__name__) + + +def download_and_extract_ncbi_genome( + refseq_id: str, + download_root: str | PathLike, + extract_root: str | PathLike, + max_attempts: int = 10, +) -> Optional[Path]: + """Downloads and extracts an NCBI dataset for a given genome refseq ID. + + This function attempts to download a dataset from the NCBI database using + the provided refseq ID. It retries the download process up to a maximum + number of times if any errors occur. The function verifies the integrity + of the downloaded files using MD5 checksums and moves and renames the + GenBank files upon successful verification. + + Args: + refseq_id (str): The refseq ID for the dataset to be downloaded. + download_root (str or Path): The root directory where the dataset will be downloaded. 
+ extract_root (str or Path): The root directory where the dataset will be extracted. + max_attempts (int): The maximum number of times to attempt downloading. + + Returns: + Path: The path to the extracted dataset if successful, otherwise None. + + Raises: + Exception: If the maximum number of retries is reached and the dataset could + not be successfully downloaded and extracted. + """ + url = ( + "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/" + f"{refseq_id}/download?include_annotation_type=GENOME_GB" + ) + + download_root = Path(download_root) + extract_path = Path(extract_root) / "ncbi_genomes" + filename = f"ncbi_{refseq_id}.zip" + + extract_path.mkdir(parents=True, exist_ok=True) + + for attempt in range(1, max_attempts + 1): + try: + download_url(url, download_root, filename) + archive = download_root / filename + break + except Exception as e: + logger.warning(f"Attempt {attempt}/{max_attempts} failed to download {url}. Error: {e}") + if attempt < max_attempts: + time.sleep(2) + else: + raise RuntimeError( + f"Maximum download retries ({max_attempts}) reached for {url}. Download failed." + ) + + extract_archive(archive, extract_path) + verify_ncbi_dataset_md5_sums(extract_path) + + # Move and rename GenBank file + genbank_path = extract_path / "ncbi_dataset" / "data" / refseq_id / "genomic.gbff" + new_genbank_path = extract_path / f"{refseq_id}.gbff" + genbank_path.rename(new_genbank_path) + + # Delete unnecessary files + shutil.rmtree(extract_path / "ncbi_dataset") + os.remove(extract_path / "md5sum.txt") + os.remove(extract_path / "README.md") + + return new_genbank_path + + +def verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: + """Verify the integrity of files in a specified directory using MD5 checksums. + + This function reads an "md5sum.txt" file located in the given extraction path, + which contains MD5 checksums and corresponding file names. 
It then computes + the MD5 checksum for each file and compares it with the expected value. If any + file's checksum does not match, a `ValueError` is raised. + + Args: + extract_path (PathLike): Path to the directory containing the files and + the "md5sum.txt" file. + + Returns: + bool: True if all files pass the MD5 checksum verification. + + Raises: + ValueError: If the MD5 checksum of any file does not match the expected value. + """ + with open(extract_path / "md5sum.txt", "r") as f: + for line in f: + md5sum, file_name = line.strip().split() + file_path = extract_path / file_name + if not check_md5(file_path, md5sum): + raise ValueError(f"MD5 checksum mismatch for {file_path}") diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index aadf1eb4..2d5f727c 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -2,6 +2,7 @@ import json import logging import re +import time import warnings from collections.abc import Mapping from collections.abc import Sequence @@ -11,7 +12,11 @@ from bs4 import BeautifulSoup from jsonschema import validate from nplinker.defaults import GENOME_STATUS_FILENAME -from nplinker.genomics.antismash import download_and_extract_antismash_data +from nplinker.genomics.antismash import antismash_job_is_done +from nplinker.genomics.antismash import download_and_extract_from_antismash_api +from nplinker.genomics.antismash import download_and_extract_from_antismash_db +from nplinker.genomics.antismash import download_and_extract_ncbi_genome +from nplinker.genomics.antismash import submit_antismash_job from nplinker.schemas import GENOME_STATUS_SCHEMA @@ -189,11 +194,33 @@ def podp_download_and_extract_antismash_data( logger.info( f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" ) + continue except Exception as e: logger.warning( f"Failed to 
retrieve BGC data from antiSMASH-DB for {gs.original_id}. Error: {e}" ) + # retrieve antismash BGC by submitting antismash job via API + try: + logger.info( + f"Downloading genome and submitting antiSMASH job for genome ID {gs.original_id}." + ) + genome_path = download_and_extract_ncbi_genome( + gs.resolved_refseq_id, project_download_root, project_extract_root + ) + job_id = submit_antismash_job(genome_path) + while antismash_job_is_done(job_id) is False: + time.sleep(15) + retrieve_antismash_job_data(job_id, gs, project_download_root, project_extract_root) + logger.info( + f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" + ) + continue + except Exception as e: + logger.warning( + f"Failed to retrieve BGC data by submitting a antiSMASH job for genome ID {gs.original_id}. Error: {e}" + ) + # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] if failed_ids: @@ -286,13 +313,42 @@ def retrieve_antismash_db_data( extract_path = Path(extract_root, "antismash", antismash_id) download_path = Path(download_root, f"{antismash_id}.zip").absolute() - try: - download_and_extract_antismash_data(antismash_id, download_root, extract_root) - Path.touch(extract_path / "completed", exist_ok=True) - genome_status.bgc_path = str(download_path) - except Exception as e: - genome_status.bgc_path = "" - raise e + download_and_extract_from_antismash_db(antismash_id, download_root, extract_root) + Path.touch(extract_path / "completed", exist_ok=True) + genome_status.bgc_path = str(download_path) + + +def retrieve_antismash_job_data( + job_id: str, + genome_status: GenomeStatus, + download_root: str | PathLike, + extract_root: str | PathLike, +) -> None: + """Retrieve antiSMASH API data for a given genome and update its status. + + This function downloads and extracts antiSMASH data for a genome identified + by its resolved RefSeq ID. 
It updates the `genome_status` object with the + path to the downloaded data or sets it to an empty string if an error occurs. + + Args: + job_id (str): The job ID for the antiSMASH API job. + genome_status (GenomeStatus): An object representing the genome's status, + including its resolved RefSeq ID and BGC path. + download_root (str | PathLike): The root directory where the antiSMASH + data will be downloaded. + extract_root (str | PathLike): The root directory where the antiSMASH + data will be extracted. + + Raises: + Exception: If an error occurs during the download or extraction process. + """ + antismash_id = genome_status.resolved_refseq_id + extract_path = Path(extract_root, "antismash", antismash_id) + download_path = Path(download_root, f"{antismash_id}.zip").absolute() + + download_and_extract_from_antismash_api(job_id, antismash_id, download_root, extract_root) + Path.touch(extract_path / "completed", exist_ok=True) + genome_status.bgc_path = str(download_path) def _resolve_genbank_accession(genbank_id: str) -> str: diff --git a/tests/unit/genomics/test_antismash_downloader.py b/tests/unit/genomics/test_antismash_downloader.py index 1dfeb4cf..651629c2 100644 --- a/tests/unit/genomics/test_antismash_downloader.py +++ b/tests/unit/genomics/test_antismash_downloader.py @@ -1,5 +1,5 @@ import pytest -from nplinker.genomics.antismash import download_and_extract_antismash_data +from nplinker.genomics.antismash import download_and_extract_from_antismash_db from nplinker.utils import extract_archive from nplinker.utils import list_files @@ -14,7 +14,7 @@ def test_default(self, tmp_path): extract_root.mkdir() original_extract_root = tmp_path / "original" original_extract_root.mkdir() - download_and_extract_antismash_data(self.antismash_id, download_root, extract_root) + download_and_extract_from_antismash_db(self.antismash_id, download_root, extract_root) archive = download_root / "GCF_004339725.1.zip" extracted_folder = extract_root / "antismash" / 
"GCF_004339725.1" extracted_files = list_files(extracted_folder, keep_parent=False) @@ -32,7 +32,9 @@ def test_error_nonempty_path(self, tmp_path): nonempty_path = tmp_path / "extracted" / "antismash" / f"{self.antismash_id}" / "subdir" nonempty_path.mkdir(parents=True) with pytest.raises(ValueError, match="Nonempty directory"): - download_and_extract_antismash_data(self.antismash_id, tmp_path, tmp_path / "extracted") + download_and_extract_from_antismash_db( + self.antismash_id, tmp_path, tmp_path / "extracted" + ) # test a non-existent ID, which can be either a fake ID, non-existent in NCBI # or a valid NCBI genome ID but it does not have BGC data in antismash database @@ -44,6 +46,6 @@ def test_nonexisting_id(self, tmp_path): extract_root.mkdir() for test_id in nonexisting_ids: with pytest.raises(RuntimeError): - download_and_extract_antismash_data(test_id, download_root, extract_root) + download_and_extract_from_antismash_db(test_id, download_root, extract_root) extracted_folder = extract_root / "antismash" / test_id assert not extracted_folder.exists() From a972a7001f07fc1ea7eb8924b4fbe71e71f523f6 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 12:06:02 +0100 Subject: [PATCH 12/48] Feat: Add antiSMASH API functionality --- src/nplinker/genomics/antismash/__init__.py | 12 +- .../antismash/antismash_api_client.py | 97 ++++++++++++++++ .../antismash/antismash_downloader.py | 97 ++++++++++++---- .../genomics/antismash/ncbi_downloader.py | 107 ++++++++++++++++++ .../antismash/podp_antismash_downloader.py | 72 ++++++++++-- .../genomics/test_antismash_downloader.py | 10 +- 6 files changed, 362 insertions(+), 33 deletions(-) create mode 100644 src/nplinker/genomics/antismash/antismash_api_client.py create mode 100644 src/nplinker/genomics/antismash/ncbi_downloader.py diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index e126f548..4817e806 100644 --- a/src/nplinker/genomics/antismash/__init__.py 
+++ b/src/nplinker/genomics/antismash/__init__.py @@ -1,16 +1,24 @@ -from .antismash_downloader import download_and_extract_antismash_data +from .antismash_api_client import antismash_job_is_done +from .antismash_api_client import submit_antismash_job +from .antismash_downloader import download_and_extract_from_antismash_api +from .antismash_downloader import download_and_extract_from_antismash_db from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank +from .ncbi_downloader import download_and_extract_ncbi_genome from .podp_antismash_downloader import GenomeStatus from .podp_antismash_downloader import get_best_available_genome_id from .podp_antismash_downloader import podp_download_and_extract_antismash_data __all__ = [ - "download_and_extract_antismash_data", + "download_and_extract_from_antismash_api", + "download_and_extract_from_antismash_db", "AntismashBGCLoader", "parse_bgc_genbank", "GenomeStatus", "get_best_available_genome_id", "podp_download_and_extract_antismash_data", + "download_and_extract_ncbi_genome", + "submit_antismash_job", + "antismash_job_is_done", ] diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py new file mode 100644 index 00000000..32b932c0 --- /dev/null +++ b/src/nplinker/genomics/antismash/antismash_api_client.py @@ -0,0 +1,97 @@ +from __future__ import annotations +import logging +from os import PathLike +from pathlib import Path +from typing import Optional +import requests + + +logger = logging.getLogger(__name__) + + +def submit_antismash_job(genbank_filepath: str | PathLike) -> Optional[str]: + """Submits an antiSMASH job using the provided GenBank file. + + This function sends a GenBank file to the antiSMASH API + and retrieves the job ID if the submission is successful. + + Args: + genbank_filepath (str | PathLike): The path to the GenBank file to be submitted. 
+ + Returns: + Optional[str]: The job ID if the submission is successful, or None if it fails. + + Raises: + requests.exceptions.RequestException: If there is an issue with the HTTP request. + RuntimeError: If the API response does not contain a job ID. + """ + url = "https://antismash.secondarymetabolites.org/api/v1.0/submit" + genbank_filepath = Path(genbank_filepath) + + with open(genbank_filepath, "rb") as file: + files = {"seq": file} + response = requests.post(url, files=files) + response.raise_for_status() # Raise an exception for HTTP errors + + data = response.json() + if "id" not in data: + raise RuntimeError("No antiSMASH job ID returned") + return data["id"] + + +def query_antismash_job(job_id: str) -> Optional[dict]: + """Gets the status of an antiSMASH job. + + Args: + job_id (str): The job ID to query. + + Returns: + dict: The response JSON if successful, otherwise None. + """ + url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}" + + try: + response = requests.get(url, timeout=10) + response.raise_for_status() # Raise an exception for HTTP errors + return response.json() + + except requests.exceptions.RequestException as req_err: + logger.error(f"Request failed for job_id {job_id}: {req_err}") + except ValueError as json_err: # Handles JSON decoding errors + logger.error(f"Invalid JSON response for job_id {job_id}: {json_err}") + except Exception as err: + logger.error(f"Unexpected error while getting job state for job_id {job_id}: {err}") + + +def antismash_job_is_done(job_id: str) -> bool: + """Checks if the antiSMASH job is complete by polling the job status. + + Args: + job_id (str): The job ID to query. + + Returns: + bool: True if the job is done, False if the job is still running. + + Raises: + RuntimeError: If the job status could not be retrieved or if the job failed. + ValueError: If the job state is missing or unexpected in the response. 
+ """ + response = query_antismash_job(job_id) + + if response is None: + raise RuntimeError(f"Failed to retrieve job status for job_id {job_id}") + if "state" not in response: + raise ValueError(f"Job state missing in response for job_id: {job_id}") + + job_state = response["state"] + if job_state in ("running", "queued"): + return False + if job_state == "done": + return True + if job_state == "failed": + job_status = response.get("status", "No error message provided") + raise RuntimeError(f"AntiSMASH job {job_id} failed with an error: {job_status}") + else: + raise ValueError( + f"Unexpected job state for antismash job ID {job_id}. Job state: {job_state}" + ) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 07662107..928001c9 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -4,6 +4,7 @@ import shutil from os import PathLike from pathlib import Path +import requests from nplinker.utils import download_and_extract_archive from nplinker.utils import list_dirs from nplinker.utils import list_files @@ -15,50 +16,108 @@ ANTISMASH_DB_DOWNLOAD_URL = "https://antismash-db.secondarymetabolites.org/output/{}/{}" # The antiSMASH DBV2 is for the availability of the old version, better to keep it. ANTISMASH_DBV2_DOWNLOAD_URL = "https://antismash-dbv2.secondarymetabolites.org/output/{}/{}" +# antismash api to download results from submitted jobs +ANTISMASH_API_DOWNLOAD_URL = "https://antismash.secondarymetabolites.org/upload/{}/{}" def download_and_extract_antismash_data( - antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike + url: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike ) -> None: """Download and extract antiSMASH BGC archive for a specified genome. 
- The antiSMASH database (https://antismash-db.secondarymetabolites.org/) - is used to download the BGC archive. And antiSMASH use RefSeq assembly id - of a genome as the id of the archive. + This function downloads a BGC archive from the specified URL, extracts its contents, + and organizes the extracted files into a structured directory under the given `extract_root`. Args: - antismash_id: The id used to download BGC archive from antiSMASH database. - If the id is versioned (e.g., "GCF_004339725.1") please be sure to - specify the version as well. - download_root: Path to the directory to place downloaded archive in. - extract_root: Path to the directory data files will be extracted to. + url (str): The URL to download the BGC archive from. + antismash_id (str): The identifier for the antiSMASH genome, used to name the extraction directory. + download_root: Path to the directory where the downloaded archive will be stored. + extract_root: Path to the directory where the data files will be extracted. Note that an `antismash` directory will be created in the specified `extract_root` if it doesn't exist. The files will be extracted to `/antismash/` directory. Raises: - ValueError: if `/antismash/` dir is not empty. + ValueError: if `/antismash/` dir is not empty. + Exception: If any error occurs during the download or extraction process, the partially extracted + directory will be cleaned up, and the exception will be re-raised. 
Examples: - >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted") + >>> download_and_extract_antismash_data( + "https://antismash-db.secondarymetabolites.org/output/GCF_001.1/GCF_001.1.zip", + "GCF_001.1", + "/data/download", + "/data/extracted" + ) """ - download_root = Path(download_root) extract_path = Path(extract_root) / "antismash" / antismash_id _prepare_extract_path(extract_path) - try: - for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: - url = base_url.format(antismash_id, antismash_id + ".zip") - download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip") - break - + download_and_extract_archive(url, download_root, extract_path, f"{antismash_id}.zip") _cleanup_extracted_files(extract_path) - except Exception as e: shutil.rmtree(extract_path) raise e +def download_and_extract_from_antismash_api( + job_id: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike +) -> None: + """Downloads and extracts results from an antiSMASH API job. + + This function constructs the download URL using the provided job ID then + downloads the results as a ZIP file and extracts its contents to the specified directories. + + Args: + antismash_id (str): The unique identifier for the antiSMASH dataset. + job_id (str): The job ID for the antiSMASH API job. + download_root (str or PathLike): The root directory where the ZIP file will be downloaded. + extract_root (str or PathLike): The root directory where the contents of the ZIP file will be extracted. + + Raises: + requests.exceptions.RequestException: If there is an issue with the HTTP request. + zipfile.BadZipFile: If the downloaded file is not a valid ZIP file. + OSError: If there is an issue with file operations such as writing or extracting. 
+ """ + url = ANTISMASH_API_DOWNLOAD_URL.format(job_id, antismash_id + ".zip") + download_and_extract_antismash_data(url, antismash_id, download_root, extract_root) + + +def download_and_extract_from_antismash_db( + refseq_acc: str, download_root: str | PathLike, extract_root: str | PathLike +) -> None: + """Download and extract antiSMASH BGC archive for a specified genome. + + The antiSMASH database (https://antismash-db.secondarymetabolites.org/) + is used to download the BGC archive. And antiSMASH use RefSeq assembly id + of a genome as the id of the archive. + + Args: + refseq_acc: The id used to download BGC archive from antiSMASH database. + If the id is versioned (e.g., "GCF_004339725.1") please be sure to + specify the version as well. + download_root: Path to the directory to place downloaded archive in. + extract_root: Path to the directory data files will be extracted to. + Note that an `antismash` directory will be created in the specified `extract_root` if + it doesn't exist. The files will be extracted to `/antismash/` directory. + + Raises: + ValueError: if `/antismash/` dir is not empty. 
+ + Examples: + >>> download_and_extract_from_antismash_db("GCF_004339725.1", "/data/download", "/data/extracted") + """ + for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: + url = base_url.format(refseq_acc, f"{refseq_acc}.zip") + if requests.head(url).status_code == 404: # not found + continue + download_and_extract_antismash_data(url, refseq_acc, download_root, extract_root) + return # Exit the loop once a valid URL is processed + + # if both urls give 404 not found + raise RuntimeError(f"No results in antiSMASH DB for {refseq_acc}") + + def _check_extract_path(extract_path: Path): # check if extract_path is empty if any(extract_path.iterdir()): diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py new file mode 100644 index 00000000..b0b333d5 --- /dev/null +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -0,0 +1,107 @@ +import logging +import os +import shutil +import time +from os import PathLike +from pathlib import Path +from typing import Optional +from nplinker.utils import check_md5 +from nplinker.utils import download_url +from nplinker.utils import extract_archive + + +logger = logging.getLogger(__name__) + + +def download_and_extract_ncbi_genome( + refseq_id: str, + download_root: str | PathLike, + extract_root: str | PathLike, + max_attempts: int = 10, +) -> Optional[Path]: + """Downloads and extracts an NCBI dataset for a given genome refseq ID. + + This function attempts to download a dataset from the NCBI database using + the provided refseq ID. It retries the download process up to a maximum + number of times if any errors occur. The function verifies the integrity + of the downloaded files using MD5 checksums and moves and renames the + GenBank files upon successful verification. + + Args: + refseq_id (str): The refseq ID for the dataset to be downloaded. + download_root (str or Path): The root directory where the dataset will be downloaded. 
+ extract_root (str or Path): The root directory where the dataset will be extracted. + max_attempts (int): The maximum number of times to attempt downloading. + + Returns: + Path: The path to the extracted dataset if successful, otherwise None. + + Raises: + Exception: If the maximum number of retries is reached and the dataset could + not be successfully downloaded and extracted. + """ + url = ( + "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/" + f"{refseq_id}/download?include_annotation_type=GENOME_GB" + ) + + download_root = Path(download_root) + extract_path = Path(extract_root) / "ncbi_genomes" + filename = f"ncbi_{refseq_id}.zip" + + extract_path.mkdir(parents=True, exist_ok=True) + + for attempt in range(1, max_attempts + 1): + try: + download_url(url, download_root, filename) + archive = download_root / filename + break + except Exception as e: + logger.warning(f"Attempt {attempt}/{max_attempts} failed to download {url}. Error: {e}") + if attempt < max_attempts: + time.sleep(2) + else: + raise RuntimeError( + f"Maximum download retries ({max_attempts}) reached for {url}. Download failed." + ) + + extract_archive(archive, extract_path) + verify_ncbi_dataset_md5_sums(extract_path) + + # Move and rename GenBank file + genbank_path = extract_path / "ncbi_dataset" / "data" / refseq_id / "genomic.gbff" + new_genbank_path = extract_path / f"{refseq_id}.gbff" + genbank_path.rename(new_genbank_path) + + # Delete unnecessary files + shutil.rmtree(extract_path / "ncbi_dataset") + os.remove(extract_path / "md5sum.txt") + os.remove(extract_path / "README.md") + + return new_genbank_path + + +def verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: + """Verify the integrity of files in a specified directory using MD5 checksums. + + This function reads an "md5sum.txt" file located in the given extraction path, + which contains MD5 checksums and corresponding file names. 
It then computes + the MD5 checksum for each file and compares it with the expected value. If any + file's checksum does not match, a `ValueError` is raised. + + Args: + extract_path (PathLike): Path to the directory containing the files and + the "md5sum.txt" file. + + Returns: + bool: True if all files pass the MD5 checksum verification. + + Raises: + ValueError: If the MD5 checksum of any file does not match the expected value. + """ + with open(extract_path / "md5sum.txt", "r") as f: + for line in f: + md5sum, file_name = line.strip().split() + file_path = extract_path / file_name + if not check_md5(file_path, md5sum): + raise ValueError(f"MD5 checksum mismatch for {file_path}") diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index aadf1eb4..2d5f727c 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -2,6 +2,7 @@ import json import logging import re +import time import warnings from collections.abc import Mapping from collections.abc import Sequence @@ -11,7 +12,11 @@ from bs4 import BeautifulSoup from jsonschema import validate from nplinker.defaults import GENOME_STATUS_FILENAME -from nplinker.genomics.antismash import download_and_extract_antismash_data +from nplinker.genomics.antismash import antismash_job_is_done +from nplinker.genomics.antismash import download_and_extract_from_antismash_api +from nplinker.genomics.antismash import download_and_extract_from_antismash_db +from nplinker.genomics.antismash import download_and_extract_ncbi_genome +from nplinker.genomics.antismash import submit_antismash_job from nplinker.schemas import GENOME_STATUS_SCHEMA @@ -189,11 +194,33 @@ def podp_download_and_extract_antismash_data( logger.info( f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" ) + continue except Exception as e: logger.warning( f"Failed to 
retrieve BGC data from antiSMASH-DB for {gs.original_id}. Error: {e}" ) + # retrieve antismash BGC by submitting antismash job via API + try: + logger.info( + f"Downloading genome and submitting antiSMASH job for genome ID {gs.original_id}." + ) + genome_path = download_and_extract_ncbi_genome( + gs.resolved_refseq_id, project_download_root, project_extract_root + ) + job_id = submit_antismash_job(genome_path) + while antismash_job_is_done(job_id) is False: + time.sleep(15) + retrieve_antismash_job_data(job_id, gs, project_download_root, project_extract_root) + logger.info( + f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" + ) + continue + except Exception as e: + logger.warning( + f"Failed to retrieve BGC data by submitting a antiSMASH job for genome ID {gs.original_id}. Error: {e}" + ) + # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] if failed_ids: @@ -286,13 +313,42 @@ def retrieve_antismash_db_data( extract_path = Path(extract_root, "antismash", antismash_id) download_path = Path(download_root, f"{antismash_id}.zip").absolute() - try: - download_and_extract_antismash_data(antismash_id, download_root, extract_root) - Path.touch(extract_path / "completed", exist_ok=True) - genome_status.bgc_path = str(download_path) - except Exception as e: - genome_status.bgc_path = "" - raise e + download_and_extract_from_antismash_db(antismash_id, download_root, extract_root) + Path.touch(extract_path / "completed", exist_ok=True) + genome_status.bgc_path = str(download_path) + + +def retrieve_antismash_job_data( + job_id: str, + genome_status: GenomeStatus, + download_root: str | PathLike, + extract_root: str | PathLike, +) -> None: + """Retrieve antiSMASH API data for a given genome and update its status. + + This function downloads and extracts antiSMASH data for a genome identified + by its resolved RefSeq ID. 
It updates the `genome_status` object with the + path to the downloaded data or sets it to an empty string if an error occurs. + + Args: + job_id (str): The job ID for the antiSMASH API job. + genome_status (GenomeStatus): An object representing the genome's status, + including its resolved RefSeq ID and BGC path. + download_root (str | PathLike): The root directory where the antiSMASH + data will be downloaded. + extract_root (str | PathLike): The root directory where the antiSMASH + data will be extracted. + + Raises: + Exception: If an error occurs during the download or extraction process. + """ + antismash_id = genome_status.resolved_refseq_id + extract_path = Path(extract_root, "antismash", antismash_id) + download_path = Path(download_root, f"{antismash_id}.zip").absolute() + + download_and_extract_from_antismash_api(job_id, antismash_id, download_root, extract_root) + Path.touch(extract_path / "completed", exist_ok=True) + genome_status.bgc_path = str(download_path) def _resolve_genbank_accession(genbank_id: str) -> str: diff --git a/tests/unit/genomics/test_antismash_downloader.py b/tests/unit/genomics/test_antismash_downloader.py index 1dfeb4cf..651629c2 100644 --- a/tests/unit/genomics/test_antismash_downloader.py +++ b/tests/unit/genomics/test_antismash_downloader.py @@ -1,5 +1,5 @@ import pytest -from nplinker.genomics.antismash import download_and_extract_antismash_data +from nplinker.genomics.antismash import download_and_extract_from_antismash_db from nplinker.utils import extract_archive from nplinker.utils import list_files @@ -14,7 +14,7 @@ def test_default(self, tmp_path): extract_root.mkdir() original_extract_root = tmp_path / "original" original_extract_root.mkdir() - download_and_extract_antismash_data(self.antismash_id, download_root, extract_root) + download_and_extract_from_antismash_db(self.antismash_id, download_root, extract_root) archive = download_root / "GCF_004339725.1.zip" extracted_folder = extract_root / "antismash" / 
"GCF_004339725.1" extracted_files = list_files(extracted_folder, keep_parent=False) @@ -32,7 +32,9 @@ def test_error_nonempty_path(self, tmp_path): nonempty_path = tmp_path / "extracted" / "antismash" / f"{self.antismash_id}" / "subdir" nonempty_path.mkdir(parents=True) with pytest.raises(ValueError, match="Nonempty directory"): - download_and_extract_antismash_data(self.antismash_id, tmp_path, tmp_path / "extracted") + download_and_extract_from_antismash_db( + self.antismash_id, tmp_path, tmp_path / "extracted" + ) # test a non-existent ID, which can be either a fake ID, non-existent in NCBI # or a valid NCBI genome ID but it does not have BGC data in antismash database @@ -44,6 +46,6 @@ def test_nonexisting_id(self, tmp_path): extract_root.mkdir() for test_id in nonexisting_ids: with pytest.raises(RuntimeError): - download_and_extract_antismash_data(test_id, download_root, extract_root) + download_and_extract_from_antismash_db(test_id, download_root, extract_root) extracted_folder = extract_root / "antismash" / test_id assert not extracted_folder.exists() From f3a159f3bcfbc1568058ecd3a6fda08838953439 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 14:27:18 +0100 Subject: [PATCH 13/48] fix: improve logging message for start of antiSMASH API process --- src/nplinker/genomics/antismash/podp_antismash_downloader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 2d5f727c..35f269cc 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -203,7 +203,8 @@ def podp_download_and_extract_antismash_data( # retrieve antismash BGC by submitting antismash job via API try: logger.info( - f"Downloading genome and submitting antiSMASH job for genome ID {gs.original_id}." 
+ "Downloading genome assembly from NCBI and submitting antiSMASH job for " + f"genome ID {gs.original_id}." ) genome_path = download_and_extract_ncbi_genome( gs.resolved_refseq_id, project_download_root, project_extract_root From da4da3c85d472c7e535068072da9cf5943b87410 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 14:30:58 +0100 Subject: [PATCH 14/48] docs: improve docstring for download_and_extract_ncbi_genome function --- .../genomics/antismash/ncbi_downloader.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index b0b333d5..a65a5b92 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -19,26 +19,27 @@ def download_and_extract_ncbi_genome( extract_root: str | PathLike, max_attempts: int = 10, ) -> Optional[Path]: - """Downloads and extracts an NCBI dataset for a given genome refseq ID. + """Downloads and extracts an NCBI dataset for a given genome RefSeq ID. - This function attempts to download a dataset from the NCBI database using - the provided refseq ID. It retries the download process up to a maximum - number of times if any errors occur. The function verifies the integrity - of the downloaded files using MD5 checksums and moves and renames the - GenBank files upon successful verification. + This function retrieves a dataset from the NCBI database using the provided + RefSeq ID. It retries the download process up to a specified maximum number + of attempts in case of errors. The function verifies the integrity of the + downloaded files using MD5 checksums, extracts the dataset, and renames the + GenBank file for easier access. Unnecessary files are removed after successful + processing. Args: - refseq_id (str): The refseq ID for the dataset to be downloaded. 
- download_root (str or Path): The root directory where the dataset will be downloaded. - extract_root (str or Path): The root directory where the dataset will be extracted. - max_attempts (int): The maximum number of times to attempt downloading. + refseq_id (str): The RefSeq ID for the dataset to be downloaded. + download_root (str | PathLike): The directory where the dataset will be downloaded. + extract_root (str | PathLike): The directory where the dataset will be extracted. + max_attempts (int): The maximum number of download attempts. Defaults to 10. Returns: - Path: The path to the extracted dataset if successful, otherwise None. + Optional[Path]: The path to the extracted GenBank file if successful, otherwise None. Raises: - Exception: If the maximum number of retries is reached and the dataset could - not be successfully downloaded and extracted. + RuntimeError: If the maximum number of retries is reached and the dataset + could not be successfully downloaded and extracted. """ url = ( "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/" From af094e4176c912fbc164ecbf92cd477faa20bbd3 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 14:39:45 +0100 Subject: [PATCH 15/48] fix: update logging messages for antiSMASH data retrieval failures --- .../genomics/antismash/podp_antismash_downloader.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 35f269cc..f96538e1 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -196,8 +196,9 @@ def podp_download_and_extract_antismash_data( ) continue except Exception as e: - logger.warning( - f"Failed to retrieve BGC data from antiSMASH-DB for {gs.original_id}. 
Error: {e}" + logger.info( + f"Unable to retrieve BGC data from antiSMASH-DB for genome ID {gs.original_id}. " + f"Error: {e}" ) # retrieve antismash BGC by submitting antismash job via API @@ -218,10 +219,14 @@ def podp_download_and_extract_antismash_data( ) continue except Exception as e: - logger.warning( - f"Failed to retrieve BGC data by submitting a antiSMASH job for genome ID {gs.original_id}. Error: {e}" + logger.info( + f"Unable to retrieve BGC data via antiSMASH API for genome ID {gs.original_id}. " + f"Error: {e}" ) + if gs.bgc_path == "": + logger.warning(f"Failed to retrieve BGC data for genome ID {gs.original_id}.") + # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] if failed_ids: From 99d99a25714ca7ad76301698c048018b6ac285d5 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 14:43:35 +0100 Subject: [PATCH 16/48] add logging after antiSMASH job submission --- src/nplinker/genomics/antismash/podp_antismash_downloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index f96538e1..efd7acc5 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -211,6 +211,7 @@ def podp_download_and_extract_antismash_data( gs.resolved_refseq_id, project_download_root, project_extract_root ) job_id = submit_antismash_job(genome_path) + logger.info(f"Waiting for antiSMASH job {job_id} to complete.") while antismash_job_is_done(job_id) is False: time.sleep(15) retrieve_antismash_job_data(job_id, gs, project_download_root, project_extract_root) From 1c8393479a9d27924be27086347fed048f152194 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 16:48:57 +0100 Subject: [PATCH 17/48] refactor: rename refseq_id to genome_assembly_acc --- 
src/nplinker/genomics/antismash/ncbi_downloader.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index a65a5b92..b6b5da0c 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -14,7 +14,7 @@ def download_and_extract_ncbi_genome( - refseq_id: str, + genome_assembly_acc: str, download_root: str | PathLike, extract_root: str | PathLike, max_attempts: int = 10, @@ -29,7 +29,7 @@ def download_and_extract_ncbi_genome( processing. Args: - refseq_id (str): The RefSeq ID for the dataset to be downloaded. + genome_assembly_acc (str): The NCBI accession of the genome assembly to be downloaded. download_root (str | PathLike): The directory where the dataset will be downloaded. extract_root (str | PathLike): The directory where the dataset will be extracted. max_attempts (int): The maximum number of download attempts. Defaults to 10. 
@@ -43,12 +43,12 @@ def download_and_extract_ncbi_genome( """ url = ( "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/" - f"{refseq_id}/download?include_annotation_type=GENOME_GB" + f"{genome_assembly_acc}/download?include_annotation_type=GENOME_GB" ) download_root = Path(download_root) extract_path = Path(extract_root) / "ncbi_genomes" - filename = f"ncbi_{refseq_id}.zip" + filename = f"ncbi_{genome_assembly_acc}.zip" extract_path.mkdir(parents=True, exist_ok=True) @@ -70,8 +70,8 @@ def download_and_extract_ncbi_genome( verify_ncbi_dataset_md5_sums(extract_path) # Move and rename GenBank file - genbank_path = extract_path / "ncbi_dataset" / "data" / refseq_id / "genomic.gbff" - new_genbank_path = extract_path / f"{refseq_id}.gbff" + genbank_path = extract_path / "ncbi_dataset" / "data" / genome_assembly_acc / "genomic.gbff" + new_genbank_path = extract_path / f"{genome_assembly_acc}.gbff" genbank_path.rename(new_genbank_path) # Delete unnecessary files From 496db38b06416c6accb39bf197fee388502eaa4b Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 16:49:50 +0100 Subject: [PATCH 18/48] improve genome download process with validation and retry logic --- .../genomics/antismash/ncbi_downloader.py | 71 +++++++++++++------ 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index b6b5da0c..d2095e81 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -5,6 +5,8 @@ from os import PathLike from pathlib import Path from typing import Optional +import httpx +import requests from nplinker.utils import check_md5 from nplinker.utils import download_url from nplinker.utils import extract_archive @@ -41,31 +43,11 @@ def download_and_extract_ncbi_genome( RuntimeError: If the maximum number of retries is reached and the dataset could not be successfully downloaded and 
extracted. """ - url = ( - "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/" - f"{genome_assembly_acc}/download?include_annotation_type=GENOME_GB" - ) - - download_root = Path(download_root) extract_path = Path(extract_root) / "ncbi_genomes" - filename = f"ncbi_{genome_assembly_acc}.zip" - extract_path.mkdir(parents=True, exist_ok=True) - for attempt in range(1, max_attempts + 1): - try: - download_url(url, download_root, filename) - archive = download_root / filename - break - except Exception as e: - logger.warning(f"Attempt {attempt}/{max_attempts} failed to download {url}. Error: {e}") - if attempt < max_attempts: - time.sleep(2) - else: - raise RuntimeError( - f"Maximum download retries ({max_attempts}) reached for {url}. Download failed." - ) - + _check_genome_accession_validity(genome_assembly_acc) + archive = _download_genome(genome_assembly_acc, download_root, max_attempts) extract_archive(archive, extract_path) verify_ncbi_dataset_md5_sums(extract_path) @@ -106,3 +88,48 @@ def verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: file_path = extract_path / file_name if not check_md5(file_path, md5sum): raise ValueError(f"MD5 checksum mismatch for {file_path}") + + +def _check_genome_accession_validity(genome_assembly_acc, max_attempts=10): + """Check the validity of genome accession.""" + url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{genome_assembly_acc}/check" + + # Retry multiple times because NCBI currently has issues (500 Internal Server Error) + for attempt in range(1, max_attempts + 1): + try: + response = requests.get(url) + response.raise_for_status() + break + except Exception: + if attempt < max_attempts: + time.sleep(1) + + # Raise if no attempt was successful + response.raise_for_status() + # Raise if genome assembly is not successful + if "valid_assemblies" not in response.json(): + raise ValueError(f"Not a valid genome assembly accession: {genome_assembly_acc}") + + +def 
_download_genome(genome_assembly_acc, download_root, max_attempts): + url = ( + "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/" + f"{genome_assembly_acc}/download?include_annotation_type=GENOME_GB" + ) + download_root = Path(download_root) + filename = f"ncbi_{genome_assembly_acc}.zip" + + # Retry multiple times because NCBI has issues currently + for attempt in range(1, max_attempts + 1): + try: + download_url(url, download_root, filename) + return download_root / filename + except httpx.ReadTimeout as e: + logger.warning(f"Attempt {attempt}/{max_attempts} failed to download {url}. Error: {e}") + if attempt < max_attempts: + time.sleep(1) + else: + raise httpx.ReadTimeout( + f"Failed to download the genome {genome_assembly_acc} from NCBI. " + f"Maximum download retries ({max_attempts}) reached for {url}." + ) From cc6573c206869fbb6152a02d0bad494f072b8669 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 16:50:28 +0100 Subject: [PATCH 19/48] test: add unit tests for download_and_extract_ncbi_genome function --- tests/unit/genomics/test_ncbi_downloader.py | 43 +++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 tests/unit/genomics/test_ncbi_downloader.py diff --git a/tests/unit/genomics/test_ncbi_downloader.py b/tests/unit/genomics/test_ncbi_downloader.py new file mode 100644 index 00000000..dadbe9ea --- /dev/null +++ b/tests/unit/genomics/test_ncbi_downloader.py @@ -0,0 +1,43 @@ +from unittest.mock import patch +import httpx +import pytest +from nplinker.genomics.antismash.ncbi_downloader import download_and_extract_ncbi_genome + + +@pytest.fixture +def download_root(tmp_path): + return tmp_path / "download" + + +@pytest.fixture +def extract_root(tmp_path): + return tmp_path / "extracted" + + +def test_download_and_extract_ncbi_genome_success(download_root, extract_root): + refseq_id = "GCF_000514775.1" + + genome_path = download_and_extract_ncbi_genome(refseq_id, download_root, extract_root) + + assert 
genome_path == extract_root / "ncbi_genomes" / f"{refseq_id}.gbff" + assert not (extract_root / "ncbi_genomes" / "md5sum.txt").exists() + assert not (extract_root / "ncbi_genomes" / "README.md").exists() + assert not (extract_root / "ncbi_genomes" / "ncbi_dataset").exists() + + +def test_download_and_extract_ncbi_genome_max_retries(download_root, extract_root): + refseq_id = "GCF_000514775.1" + + with patch( + "nplinker.genomics.antismash.ncbi_downloader.download_url", + side_effect=httpx.ReadTimeout("Download failed"), + ): + with pytest.raises(httpx.ReadTimeout, match="Maximum download retries"): + download_and_extract_ncbi_genome(refseq_id, download_root, extract_root, max_attempts=1) + + +def test_download_and_extract_ncbi_genome_invalid_refseq_id(download_root, extract_root): + refseq_id = "invalid_ref_seq_id" + + with pytest.raises(ValueError, match="Not a valid genome assembly accession"): + download_and_extract_ncbi_genome(refseq_id, download_root, extract_root) From 51e4817b669a18be12a9c814213905e667a85081 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 16:50:52 +0100 Subject: [PATCH 20/48] refactor: rename verify_ncbi_dataset_md5_sums to _verify_ncbi_dataset_md5_sums for consistency --- src/nplinker/genomics/antismash/ncbi_downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index d2095e81..f3260ef0 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -49,7 +49,7 @@ def download_and_extract_ncbi_genome( _check_genome_accession_validity(genome_assembly_acc) archive = _download_genome(genome_assembly_acc, download_root, max_attempts) extract_archive(archive, extract_path) - verify_ncbi_dataset_md5_sums(extract_path) + _verify_ncbi_dataset_md5_sums(extract_path) # Move and rename GenBank file genbank_path = extract_path / "ncbi_dataset" 
/ "data" / genome_assembly_acc / "genomic.gbff" @@ -64,7 +64,7 @@ def download_and_extract_ncbi_genome( return new_genbank_path -def verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: +def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: """Verify the integrity of files in a specified directory using MD5 checksums. This function reads an "md5sum.txt" file located in the given extraction path, From 6b564ea6d418763a312ac3e59144a4d267b42c9e Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 16:52:24 +0100 Subject: [PATCH 21/48] refactor: move _verify_ncbi_dataset_md5_sums function to a new location for better organization --- .../genomics/antismash/ncbi_downloader.py | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index f3260ef0..3a91b367 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -64,32 +64,6 @@ def download_and_extract_ncbi_genome( return new_genbank_path -def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: - """Verify the integrity of files in a specified directory using MD5 checksums. - - This function reads an "md5sum.txt" file located in the given extraction path, - which contains MD5 checksums and corresponding file names. It then computes - the MD5 checksum for each file and compares it with the expected value. If any - file's checksum does not match, a `ValueError` is raised. - - Args: - extract_path (PathLike): Path to the directory containing the files and - the "md5sum.txt" file. - - Returns: - bool: True if all files pass the MD5 checksum verification. - - Raises: - ValueError: If the MD5 checksum of any file does not match the expected value. 
- """ - with open(extract_path / "md5sum.txt", "r") as f: - for line in f: - md5sum, file_name = line.strip().split() - file_path = extract_path / file_name - if not check_md5(file_path, md5sum): - raise ValueError(f"MD5 checksum mismatch for {file_path}") - - def _check_genome_accession_validity(genome_assembly_acc, max_attempts=10): """Check the validity of genome accessio.""" url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{genome_assembly_acc}/check" @@ -133,3 +107,29 @@ def _download_genome(genome_assembly_acc, download_root, max_attempts): f"Failed to download the genome {genome_assembly_acc} from NCBI. " f"Maximum download retries ({max_attempts}) reached for {url}." ) + + +def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: + """Verify the integrity of files in a specified directory using MD5 checksums. + + This function reads an "md5sum.txt" file located in the given extraction path, + which contains MD5 checksums and corresponding file names. It then computes + the MD5 checksum for each file and compares it with the expected value. If any + file's checksum does not match, a `ValueError` is raised. + + Args: + extract_path (PathLike): Path to the directory containing the files and + the "md5sum.txt" file. + + Returns: + bool: True if all files pass the MD5 checksum verification. + + Raises: + ValueError: If the MD5 checksum of any file does not match the expected value. 
+ """ + with open(extract_path / "md5sum.txt", "r") as f: + for line in f: + md5sum, file_name = line.strip().split() + file_path = extract_path / file_name + if not check_md5(file_path, md5sum): + raise ValueError(f"MD5 checksum mismatch for {file_path}") From 7570852bb7437419228b02978d820deae009bead Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:06:18 +0100 Subject: [PATCH 22/48] feat: handle already download antiSMASH results --- src/nplinker/genomics/antismash/__init__.py | 2 + .../antismash/antismash_downloader.py | 35 ++++++++++++++ .../antismash/podp_antismash_downloader.py | 46 ++++++++++++++++++- 3 files changed, 81 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index 4817e806..7c9179a0 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ b/src/nplinker/genomics/antismash/__init__.py @@ -2,6 +2,7 @@ from .antismash_api_client import submit_antismash_job from .antismash_downloader import download_and_extract_from_antismash_api from .antismash_downloader import download_and_extract_from_antismash_db +from .antismash_downloader import extract_antismash_data from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank from .ncbi_downloader import download_and_extract_ncbi_genome @@ -11,6 +12,7 @@ __all__ = [ + "extract_antismash_data", "download_and_extract_from_antismash_api", "download_and_extract_from_antismash_db", "AntismashBGCLoader", diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 928001c9..ef351ff8 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -6,6 +6,7 @@ from pathlib import Path import requests from nplinker.utils import download_and_extract_archive +from nplinker.utils import extract_archive from nplinker.utils import list_dirs 
from nplinker.utils import list_files @@ -118,6 +119,40 @@ def download_and_extract_from_antismash_db( raise RuntimeError(f"No results in antiSMASH DB for {refseq_acc}") +def extract_antismash_data( + archive: str | PathLike, extract_root: str | PathLike, antimash_id: str +) -> None: + """Extracts antiSMASH results from a given archive into a specified directory. + + This function handles the extraction of antiSMASH results by preparing the + extraction path, extracting the archive, and performing cleanup of + unnecessary files. If an error occurs during the process, the partially + extracted files are removed, and the exception is re-raised. + + Args: + archive (str | PathLike): The path to the archive file containing antiSMASH results. + extract_root (str | PathLike): The root directory where the data should + be extracted. + antimash_id (str): A unique identifier for the antiSMASH data, used to + create a subdirectory for the extracted files. + + Raises: + Exception: If any error occurs during the extraction process, the + exception is re-raised after cleaning up the extraction directory. 
+ """ + extract_path = Path(extract_root) / "antismash" / antimash_id + + _prepare_extract_path(extract_path) + + try: + extract_archive(archive, extract_path, remove_finished=False) + _cleanup_extracted_files(extract_path) + + except Exception as e: + shutil.rmtree(extract_path) + raise e + + def _check_extract_path(extract_path: Path): # check if extract_path is empty if any(extract_path.iterdir()): diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index efd7acc5..311c5160 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -16,6 +16,7 @@ from nplinker.genomics.antismash import download_and_extract_from_antismash_api from nplinker.genomics.antismash import download_and_extract_from_antismash_db from nplinker.genomics.antismash import download_and_extract_ncbi_genome +from nplinker.genomics.antismash import extract_antismash_data from nplinker.genomics.antismash import submit_antismash_job from nplinker.schemas import GENOME_STATUS_SCHEMA @@ -170,13 +171,23 @@ def podp_download_and_extract_antismash_data( # Retrieve or initialize the GenomeStatus object for the genome ID gs = gs_dict.setdefault(original_genome_id, GenomeStatus(original_genome_id)) - # Skip genomes + # Check if genomes already have antiSMASH BGC data if gs.bgc_path and Path(gs.bgc_path).exists(): logger.info( f"antiSMASH BGC data for genome ID {original_genome_id} already downloaded to " f"{gs.bgc_path}" ) - continue + try: + process_existing_antismash_data(gs, project_extract_root) + continue + except Exception as e: + logger.warning( + "Failed to process existing antiSMASH BGC data for genome ID " + f"{original_genome_id}. 
Error: {e}" + ) + gs.bgc_path = "" # Reset bgc path + + # Check if a previous attempt to get bgc data has failed if gs.resolve_attempted: logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt") continue @@ -296,6 +307,37 @@ def get_genome_assembly_accession( raise RuntimeError("Failed to get genome assembly accession") +def process_existing_antismash_data(gs_obj: GenomeStatus, extract_root: str | PathLike) -> None: + """Processes already downloaded antiSMASH BGC data archive. + + This function ensures that the antiSMASH data archive associated with a given genomic sequence + object is properly extracted into a specified directory. If the data has already been extracted, + the function skips the extraction process. + + Args: + gs_obj: An object representing a genomic sequence, which contains the path + to the antiSMASH BGC data (accessible via `gs_obj.bgc_path`) and + an original identifier (`gs_obj.original_id`). + extract_root: The root directory where the antiSMASH data should be extracted. + + Raises: + Any exceptions raised by the `extract_antismash_data` function if the extraction fails. + """ + antismash_id = Path(gs_obj.bgc_path).stem + extract_path = Path(extract_root, "antismash", antismash_id) + completed_marker = extract_path / "completed" + + # Check if archive is already successfully extracted + if completed_marker.exists(): + logger.info( + f"antiSMASH BGC data for {gs_obj.original_id} already extracted at {extract_path}." 
+ ) + return + + extract_antismash_data(gs_obj.bgc_path, extract_root, antismash_id) + completed_marker.touch(exist_ok=True) + + def retrieve_antismash_db_data( genome_status: GenomeStatus, download_root: str | PathLike, extract_root: str | PathLike ) -> None: From 98812546caa77d506714d5b3013f1b06986aab8f Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:07:04 +0100 Subject: [PATCH 23/48] fix mistake in docstring --- src/nplinker/genomics/antismash/podp_antismash_downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 311c5160..5af4169a 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -297,8 +297,8 @@ def get_genome_assembly_accession( information, where keys like "RefSeq_accession", "GenBank_accession", or "JGI_Genome_ID" are used to resolve the RefSeq ID. - Warnings: - Logs a warning if the RefSeq ID cannot be resolved. + Raises: + RuntimeError: If the RefSeq ID cannot be resolved. 
""" genome_status.resolved_refseq_id = _resolve_refseq_id(genome_id_data) genome_status.resolve_attempted = True From 0e1ec548fa420ff0b9f51f74545b10b2ca9a8bc0 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:14:32 +0100 Subject: [PATCH 24/48] fix: update return type of submit_antismash_job to str --- src/nplinker/genomics/antismash/antismash_api_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py index 32b932c0..2b6fd7a8 100644 --- a/src/nplinker/genomics/antismash/antismash_api_client.py +++ b/src/nplinker/genomics/antismash/antismash_api_client.py @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) -def submit_antismash_job(genbank_filepath: str | PathLike) -> Optional[str]: +def submit_antismash_job(genbank_filepath: str | PathLike) -> str: """Submits an antiSMASH job using the provided GenBank file. This function sends a GenBank file to the antiSMASH API @@ -19,11 +19,11 @@ def submit_antismash_job(genbank_filepath: str | PathLike) -> Optional[str]: genbank_filepath (str | PathLike): The path to the GenBank file to be submitted. Returns: - Optional[str]: The job ID if the submission is successful, or None if it fails. + str: The job ID if the submission. Raises: requests.exceptions.RequestException: If there is an issue with the HTTP request. - RuntimeError: If the API response does not contain a job ID. + RuntimeError: If the API response does not contain a job ID. 
""" url = "https://antismash.secondarymetabolites.org/api/v1.0/submit" genbank_filepath = Path(genbank_filepath) From f53f1a7b062f73600a17bb6d9ecdc159d426306e Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:16:36 +0100 Subject: [PATCH 25/48] fix: update return type of download_and_extract_ncbi_genome to Path --- src/nplinker/genomics/antismash/ncbi_downloader.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index 3a91b367..a47c53f5 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -4,7 +4,6 @@ import time from os import PathLike from pathlib import Path -from typing import Optional import httpx import requests from nplinker.utils import check_md5 @@ -20,7 +19,7 @@ def download_and_extract_ncbi_genome( download_root: str | PathLike, extract_root: str | PathLike, max_attempts: int = 10, -) -> Optional[Path]: +) -> Path: """Downloads and extracts an NCBI dataset for a given genome RefSeq ID. This function retrieves a dataset from the NCBI database using the provided @@ -37,7 +36,7 @@ def download_and_extract_ncbi_genome( max_attempts (int): The maximum number of download attempts. Defaults to 10. Returns: - Optional[Path]: The path to the extracted GenBank file if successful, otherwise None. + Path: The path to the extracted GenBank file. 
Raises: RuntimeError: If the maximum number of retries is reached and the dataset From 5e5b2bd241f7f0287b318bb0c63aea03daa20555 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:29:04 +0100 Subject: [PATCH 26/48] update submit_antismash_job to return job ID as string and improve error handling in antismash_job_is_done --- .../antismash/antismash_api_client.py | 58 +++++++------------ 1 file changed, 21 insertions(+), 37 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py index 2b6fd7a8..39cba5b4 100644 --- a/src/nplinker/genomics/antismash/antismash_api_client.py +++ b/src/nplinker/genomics/antismash/antismash_api_client.py @@ -2,7 +2,6 @@ import logging from os import PathLike from pathlib import Path -from typing import Optional import requests @@ -36,60 +35,45 @@ def submit_antismash_job(genbank_filepath: str | PathLike) -> str: data = response.json() if "id" not in data: raise RuntimeError("No antiSMASH job ID returned") - return data["id"] - - -def query_antismash_job(job_id: str) -> Optional[dict]: - """Gets the status of an antiSMASH job. - - Args: - job_id (str): The job ID to query. - - Returns: - dict: The response JSON if successful, otherwise None. 
- """ - url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}" - - try: - response = requests.get(url, timeout=10) - response.raise_for_status() # Raise an exception for HTTP errors - return response.json() - - except requests.exceptions.RequestException as req_err: - logger.error(f"Request failed for job_id {job_id}: {req_err}") - except ValueError as json_err: # Handles JSON decoding errors - logger.error(f"Invalid JSON response for job_id {job_id}: {json_err}") - except Exception as err: - logger.error(f"Unexpected error while getting job state for job_id {job_id}: {err}") + return str(data["id"]) def antismash_job_is_done(job_id: str) -> bool: - """Checks if the antiSMASH job is complete by polling the job status. + """Determines if the antiSMASH job has completed by checking its status. + + This function queries the antiSMASH API to retrieve the current state + of the job and determines whether it has finished successfully, is still + in progress, or has encountered an error. Args: - job_id (str): The job ID to query. + job_id (str): The unique identifier of the antiSMASH job. Returns: - bool: True if the job is done, False if the job is still running. + bool: True if the job is completed successfully, False if it is still + running or queued. Raises: - RuntimeError: If the job status could not be retrieved or if the job failed. - ValueError: If the job state is missing or unexpected in the response. + RuntimeError: If the job has failed or if the API response indicates an error. + ValueError: If the job state is missing or an unexpected state is encountered + in the API response. + requests.exceptions.HTTPError: If an HTTP error occurs during the API request. 
""" - response = query_antismash_job(job_id) + url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}" + + response = requests.get(url, timeout=10) + response.raise_for_status() # Raise exception for HTTP errors + respose_data = response.json() - if response is None: - raise RuntimeError(f"Failed to retrieve job status for job_id {job_id}") - if "state" not in response: + if "state" not in respose_data: raise ValueError(f"Job state missing in response for job_id: {job_id}") - job_state = response["state"] + job_state = respose_data["state"] if job_state in ("running", "queued"): return False if job_state == "done": return True if job_state == "failed": - job_status = response.get("status", "No error message provided") + job_status = respose_data.get("status", "No error message provided") raise RuntimeError(f"AntiSMASH job {job_id} failed with an error: {job_status}") else: raise ValueError( From 909ca639c586e8066acf43e2c49c55bcae73adda Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:29:18 +0100 Subject: [PATCH 27/48] chore: add types-requests to development dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 879895f0..8fd371b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ dev = [ "mypy", "typing_extensions", # stub packages. Update the `format-typing-check.yml` too if you add more. 
+ "types-requests", "types-beautifulsoup4", "types-jsonschema", "types-networkx", From 252a177c840eb28a32c17398a44d351927cfb7cc Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:30:27 +0100 Subject: [PATCH 28/48] fix: update return type of _verify_ncbi_dataset_md5_sums to None and remove return value documentation --- src/nplinker/genomics/antismash/ncbi_downloader.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index a47c53f5..9bf8c96b 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -108,7 +108,7 @@ def _download_genome(genome_assembly_acc, download_root, max_attempts): ) -def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: +def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> None: """Verify the integrity of files in a specified directory using MD5 checksums. This function reads an "md5sum.txt" file located in the given extraction path, @@ -120,9 +120,6 @@ def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> bool: extract_path (PathLike): Path to the directory containing the files and the "md5sum.txt" file. - Returns: - bool: True if all files pass the MD5 checksum verification. - Raises: ValueError: If the MD5 checksum of any file does not match the expected value. 
""" From 81a01024c1b01da9007be45444a7e09a689f69d4 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:31:49 +0100 Subject: [PATCH 29/48] fix: update _verify_ncbi_dataset_md5_sums to accept str or PathLike for extract_path --- src/nplinker/genomics/antismash/ncbi_downloader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index 9bf8c96b..a39338fa 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -108,7 +108,7 @@ def _download_genome(genome_assembly_acc, download_root, max_attempts): ) -def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> None: +def _verify_ncbi_dataset_md5_sums(extract_path: str | PathLike) -> None: """Verify the integrity of files in a specified directory using MD5 checksums. This function reads an "md5sum.txt" file located in the given extraction path, @@ -123,6 +123,7 @@ def _verify_ncbi_dataset_md5_sums(extract_path: PathLike) -> None: Raises: ValueError: If the MD5 checksum of any file does not match the expected value. 
""" + extract_path = Path(extract_path) with open(extract_path / "md5sum.txt", "r") as f: for line in f: md5sum, file_name = line.strip().split() From 50890a73d3e38466118ed5d030b25d0788b4cb66 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:33:26 +0100 Subject: [PATCH 30/48] fix: convert extract_path to Path in _prepare_extract_path for consistency --- src/nplinker/genomics/antismash/antismash_downloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index ef351ff8..2c27938e 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -172,6 +172,7 @@ def _cleanup_extracted_files(extract_path: str | PathLike) -> None: def _prepare_extract_path(extract_path: str | PathLike) -> None: + extract_path = Path(extract_path) if extract_path.exists(): _check_extract_path(extract_path) else: From b784476bbec8223ca0535c6a644eda11ee4ae8ef Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 18:53:52 +0100 Subject: [PATCH 31/48] chore: update typing dependencies in format-typing-check workflow --- .github/workflows/format-typing-check.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/format-typing-check.yml b/.github/workflows/format-typing-check.yml index a4128a15..c5a97d4b 100644 --- a/.github/workflows/format-typing-check.yml +++ b/.github/workflows/format-typing-check.yml @@ -37,8 +37,8 @@ jobs: - name: Install ruff and mypy run: | pip install ruff mypy typing_extensions \ - types-Deprecated types-beautifulsoup4 types-jsonschema \ - types-networkx types-tabulate types-PyYAML pandas-stubs + types-Deprecated types-beautifulsoup4 types-jsonschema types-requests \ + types-networkx types-tabulate types-PyYAML pandas-stubs - name: Get all changed python files id: changed-python-files uses: 
tj-actions/changed-files@v44 From bdebd9679a2f3358f6ce2fc733ca6958d0342190 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 19:05:54 +0100 Subject: [PATCH 32/48] fix: clarify return value documentation for submit_antismash_job function --- src/nplinker/genomics/antismash/antismash_api_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py index 39cba5b4..124aedcb 100644 --- a/src/nplinker/genomics/antismash/antismash_api_client.py +++ b/src/nplinker/genomics/antismash/antismash_api_client.py @@ -18,7 +18,7 @@ def submit_antismash_job(genbank_filepath: str | PathLike) -> str: genbank_filepath (str | PathLike): The path to the GenBank file to be submitted. Returns: - str: The job ID if the submission. + str: The job ID of the submitted antiSMASH job. Raises: requests.exceptions.RequestException: If there is an issue with the HTTP request. 
From 7334c902813a3e5dae31b3087a0d626bd73686b9 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 19 Mar 2025 19:06:02 +0100 Subject: [PATCH 33/48] fix: enable postponed evaluation of type annotations in ncbi_downloader.py --- src/nplinker/genomics/antismash/ncbi_downloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index a39338fa..143102c8 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -1,3 +1,4 @@ +from __future__ import annotations import logging import os import shutil From 2f0d074312682d702805400b918e40fae2e2c828 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 20 Mar 2025 17:50:24 +0100 Subject: [PATCH 34/48] feat: add genome accession resolver for NCBI assembly accessions --- .../antismash/genome_accession_resolver.py | 225 ++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 src/nplinker/genomics/antismash/genome_accession_resolver.py diff --git a/src/nplinker/genomics/antismash/genome_accession_resolver.py b/src/nplinker/genomics/antismash/genome_accession_resolver.py new file mode 100644 index 00000000..a1a8423d --- /dev/null +++ b/src/nplinker/genomics/antismash/genome_accession_resolver.py @@ -0,0 +1,225 @@ +import logging +import re +from typing import Any +from typing import Literal +import httpx +from bs4 import BeautifulSoup + + +JGI_GENOME_LOOKUP_URL = ( + "https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid={}" +) +USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0" + +logger = logging.getLogger(__name__) + + +def get_latest_assembly_accession(acc: str) -> str: + """Retrieve the latest NCBI genome assembly accession for a given accession. 
+ + This function retrieves the most recent genome assembly accession + from the revision history of the provided accession. It prioritizes + RefSeq accessions, and if unavailable, falls back to GenBank accessions. + + Args: + acc (str): The accession identifier to resolve. + + Returns: + str: The latest valid genome assembly accession. + + Raises: + ValueError: If no valid RefSeq or GenBank accession is found in + the assembly revision history. + """ + revision_history = _get_revision_history(acc) + + acc_priority = ("refseq_accession", "genbank_accession") + for acc_type in acc_priority: + assembly_revisions = revision_history.get("assembly_revisions", []) + revisions_with_acc = [entry for entry in assembly_revisions if acc_type in entry] + if revisions_with_acc: + latest_revision = max(revisions_with_acc, key=lambda x: x["release_date"]) + return latest_revision[acc_type] + + raise ValueError("No valid genome accession found in assembly revision history") + + +def resolve_genome_accession(genome_id_data: dict) -> str: + """Gets the NCBI genome assembly accession. + + Gets the latest RefSeq genome assembly accession, or if not available, + the latest GenBank genome assembly accession. + + This function gets the latest RefSeq genome assembly accession, or if not + available, the latest GenBank genome assembly accession. It attempts to get + a genome accession by checking the genome id date for specific ID types in + the following order: + 1. RefSeq_accession + 2. GenBank_accession + 3. JGI_Genome_ID + + For each ID type, it uses a corresponding resolver function to process + the ID. If a resolver fails, a warning is logged, and the function + proceeds to the next ID type. If no valid genome assembly accession can be + retrieved, a RuntimeError is raised. + + Args: + genome_id_data (dict): A dictionary containing genome ID types as keys + and their corresponding values. 
+ + Returns: + str: The retrieved genome accession, prioritizing RefSeq if available, + otherwise GenBank. + + Raises: + RuntimeError: If no valid assembly accessions can be retrieved. + + Logs: + Warning messages if a resolver fails for a specific ID type. + """ + resolver_priority = ["RefSeq_accession", "GenBank_accession", "JGI_Genome_ID"] + resolvers: dict[str, callable] = { + "RefSeq_accession": _resolve_refseq, + "GenBank_accession": _resolve_genbank, + "JGI_Genome_ID": _resolve_jgi, + } + + for id_type in resolver_priority: + if id_type not in genome_id_data: + continue + + resolver = resolvers[id_type] + try: + return resolver(genome_id_data[id_type].strip()) + except Exception as e: + logger.warning(f"Failed to resolve {id_type}: {e}") + + raise RuntimeError("No valid assembly accessions found") + + +def validate_assembly_acc(acc: str, acc_type: Literal["RefSeq", "GenBank"]) -> None: + """Validates a NCBI genome assembly accession string based on its type. + + This function checks if the provided genome assembly accession string + adheres to the expected format for the specified accession type + ('RefSeq' or 'GenBank'). It ensures that the accession starts with the + correct prefix ('GCF_' for RefSeq or 'GCA_' for GenBank) and contains + a version number. + + Args: + acc (str): The genome assembly accession string to validate. + acc_type (Literal["RefSeq", "GenBank"]): The type of accession, either + 'RefSeq' or 'GenBank'. + + Raises: + ValueError: If the accession type is invalid. + ValueError: If the accession does not start with the expected prefix. + ValueError: If the accession is missing a version number. + """ + if acc_type == "RefSeq": + prefix = "GCF_" + elif acc_type == "GenBank": + prefix = "GCA_" + else: + raise ValueError( + f"Invalid genome assembly accession type '{acc_type}'. Expected 'RefSeq' or 'GenBank'." 
+ ) + if not acc.startswith(prefix): + raise ValueError(f"Invalid {acc_type} assembly accession (must start with {prefix}): {acc}") + if "." not in acc: + raise ValueError(f"Invalid assembly accession (missing version number): {acc}") + + +def _get_revision_history(assembly_acc: str) -> dict[str, Any]: + """Fetches the revision history of a genome assembly from the NCBI Datasets API. + + Args: + assembly_acc (str): The accession number of the genome assembly. + + Returns: + dict[str, Any]: A dictionary containing the revision history of the specified assembly. + + Raises: + httpx.HTTPStatusError: If the HTTP request fails or returns a non-success status code. + ValueError: If no revision history data is found for the given accession number. + """ + url = ( + f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{assembly_acc}/revision_history" + ) + resp = httpx.get(url, headers={"User-Agent": USER_AGENT}, timeout=10.0, follow_redirects=True) + resp.raise_for_status() + revision_history = resp.json() + if not revision_history: + raise ValueError(f"No Assembly Revision data found for {assembly_acc}") + return revision_history + + +def _resolve_refseq(acc: str) -> str: + """Retrieves the latest NCBI genome assembly accession for a given RefSeq accession. + + This function validates the provided RefSeq accession and retrieves the + most up-to-date RefSeq genome assembly accession, or if not found, + alternatively the most up-to-date GenBank genome assembly accession. + + Args: + acc (str): The RefSeq accession to resolve. + + Returns: + str: The latest NCBI assembly accession corresponding to the given RefSeq accession. + + Raises: + ValueError: If the provided accession is invalid or not a RefSeq accession. + """ + validate_assembly_acc(acc, "RefSeq") + return get_latest_assembly_accession(acc) + + +def _resolve_genbank(acc: str) -> str: + """Retrieves the latest NCBI genome assembly accession for a given GenBank accession. 
+ + This function validates the provided GenBank accession and retrieves + the most up-to-date RefSeq genome assembly accession, or if not found, + alternatively the most up-to-date GenBank genome assembly accession. + + Args: + acc (str): The GenBank accession to resolve. + + Returns: + str: The latest NCBI assembly accession corresponding to the given GenBank accession. + + Raises: + ValueError: If the provided accession is invalid or not a GenBank accession. + """ + validate_assembly_acc(acc, "GenBank") + return get_latest_assembly_accession(acc) + + +def _resolve_jgi(jgi_genome_id: str) -> str: + """Resolves a JGI Genome ID to its corresponding NCBI assembly accession. + + This function queries a predefined JGI genome lookup URL using the provided + JGI Genome ID. It parses the HTML response to extract the NCBI assembly + accession link. If no valid link is found, an exception is raised. The + function then retrieves the most up-to-date NCBI assembly accession for the + found assembly accession. + + Args: + jgi_genome_id (str): The JGI Genome ID to resolve. + + Returns: + str: The latest NCBI assembly accession corresponding to the given JGI Genome ID. + + Raises: + ValueError: If no NCBI accessions can be found for the given JGI Genome ID. + httpx.HTTPStatusError: If the HTTP request to the JGI genome lookup URL fails. + """ + url = JGI_GENOME_LOOKUP_URL.format(jgi_genome_id) + resp = httpx.get(url, headers={"User-Agent": USER_AGENT}, timeout=10.0, follow_redirects=True) + resp.raise_for_status() + + soup = BeautifulSoup(resp.content, "html.parser") + link = soup.find("a", href=re.compile("https://www.ncbi.nlm.nih.gov/datasets/genome/.*")) + if not link: + raise ValueError(f"Unable to find NCBI accessions for the JGI Genome ID: {jgi_genome_id}. 
") + assembly_acc = link.text + return get_latest_assembly_accession(assembly_acc) From 5d13fea4457a1690974f1bf0d29eaa4ea45ee667 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 20 Mar 2025 17:50:33 +0100 Subject: [PATCH 35/48] test: add unit tests for genome accession resolver functions --- .../test_genome_accession_resolver.py | 203 ++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100644 tests/unit/genomics/test_genome_accession_resolver.py diff --git a/tests/unit/genomics/test_genome_accession_resolver.py b/tests/unit/genomics/test_genome_accession_resolver.py new file mode 100644 index 00000000..5b76eedc --- /dev/null +++ b/tests/unit/genomics/test_genome_accession_resolver.py @@ -0,0 +1,203 @@ +from unittest.mock import patch +import pytest +from nplinker.genomics.antismash.genome_accession_resolver import get_latest_assembly_accession +from nplinker.genomics.antismash.genome_accession_resolver import resolve_genome_accession +from nplinker.genomics.antismash.genome_accession_resolver import validate_assembly_acc + + +USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0" + + +@pytest.fixture +def mock_resolvers(): + with ( + patch( + "nplinker.genomics.antismash.genome_accession_resolver._resolve_refseq" + ) as mock_refseq, + patch( + "nplinker.genomics.antismash.genome_accession_resolver._resolve_genbank" + ) as mock_genbank, + patch("nplinker.genomics.antismash.genome_accession_resolver._resolve_jgi") as mock_jgi, + ): + yield mock_refseq, mock_genbank, mock_jgi + + +# test get_latest_assembly_accession + + +def test_get_latest_assembly_accession_refseq_success(): + with patch( + "nplinker.genomics.antismash.genome_accession_resolver._get_revision_history" + ) as mock_get_revision_history: + mock_get_revision_history.return_value = { + "assembly_revisions": [ + {"refseq_accession": "GCF_000001.2", "release_date": "2023-01-01"}, + {"refseq_accession": "GCF_000001.1", "release_date": "2022-01-01"}, 
+ ] + } + + result = get_latest_assembly_accession("GCF_000001.1") + + assert result == "GCF_000001.2" + mock_get_revision_history.assert_called_once_with("GCF_000001.1") + + +def test_get_latest_assembly_accession_genbank_success(): + with patch( + "nplinker.genomics.antismash.genome_accession_resolver._get_revision_history" + ) as mock_get_revision_history: + mock_get_revision_history.return_value = { + "assembly_revisions": [ + {"genbank_accession": "GCA_000002.2", "release_date": "2023-01-01"}, + {"genbank_accession": "GCA_000002.1", "release_date": "2022-01-01"}, + ] + } + + result = get_latest_assembly_accession("GCA_000002.1") + + assert result == "GCA_000002.2" + mock_get_revision_history.assert_called_once_with("GCA_000002.1") + + +def test_get_latest_assembly_accession_refseq_and_genbank(): + with patch( + "nplinker.genomics.antismash.genome_accession_resolver._get_revision_history" + ) as mock_get_revision_history: + mock_get_revision_history.return_value = { + "assembly_revisions": [ + {"refseq_accession": "GCF_000001.2", "release_date": "2023-01-01"}, + {"genbank_accession": "GCA_000002.2", "release_date": "2022-01-01"}, + ] + } + + result = get_latest_assembly_accession("GCF_000001.1") + + assert result == "GCF_000001.2" + mock_get_revision_history.assert_called_once_with("GCF_000001.1") + + +def test_get_latest_assembly_accession_no_valid_accession(): + with patch( + "nplinker.genomics.antismash.genome_accession_resolver._get_revision_history" + ) as mock_get_revision_history: + mock_get_revision_history.return_value = {"assembly_revisions": []} + + with pytest.raises(ValueError, match="No valid genome accession found"): + get_latest_assembly_accession("GCF_000001.1") + + mock_get_revision_history.assert_called_once_with("GCF_000001.1") + + +# test resolve_for_genome_accession + + +def test_resolve_genome_accession_refseq_success(mock_resolvers): + mock_refseq, mock_genbank, mock_jgi = mock_resolvers + mock_refseq.return_value = "GCF_000001.2" + + 
genome_id_data = {"RefSeq_accession": "GCF_000001.1"} + result = resolve_genome_accession(genome_id_data) + + assert result == "GCF_000001.2" + mock_refseq.assert_called_once_with("GCF_000001.1") + mock_genbank.assert_not_called() + mock_jgi.assert_not_called() + + +def test_resolve_genome_accession_genbank_success(mock_resolvers): + mock_refseq, mock_genbank, mock_jgi = mock_resolvers + mock_refseq.side_effect = Exception("RefSeq resolution failed") + mock_genbank.return_value = "GCA_000002.2" + + genome_id_data = { + "RefSeq_accession": "GCF_000001.1", + "GenBank_accession": "GCA_000002.1", + } + result = resolve_genome_accession(genome_id_data) + + assert result == "GCA_000002.2" + mock_refseq.assert_called_once_with("GCF_000001.1") + mock_genbank.assert_called_once_with("GCA_000002.1") + mock_jgi.assert_not_called() + + +def test_resolve_genome_accession_jgi_success(mock_resolvers): + mock_refseq, mock_genbank, mock_jgi = mock_resolvers + mock_refseq.side_effect = Exception("RefSeq resolution failed") + mock_genbank.side_effect = Exception("GenBank resolution failed") + mock_jgi.return_value = "GCF_000001.2" + + genome_id_data = { + "RefSeq_accession": "GCF_000001.1", + "GenBank_accession": "GCA_000002.1", + "JGI_Genome_ID": "12345", + } + result = resolve_genome_accession(genome_id_data) + + assert result == "GCF_000001.2" + mock_refseq.assert_called_once_with("GCF_000001.1") + mock_genbank.assert_called_once_with("GCA_000002.1") + mock_jgi.assert_called_once_with("12345") + + +def test_resolve_genome_accession_no_valid_accession(mock_resolvers): + mock_refseq, mock_genbank, mock_jgi = mock_resolvers + mock_refseq.side_effect = Exception("RefSeq resolution failed") + mock_genbank.side_effect = Exception("GenBank resolution failed") + mock_jgi.side_effect = Exception("JGI resolution failed") + + genome_id_data = { + "RefSeq_accession": "GCF_000001.1", + "GenBank_accession": "GCA_000002.1", + "JGI_Genome_ID": "12345", + } + + with pytest.raises(RuntimeError, 
match="No valid assembly accessions found"): + resolve_genome_accession(genome_id_data) + + mock_refseq.assert_called_once_with("GCF_000001.1") + mock_genbank.assert_called_once_with("GCA_000002.1") + mock_jgi.assert_called_once_with("12345") + + +def test_resolve_genome_accession_missing_keys(mock_resolvers): + mock_refseq, mock_genbank, mock_jgi = mock_resolvers + + genome_id_data = {} + with pytest.raises(RuntimeError, match="No valid assembly accessions found"): + resolve_genome_accession(genome_id_data) + + mock_refseq.assert_not_called() + mock_genbank.assert_not_called() + mock_jgi.assert_not_called() + + +def test_validate_assembly_acc_valid(): + # Test valid GenBank accession + validate_assembly_acc("GCA_000002.1", "GenBank") + # Test valid RefSeq accession + validate_assembly_acc("GCF_000001.1", "RefSeq") + + +def test_validate_assembly_acc_invalid_type(): + # Test invalid accession type + with pytest.raises(ValueError, match="Invalid genome assembly accession type 'InvalidType'"): + validate_assembly_acc("GCF_000001.1", "InvalidType") + + +def test_validate_assembly_acc_invalid_prefix(): + # Test invalid prefix for GenBank accession + with pytest.raises(ValueError, match="Invalid GenBank assembly accession"): + validate_assembly_acc("GCF_000002.1", "GenBank") + # Test invalid prefix for RefSeq accession + with pytest.raises(ValueError, match="Invalid RefSeq assembly accession"): + validate_assembly_acc("GCA_000001.1", "RefSeq") + + +def test_validate_assembly_acc_missing_version(): + # Test missing version number for GenBank accession + with pytest.raises(ValueError, match="Invalid assembly accession \\(missing version number\\)"): + validate_assembly_acc("GCA_000002", "GenBank") + # Test missing version number for RefSeq accession + with pytest.raises(ValueError, match="Invalid assembly accession \\(missing version number\\)"): + validate_assembly_acc("GCF_000001", "RefSeq") From 5a201ccb331e065a57c0a4017a4ded70ab276148 Mon Sep 17 00:00:00 2001 From: 
Annette Lien Date: Thu, 20 Mar 2025 18:03:21 +0100 Subject: [PATCH 36/48] fix: ensure no mypy typing errors --- .../genomics/antismash/genome_accession_resolver.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/nplinker/genomics/antismash/genome_accession_resolver.py b/src/nplinker/genomics/antismash/genome_accession_resolver.py index a1a8423d..95fb1364 100644 --- a/src/nplinker/genomics/antismash/genome_accession_resolver.py +++ b/src/nplinker/genomics/antismash/genome_accession_resolver.py @@ -1,6 +1,7 @@ import logging import re from typing import Any +from typing import Callable from typing import Literal import httpx from bs4 import BeautifulSoup @@ -39,7 +40,7 @@ def get_latest_assembly_accession(acc: str) -> str: revisions_with_acc = [entry for entry in assembly_revisions if acc_type in entry] if revisions_with_acc: latest_revision = max(revisions_with_acc, key=lambda x: x["release_date"]) - return latest_revision[acc_type] + return str(latest_revision[acc_type]) raise ValueError("No valid genome accession found in assembly revision history") @@ -78,7 +79,7 @@ def resolve_genome_accession(genome_id_data: dict) -> str: Warning messages if a resolver fails for a specific ID type. 
""" resolver_priority = ["RefSeq_accession", "GenBank_accession", "JGI_Genome_ID"] - resolvers: dict[str, callable] = { + resolvers: dict[str, Callable] = { "RefSeq_accession": _resolve_refseq, "GenBank_accession": _resolve_genbank, "JGI_Genome_ID": _resolve_jgi, @@ -90,7 +91,8 @@ def resolve_genome_accession(genome_id_data: dict) -> str: resolver = resolvers[id_type] try: - return resolver(genome_id_data[id_type].strip()) + genome_id = genome_id_data[id_type].strip() + return str(resolver(genome_id)) except Exception as e: logger.warning(f"Failed to resolve {id_type}: {e}") @@ -151,6 +153,8 @@ def _get_revision_history(assembly_acc: str) -> dict[str, Any]: revision_history = resp.json() if not revision_history: raise ValueError(f"No Assembly Revision data found for {assembly_acc}") + if not isinstance(revision_history, dict): + raise ValueError(f"Unexpected response format: {type(revision_history)}") return revision_history From a8ed1465689d8ed2163ae5a6f3a72969e09f311c Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 20 Mar 2025 19:12:55 +0100 Subject: [PATCH 37/48] feat: use the new genome ID resolver --- src/nplinker/genomics/antismash/__init__.py | 2 + .../antismash/podp_antismash_downloader.py | 156 +----------------- 2 files changed, 5 insertions(+), 153 deletions(-) diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index 7c9179a0..03e50b07 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ b/src/nplinker/genomics/antismash/__init__.py @@ -5,6 +5,7 @@ from .antismash_downloader import extract_antismash_data from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank +from .genome_accession_resolver import resolve_genome_accession from .ncbi_downloader import download_and_extract_ncbi_genome from .podp_antismash_downloader import GenomeStatus from .podp_antismash_downloader import get_best_available_genome_id @@ -13,6 +14,7 @@ __all__ = [ 
"extract_antismash_data", + "resolve_genome_accession", "download_and_extract_from_antismash_api", "download_and_extract_from_antismash_db", "AntismashBGCLoader", diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 5af4169a..4bba189b 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -1,15 +1,12 @@ from __future__ import annotations import json import logging -import re import time import warnings from collections.abc import Mapping from collections.abc import Sequence from os import PathLike from pathlib import Path -import httpx -from bs4 import BeautifulSoup from jsonschema import validate from nplinker.defaults import GENOME_STATUS_FILENAME from nplinker.genomics.antismash import antismash_job_is_done @@ -17,6 +14,7 @@ from nplinker.genomics.antismash import download_and_extract_from_antismash_db from nplinker.genomics.antismash import download_and_extract_ncbi_genome from nplinker.genomics.antismash import extract_antismash_data +from nplinker.genomics.antismash import resolve_genome_accession from nplinker.genomics.antismash import submit_antismash_job from nplinker.schemas import GENOME_STATUS_SCHEMA @@ -194,7 +192,8 @@ def podp_download_and_extract_antismash_data( # resolve genome ID try: - get_genome_assembly_accession(gs, genome_record["genome_ID"]) + gs.resolved_refseq_id = resolve_genome_accession(genome_record["genome_ID"]) + gs.resolve_attempted = True except Exception as e: logger.warning(f"Failed to resolve genome ID {gs.original_id}. Error: {e}") continue @@ -279,34 +278,6 @@ def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | Non return best_id -def get_genome_assembly_accession( - genome_status: GenomeStatus, genome_id_data: Mapping[str, str] -) -> None: - """Resolve and update the genome assembly accession for a given genome status. 
- - This function attempts to resolve the RefSeq ID for the provided genome record - and updates the `genome_status` object with the resolved ID. It also sets the - `resolve_attempted` flag to `True` to indicate that an attempt to resolve the - RefSeq ID has been made. If the resolution fails, raises a RuntimeError and leaves - the `resolved_refseq_id` empty. - - Args: - genome_status (GenomeStatus): An object representing the status of the genome, - which will be updated with the resolved RefSeq ID. - genome_id_data (Mapping[str, str]): A dictionary containing genome - information, where keys like "RefSeq_accession", "GenBank_accession", - or "JGI_Genome_ID" are used to resolve the RefSeq ID. - - Raises: - RuntimeError: If the RefSeq ID cannot be resolved. - """ - genome_status.resolved_refseq_id = _resolve_refseq_id(genome_id_data) - genome_status.resolve_attempted = True - - if genome_status.resolved_refseq_id == "": - raise RuntimeError("Failed to get genome assembly accession") - - def process_existing_antismash_data(gs_obj: GenomeStatus, extract_root: str | PathLike) -> None: """Processes already downloaded antiSMASH BGC data archive. @@ -398,124 +369,3 @@ def retrieve_antismash_job_data( download_and_extract_from_antismash_api(job_id, antismash_id, download_root, extract_root) Path.touch(extract_path / "completed", exist_ok=True) genome_status.bgc_path = str(download_path) - - -def _resolve_genbank_accession(genbank_id: str) -> str: - """Try to get RefSeq assembly id through given GenBank assembly id. - - Note that GenBank assembly accession starts with "GCA_" and RefSeq assembly - accession starts with "GCF_". For more info, see - https://www.ncbi.nlm.nih.gov/datasets/docs/v2/troubleshooting/faq - - Args: - genbank_id: ID for GenBank assembly accession. - - Raises: - httpx.ReadTimeout: If the request times out. - - Returns: - RefSeq assembly ID if the search is successful, otherwise an empty string. 
- """ - logger.info( - f"Attempting to resolve Genbank assembly accession {genbank_id} to RefSeq accession" - ) - # NCBI Datasets API https://www.ncbi.nlm.nih.gov/datasets/docs/v2/api/ - # Note that there is a API rate limit of 5 requests per second without using an API key - # For more info, see https://www.ncbi.nlm.nih.gov/datasets/docs/v2/troubleshooting/faq/ - - # API for getting revision history of a genome assembly - # For schema, see https://www.ncbi.nlm.nih.gov/datasets/docs/v2/api/rest-api/#get-/genome/accession/-accession-/revision_history - url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{genbank_id}/revision_history" - - refseq_id = "" - try: - resp = httpx.get( - url, headers={"User-Agent": USER_AGENT}, timeout=10.0, follow_redirects=True - ) - resp.raise_for_status() - - data = resp.json() - if not data: - raise ValueError("No Assembly Revision data found") - - assembly_entries = [ - entry for entry in data["assembly_revisions"] if "refseq_accession" in entry - ] - if not assembly_entries: - raise ValueError("No RefSeq assembly accession found") - - latest_entry = max(assembly_entries, key=lambda x: x["release_date"]) - refseq_id = latest_entry["refseq_accession"] - - except httpx.RequestError as exc: - logger.warning(f"An error occurred while requesting {exc.request.url!r}: {exc}") - except httpx.HTTPStatusError as exc: - logger.warning( - f"Error response {exc.response.status_code} while requesting {exc.request.url!r}" - ) - except httpx.ReadTimeout: - logger.warning("Timed out waiting for result of GenBank assembly lookup") - except ValueError as exc: - logger.warning(f"Error while resolving GenBank assembly accession {genbank_id}: {exc}") - - return refseq_id - - -def _resolve_jgi_accession(jgi_id: str) -> str: - """Try to get RefSeq id through given JGI id. - - Args: - jgi_id: JGI_Genome_ID for GenBank accession. - - Returns: - RefSeq ID if search is successful, otherwise an empty string. 
- """ - url = JGI_GENOME_LOOKUP_URL.format(jgi_id) - logger.info(f"Attempting to resolve JGI_Genome_ID {jgi_id} to GenBank accession via {url}") - # no User-Agent header produces a 403 Forbidden error on this site... - try: - resp = httpx.get( - url, headers={"User-Agent": USER_AGENT}, timeout=10.0, follow_redirects=True - ) - except httpx.ReadTimeout: - logger.warning("Timed out waiting for result of JGI_Genome_ID lookup") - return "" - - soup = BeautifulSoup(resp.content, "html.parser") - # Find the table entry giving the "NCBI Assembly Accession" ID - link = soup.find("a", href=re.compile("https://www.ncbi.nlm.nih.gov/datasets/genome/.*")) - if link is None: - return "" - - assembly_id = link.text - # check if the assembly ID is already a RefSeq ID - if assembly_id.startswith("GCF_"): - return assembly_id # type: ignore - else: - return _resolve_genbank_accession(assembly_id) - - -def _resolve_refseq_id(genome_id_data: Mapping[str, str]) -> str: - """Get the RefSeq ID to which the genome accession is linked. - - Check https://pairedomicsdata.bioinformatics.nl/schema.json. - - Args: - genome_id_data: dictionary containing information - for each genome record present. - - Returns: - RefSeq ID if present, otherwise an empty string. 
- """ - if "RefSeq_accession" in genome_id_data: - # best case, can use this directly - return genome_id_data["RefSeq_accession"] - if "GenBank_accession" in genome_id_data: - # resolve via NCBI - return _resolve_genbank_accession(genome_id_data["GenBank_accession"]) - if "JGI_Genome_ID" in genome_id_data: - # resolve via JGI => NCBI - return _resolve_jgi_accession(genome_id_data["JGI_Genome_ID"]) - - logger.warning(f"Unable to resolve genome_ID: {genome_id_data}") - return "" From 4ba85d01ce9bc6f5d8ded67e307f1c6cd2b169cd Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 20 Mar 2025 19:47:03 +0100 Subject: [PATCH 38/48] refactor: change resolved_refseq_id to resolved_id --- .../genomics/antismash/ncbi_downloader.py | 4 +- .../antismash/podp_antismash_downloader.py | 29 +++++++------ src/nplinker/genomics/utils.py | 2 +- .../schemas/genome_status_schema.json | 8 ++-- tests/unit/genomics/test_ncbi_downloader.py | 18 ++++---- .../test_podp_antismash_downloader.py | 42 +++++++++---------- tests/unit/genomics/test_utils.py | 6 +-- .../unit/schemas/test_genome_status_schema.py | 32 +++++++------- 8 files changed, 70 insertions(+), 71 deletions(-) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py index 143102c8..fe016fd5 100644 --- a/src/nplinker/genomics/antismash/ncbi_downloader.py +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -21,10 +21,10 @@ def download_and_extract_ncbi_genome( extract_root: str | PathLike, max_attempts: int = 10, ) -> Path: - """Downloads and extracts an NCBI dataset for a given genome RefSeq ID. + """Downloads and extracts an NCBI dataset for a given genome assembly accession. This function retrieves a dataset from the NCBI database using the provided - RefSeq ID. It retries the download process up to a specified maximum number + genome assembly accession. It retries the download process up to a specified maximum number of attempts in case of errors. 
The function verifies the integrity of the downloaded files using MD5 checksums, extracts the dataset, and renames the GenBank file for easier access. Unnecessary files are removed after successful diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 4bba189b..ef53cbd6 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -37,7 +37,7 @@ class GenomeStatus: def __init__( self, original_id: str, - resolved_refseq_id: str = "", + resolved_id: str = "", resolve_attempted: bool = False, bgc_path: str = "", ): @@ -45,15 +45,14 @@ def __init__( Args: original_id: The original ID of the genome. - resolved_refseq_id: The resolved RefSeq ID of the - genome. Defaults to "". - resolve_attempted: A flag indicating whether an - attempt to resolve the RefSeq ID has been made. Defaults to False. + resolved_id: The resolved genome ID of the genome. Defaults to "". + resolve_attempted: A flag indicating whether an attempt to resolve + the genome ID has been made. Defaults to False. bgc_path: The path to the downloaded BGC file for the genome. Defaults to "". 
""" self.original_id = original_id - self.resolved_refseq_id = "" if resolved_refseq_id == "None" else resolved_refseq_id + self.resolved_id = "" if resolved_id == "None" else resolved_id self.resolve_attempted = resolve_attempted self.bgc_path = bgc_path @@ -118,7 +117,7 @@ def _to_dict(self) -> dict: """Convert the GenomeStatus object to a dict.""" return { "original_id": self.original_id, - "resolved_refseq_id": self.resolved_refseq_id, + "resolved_id": self.resolved_id, "resolve_attempted": self.resolve_attempted, "bgc_path": self.bgc_path, } @@ -192,7 +191,7 @@ def podp_download_and_extract_antismash_data( # resolve genome ID try: - gs.resolved_refseq_id = resolve_genome_accession(genome_record["genome_ID"]) + gs.resolved_id = resolve_genome_accession(genome_record["genome_ID"]) gs.resolve_attempted = True except Exception as e: logger.warning(f"Failed to resolve genome ID {gs.original_id}. Error: {e}") @@ -218,7 +217,7 @@ def podp_download_and_extract_antismash_data( f"genome ID {gs.original_id}." ) genome_path = download_and_extract_ncbi_genome( - gs.resolved_refseq_id, project_download_root, project_extract_root + gs.resolved_id, project_download_root, project_extract_root ) job_id = submit_antismash_job(genome_path) logger.info(f"Waiting for antiSMASH job {job_id} to complete.") @@ -315,12 +314,12 @@ def retrieve_antismash_db_data( """Retrieve antiSMASH database data for a given genome and update its status. This function downloads and extracts antiSMASH data for a genome identified - by its resolved RefSeq ID. It updates the `genome_status` object with the + by its resolved genome ID. It updates the `genome_status` object with the path to the downloaded data or sets it to an empty string if an error occurs. Args: genome_status (GenomeStatus): An object representing the genome's status, - including its resolved RefSeq ID and BGC path. + including its resolved genome ID and BGC path. 
download_root (str | PathLike): The root directory where the antiSMASH data will be downloaded. extract_root (str | PathLike): The root directory where the antiSMASH @@ -329,7 +328,7 @@ def retrieve_antismash_db_data( Raises: Exception: If an error occurs during the download or extraction process. """ - antismash_id = genome_status.resolved_refseq_id + antismash_id = genome_status.resolved_id extract_path = Path(extract_root, "antismash", antismash_id) download_path = Path(download_root, f"{antismash_id}.zip").absolute() @@ -347,13 +346,13 @@ def retrieve_antismash_job_data( """Retrieve antiSMASH API data for a given genome and update its status. This function downloads and extracts antiSMASH data for a genome identified - by its resolved RefSeq ID. It updates the `genome_status` object with the + by its resolved genome ID. It updates the `genome_status` object with the path to the downloaded data or sets it to an empty string if an error occurs. Args: job_id (str): The job ID for the antiSMASH API job. genome_status (GenomeStatus): An object representing the genome's status, - including its resolved RefSeq ID and BGC path. + including its resolved genome ID and BGC path. download_root (str | PathLike): The root directory where the antiSMASH data will be downloaded. extract_root (str | PathLike): The root directory where the antiSMASH @@ -362,7 +361,7 @@ def retrieve_antismash_job_data( Raises: Exception: If an error occurs during the download or extraction process. 
""" - antismash_id = genome_status.resolved_refseq_id + antismash_id = genome_status.resolved_id extract_path = Path(extract_root, "antismash", antismash_id) download_path = Path(download_root, f"{antismash_id}.zip").absolute() diff --git a/src/nplinker/genomics/utils.py b/src/nplinker/genomics/utils.py index ecf6521b..2b723dff 100644 --- a/src/nplinker/genomics/utils.py +++ b/src/nplinker/genomics/utils.py @@ -276,7 +276,7 @@ def extract_mappings_original_genome_id_resolved_genome_id( Generate strain mappings JSON file for PODP pipeline. """ gs_mappings_dict = GenomeStatus.read_json(genome_status_json_file) - return {gs.original_id: gs.resolved_refseq_id for gs in gs_mappings_dict.values()} + return {gs.original_id: gs.resolved_id for gs in gs_mappings_dict.values()} def extract_mappings_resolved_genome_id_bgc_id( diff --git a/src/nplinker/schemas/genome_status_schema.json b/src/nplinker/schemas/genome_status_schema.json index 470c74f4..5de93cbb 100644 --- a/src/nplinker/schemas/genome_status_schema.json +++ b/src/nplinker/schemas/genome_status_schema.json @@ -17,7 +17,7 @@ "type": "object", "required": [ "original_id", - "resolved_refseq_id", + "resolved_id", "resolve_attempted", "bgc_path" ], @@ -28,10 +28,10 @@ "description": "The original ID of the genome", "minLength": 1 }, - "resolved_refseq_id": { + "resolved_id": { "type": "string", - "title": "Resolved RefSeq ID", - "description": "The RefSeq ID that was resolved for this genome" + "title": "Resolved genome ID", + "description": "The genome ID that was resolved for this genome" }, "resolve_attempted": { "type": "boolean", diff --git a/tests/unit/genomics/test_ncbi_downloader.py b/tests/unit/genomics/test_ncbi_downloader.py index dadbe9ea..bc1dda48 100644 --- a/tests/unit/genomics/test_ncbi_downloader.py +++ b/tests/unit/genomics/test_ncbi_downloader.py @@ -15,29 +15,31 @@ def extract_root(tmp_path): def test_download_and_extract_ncbi_genome_success(download_root, extract_root): - refseq_id = 
"GCF_000514775.1" + assembly_accession = "GCF_000514775.1" - genome_path = download_and_extract_ncbi_genome(refseq_id, download_root, extract_root) + genome_path = download_and_extract_ncbi_genome(assembly_accession, download_root, extract_root) - assert genome_path == extract_root / "ncbi_genomes" / f"{refseq_id}.gbff" + assert genome_path == extract_root / "ncbi_genomes" / f"{assembly_accession}.gbff" assert not (extract_root / "ncbi_genomes" / "md5sum.txt").exists() assert not (extract_root / "ncbi_genomes" / "README.md").exists() assert not (extract_root / "ncbi_genomes" / "ncbi_dataset").exists() def test_download_and_extract_ncbi_genome_max_retries(download_root, extract_root): - refseq_id = "GCF_000514775.1" + assembly_accession = "GCF_000514775.1" with patch( "nplinker.genomics.antismash.ncbi_downloader.download_url", side_effect=httpx.ReadTimeout("Download failed"), ): with pytest.raises(httpx.ReadTimeout, match="Maximum download retries"): - download_and_extract_ncbi_genome(refseq_id, download_root, extract_root, max_attempts=1) + download_and_extract_ncbi_genome( + assembly_accession, download_root, extract_root, max_attempts=1 + ) -def test_download_and_extract_ncbi_genome_invalid_refseq_id(download_root, extract_root): - refseq_id = "invalid_ref_seq_id" +def test_download_and_extract_ncbi_genome_invalid_accession(download_root, extract_root): + assembly_accession = "invalid_ref_seq_id" with pytest.raises(ValueError, match="Not a valid genome assembly accession"): - download_and_extract_ncbi_genome(refseq_id, download_root, extract_root) + download_and_extract_ncbi_genome(assembly_accession, download_root, extract_root) diff --git a/tests/unit/genomics/test_podp_antismash_downloader.py b/tests/unit/genomics/test_podp_antismash_downloader.py index 302d354d..e13fc1af 100644 --- a/tests/unit/genomics/test_podp_antismash_downloader.py +++ b/tests/unit/genomics/test_podp_antismash_downloader.py @@ -36,7 +36,7 @@ def genome_status_file(download_root): ) def 
test_genome_status_init(params, expected): gs = GenomeStatus(*params) - assert [gs.original_id, gs.resolved_refseq_id, gs.resolve_attempted, gs.bgc_path] == expected + assert [gs.original_id, gs.resolved_id, gs.resolve_attempted, gs.bgc_path] == expected def test_genome_status_read_json(tmp_path): @@ -44,13 +44,13 @@ def test_genome_status_read_json(tmp_path): "genome_status": [ { "original_id": "genome1", - "resolved_refseq_id": "refseq1", + "resolved_id": "refseq1", "resolve_attempted": True, "bgc_path": "/path/to/bgc1", }, { "original_id": "genome2", - "resolved_refseq_id": "", + "resolved_id": "", "resolve_attempted": False, "bgc_path": "", }, @@ -64,11 +64,11 @@ def test_genome_status_read_json(tmp_path): assert len(genome_status_dict) == 2 assert genome_status_dict["genome1"].original_id == "genome1" - assert genome_status_dict["genome1"].resolved_refseq_id == "refseq1" + assert genome_status_dict["genome1"].resolved_id == "refseq1" assert genome_status_dict["genome1"].resolve_attempted is True assert genome_status_dict["genome1"].bgc_path == "/path/to/bgc1" assert genome_status_dict["genome2"].original_id == "genome2" - assert genome_status_dict["genome2"].resolved_refseq_id == "" + assert genome_status_dict["genome2"].resolved_id == "" assert genome_status_dict["genome2"].resolve_attempted is False assert genome_status_dict["genome2"].bgc_path == "" @@ -86,11 +86,11 @@ def test_genome_status_to_json(tmp_path): assert loaded_data["version"] == "1.0" assert len(loaded_data["genome_status"]) == 2 assert loaded_data["genome_status"][0]["original_id"] == "genome1" - assert loaded_data["genome_status"][0]["resolved_refseq_id"] == "refseq1" + assert loaded_data["genome_status"][0]["resolved_id"] == "refseq1" assert loaded_data["genome_status"][0]["resolve_attempted"] is True assert loaded_data["genome_status"][0]["bgc_path"] == "/path/to/bgc1" assert loaded_data["genome_status"][1]["original_id"] == "genome2" - assert 
loaded_data["genome_status"][1]["resolved_refseq_id"] == "" + assert loaded_data["genome_status"][1]["resolved_id"] == "" assert loaded_data["genome_status"][1]["resolve_attempted"] is False assert loaded_data["genome_status"][1]["bgc_path"] == "" @@ -105,9 +105,9 @@ def test_genome_status_to_json_nofile(): assert isinstance(result, str) assert ( result == '{"genome_status": ' - '[{"original_id": "genome1", "resolved_refseq_id": "refseq1", ' + '[{"original_id": "genome1", "resolved_id": "refseq1", ' '"resolve_attempted": true, "bgc_path": "/path/to/bgc1"}, ' - '{"original_id": "genome2", "resolved_refseq_id": "", ' + '{"original_id": "genome2", "resolved_id": "", ' '"resolve_attempted": false, "bgc_path": ""}], "version": "1.0"}' ) @@ -271,8 +271,8 @@ def test_refseq_id(download_root, extract_root, genome_status_file): genome_status = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status["GCF_000016425.1"] - archive = download_root / Path(str(genome_obj.resolved_refseq_id) + ".zip") - extracted_folder = extract_root / "antismash" / genome_obj.resolved_refseq_id + archive = download_root / Path(str(genome_obj.resolved_id) + ".zip") + extracted_folder = extract_root / "antismash" / genome_obj.resolved_id extracted_files = list_files(extracted_folder, keep_parent=False) assert archive.exists() @@ -298,8 +298,8 @@ def test_genbank_id(download_root, extract_root, genome_status_file): genome_status = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status["GCA_000016425.1"] - archive = download_root / Path(str(genome_obj.resolved_refseq_id) + ".zip") - extracted_folder = extract_root / "antismash" / genome_obj.resolved_refseq_id + archive = download_root / Path(str(genome_obj.resolved_id) + ".zip") + extracted_folder = extract_root / "antismash" / genome_obj.resolved_id extracted_files = list_files(extracted_folder, keep_parent=False) assert archive.exists() @@ -325,8 +325,8 @@ def test_jgi_id(download_root, extract_root, 
genome_status_file): genome_status = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status["640427140"] - archive = download_root / Path(str(genome_obj.resolved_refseq_id) + ".zip") - extracted_folder = extract_root / "antismash" / genome_obj.resolved_refseq_id + archive = download_root / Path(str(genome_obj.resolved_id) + ".zip") + extracted_folder = extract_root / "antismash" / genome_obj.resolved_id extracted_files = list_files(extracted_folder, keep_parent=False) assert archive.exists() @@ -357,8 +357,8 @@ def test_refseq_jgi_id(download_root, extract_root, genome_status_file): genome_status = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status["GCF_000016425.1"] - archive = download_root / Path(str(genome_obj.resolved_refseq_id) + ".zip") - extracted_folder = extract_root / "antismash" / genome_obj.resolved_refseq_id + archive = download_root / Path(str(genome_obj.resolved_id) + ".zip") + extracted_folder = extract_root / "antismash" / genome_obj.resolved_id extracted_files = list_files(extracted_folder, keep_parent=False) assert archive.exists() @@ -389,8 +389,8 @@ def test_refseq_genbank_id(download_root, extract_root, genome_status_file): genome_status = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status["GCF_000016425.1"] - archive = download_root / Path(str(genome_obj.resolved_refseq_id) + ".zip") - extracted_folder = extract_root / "antismash" / genome_obj.resolved_refseq_id + archive = download_root / Path(str(genome_obj.resolved_id) + ".zip") + extracted_folder = extract_root / "antismash" / genome_obj.resolved_id extracted_files = list_files(extracted_folder, keep_parent=False) assert archive.exists() @@ -421,8 +421,8 @@ def test_genbank_jgi_id(download_root, extract_root, genome_status_file): genome_status = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status["GCA_000016425.1"] - archive = download_root / Path(str(genome_obj.resolved_refseq_id) + ".zip") - extracted_folder = 
extract_root / "antismash" / genome_obj.resolved_refseq_id + archive = download_root / Path(str(genome_obj.resolved_id) + ".zip") + extracted_folder = extract_root / "antismash" / genome_obj.resolved_id extracted_files = list_files(extracted_folder, keep_parent=False) assert archive.exists() diff --git a/tests/unit/genomics/test_utils.py b/tests/unit/genomics/test_utils.py index 8a17aa69..9d1ff784 100644 --- a/tests/unit/genomics/test_utils.py +++ b/tests/unit/genomics/test_utils.py @@ -192,19 +192,19 @@ def test_extract_mappings_original_genome_id_resolved_genome_id(tmp_path): "genome_status": [ { "original_id": "id1", - "resolved_refseq_id": "refseq1", + "resolved_id": "refseq1", "resolve_attempted": True, "bgc_path": "", }, { "original_id": "id2", - "resolved_refseq_id": "refseq2", + "resolved_id": "refseq2", "resolve_attempted": True, "bgc_path": "", }, { "original_id": "id3", - "resolved_refseq_id": "refseq3", + "resolved_id": "refseq3", "resolve_attempted": True, "bgc_path": "", }, diff --git a/tests/unit/schemas/test_genome_status_schema.py b/tests/unit/schemas/test_genome_status_schema.py index de0679a5..6bd3401b 100644 --- a/tests/unit/schemas/test_genome_status_schema.py +++ b/tests/unit/schemas/test_genome_status_schema.py @@ -10,9 +10,7 @@ data_empty_genome_status = {"genome_status": [], "version": "1.0"} data_no_original_id = { - "genome_status": [ - {"resolved_refseq_id": "id1_refseq", "resolve_attempted": True, "bgc_path": ""} - ], + "genome_status": [{"resolved_id": "id1_refseq", "resolve_attempted": True, "bgc_path": ""}], "version": "1.0", } @@ -20,7 +18,7 @@ "genome_status": [ { "original_id": "", - "resolved_refseq_id": "id1_refseq", + "resolved_id": "id1_refseq", "resolve_attempted": True, "bgc_path": "", } @@ -32,7 +30,7 @@ "genome_status": [ { "original_id": 1, - "resolved_refseq_id": "id1_refseq", + "resolved_id": "id1_refseq", "resolve_attempted": True, "bgc_path": "", } @@ -40,20 +38,20 @@ "version": "1.0", } -data_no_resolved_refseq_id = 
{ +data_no_resolved_id = { "genome_status": [{"original_id": "id1", "resolve_attempted": True, "bgc_path": ""}], "version": "1.0", } -data_invalid_resolved_refseq_id = { +data_invalid_resolved_id = { "genome_status": [ - {"original_id": "id1", "resolved_refseq_id": 1, "resolve_attempted": True, "bgc_path": ""} + {"original_id": "id1", "resolved_id": 1, "resolve_attempted": True, "bgc_path": ""} ], "version": "1.0", } data_no_resolve_attempted = { - "genome_status": [{"original_id": "id1", "resolved_refseq_id": "id1_refseq", "bgc_path": ""}], + "genome_status": [{"original_id": "id1", "resolved_id": "id1_refseq", "bgc_path": ""}], "version": "1.0", } @@ -61,7 +59,7 @@ "genome_status": [ { "original_id": "id1", - "resolved_refseq_id": "id1_refseq", + "resolved_id": "id1_refseq", "resolve_attempted": 1, "bgc_path": "", } @@ -71,7 +69,7 @@ data_no_bgc_path = { "genome_status": [ - {"original_id": "id1", "resolved_refseq_id": "id1_refseq", "resolve_attempted": True} + {"original_id": "id1", "resolved_id": "id1_refseq", "resolve_attempted": True} ], "version": "1.0", } @@ -80,7 +78,7 @@ "genome_status": [ { "original_id": "id1", - "resolved_refseq_id": "id1_refseq", + "resolved_id": "id1_refseq", "resolve_attempted": True, "bgc_path": 1, } @@ -94,7 +92,7 @@ data_empty_version = { "genome_status": [{"strain_id": "strain1", "strain_alias": ["alias1", "alias2"]}], - "version": "" "", + "version": "", } data_invalid_version = { @@ -112,8 +110,8 @@ [data_no_original_id, "'original_id' is a required property"], [data_empty_original_id, "'' should be non-empty"], [data_invalid_original_id, "1 is not of type 'string'"], - [data_no_resolved_refseq_id, "'resolved_refseq_id' is a required property"], - [data_invalid_resolved_refseq_id, "1 is not of type 'string'"], + [data_no_resolved_id, "'resolved_id' is a required property"], + [data_invalid_resolved_id, "1 is not of type 'string'"], [data_no_resolve_attempted, "'resolve_attempted' is a required property"], 
[data_invalid_resolve_attempted, "1 is not of type 'boolean'"], [data_no_bgc_path, "'bgc_path' is a required property"], @@ -135,13 +133,13 @@ def test_valid_data(): "genome_status": [ { "original_id": "id1", - "resolved_refseq_id": "id1_refseq", + "resolved_id": "id1_refseq", "resolve_attempted": True, "bgc_path": "", }, { "original_id": "id2", - "resolved_refseq_id": "id2_refseq", + "resolved_id": "id2_refseq", "resolve_attempted": False, "bgc_path": "", }, From 398d700eddf80df7be50247a7568eaae3dc5b67e Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 20 Mar 2025 19:51:03 +0100 Subject: [PATCH 39/48] fix: skip antiSMASH DB retrieval for non refseq ids --- .../genomics/antismash/podp_antismash_downloader.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index ef53cbd6..3e409195 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -328,6 +328,12 @@ def retrieve_antismash_db_data( Raises: Exception: If an error occurs during the download or extraction process. """ + if not genome_status.resolved_id.startswith("GCF_"): + raise ValueError( + f"Resolved genome ID '{genome_status.resolved_id}' is not a valid RefSeq assembly and " + "antiSMASH-DB only contains results for RefSeq assemblies." 
+ ) + antismash_id = genome_status.resolved_id extract_path = Path(extract_root, "antismash", antismash_id) download_path = Path(download_root, f"{antismash_id}.zip").absolute() From a2e6071ad7b7760d6a503ca21d0ebddb0c043cd3 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 20 Mar 2025 20:59:07 +0100 Subject: [PATCH 40/48] refactor: change resolve_attempted to failed_previously in GenomeStatus --- .../antismash/podp_antismash_downloader.py | 15 +++++----- .../schemas/genome_status_schema.json | 8 +++--- .../test_podp_antismash_downloader.py | 20 ++++++------- tests/unit/genomics/test_utils.py | 6 ++-- .../unit/schemas/test_genome_status_schema.py | 28 +++++++++---------- 5 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 3e409195..7fa2db7c 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -38,7 +38,7 @@ def __init__( self, original_id: str, resolved_id: str = "", - resolve_attempted: bool = False, + failed_previously: bool = False, bgc_path: str = "", ): """Initialize a GenomeStatus object for the given genome. @@ -46,14 +46,14 @@ def __init__( Args: original_id: The original ID of the genome. resolved_id: The resolved genome ID of the genome. Defaults to "". - resolve_attempted: A flag indicating whether an attempt to resolve - the genome ID has been made. Defaults to False. + failed_previously: Indicates whether a previous attempt to get BGC data + for the genome has failed. Defaults to False. bgc_path: The path to the downloaded BGC file for the genome. Defaults to "". 
""" self.original_id = original_id self.resolved_id = "" if resolved_id == "None" else resolved_id - self.resolve_attempted = resolve_attempted + self.failed_previously = failed_previously self.bgc_path = bgc_path @staticmethod @@ -118,7 +118,7 @@ def _to_dict(self) -> dict: return { "original_id": self.original_id, "resolved_id": self.resolved_id, - "resolve_attempted": self.resolve_attempted, + "failed_previously": self.failed_previously, "bgc_path": self.bgc_path, } @@ -185,16 +185,16 @@ def podp_download_and_extract_antismash_data( gs.bgc_path = "" # Reset bgc path # Check if a previous attempt to get bgc data has failed - if gs.resolve_attempted: + if gs.failed_previously: logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt") continue # resolve genome ID try: gs.resolved_id = resolve_genome_accession(genome_record["genome_ID"]) - gs.resolve_attempted = True except Exception as e: logger.warning(f"Failed to resolve genome ID {gs.original_id}. Error: {e}") + gs.failed_previously = True continue # retrieve antismash BGC data from antiSMASH-DB @@ -236,6 +236,7 @@ def podp_download_and_extract_antismash_data( if gs.bgc_path == "": logger.warning(f"Failed to retrieve BGC data for genome ID {gs.original_id}.") + gs.failed_previously = True # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] diff --git a/src/nplinker/schemas/genome_status_schema.json b/src/nplinker/schemas/genome_status_schema.json index 5de93cbb..b3dbdef7 100644 --- a/src/nplinker/schemas/genome_status_schema.json +++ b/src/nplinker/schemas/genome_status_schema.json @@ -18,7 +18,7 @@ "required": [ "original_id", "resolved_id", - "resolve_attempted", + "failed_previously", "bgc_path" ], "properties": { @@ -33,10 +33,10 @@ "title": "Resolved genome ID", "description": "The genome ID that was resolved for this genome" }, - "resolve_attempted": { + "failed_previously": { "type": "boolean", - "title": 
"Resolve Attempted", - "description": "Whether or not an attempt was made to resolve this genome" + "title": "Failed Attempt", + "description": "Indicates if a previous attempt to retrieve BGC data for this genome was unsuccessful" }, "bgc_path": { "type": "string", diff --git a/tests/unit/genomics/test_podp_antismash_downloader.py b/tests/unit/genomics/test_podp_antismash_downloader.py index e13fc1af..ff137b0a 100644 --- a/tests/unit/genomics/test_podp_antismash_downloader.py +++ b/tests/unit/genomics/test_podp_antismash_downloader.py @@ -36,7 +36,7 @@ def genome_status_file(download_root): ) def test_genome_status_init(params, expected): gs = GenomeStatus(*params) - assert [gs.original_id, gs.resolved_id, gs.resolve_attempted, gs.bgc_path] == expected + assert [gs.original_id, gs.resolved_id, gs.failed_previously, gs.bgc_path] == expected def test_genome_status_read_json(tmp_path): @@ -45,13 +45,13 @@ def test_genome_status_read_json(tmp_path): { "original_id": "genome1", "resolved_id": "refseq1", - "resolve_attempted": True, + "failed_previously": True, "bgc_path": "/path/to/bgc1", }, { "original_id": "genome2", "resolved_id": "", - "resolve_attempted": False, + "failed_previously": False, "bgc_path": "", }, ], @@ -65,11 +65,11 @@ def test_genome_status_read_json(tmp_path): assert len(genome_status_dict) == 2 assert genome_status_dict["genome1"].original_id == "genome1" assert genome_status_dict["genome1"].resolved_id == "refseq1" - assert genome_status_dict["genome1"].resolve_attempted is True + assert genome_status_dict["genome1"].failed_previously is True assert genome_status_dict["genome1"].bgc_path == "/path/to/bgc1" assert genome_status_dict["genome2"].original_id == "genome2" assert genome_status_dict["genome2"].resolved_id == "" - assert genome_status_dict["genome2"].resolve_attempted is False + assert genome_status_dict["genome2"].failed_previously is False assert genome_status_dict["genome2"].bgc_path == "" @@ -87,11 +87,11 @@ def 
test_genome_status_to_json(tmp_path): assert len(loaded_data["genome_status"]) == 2 assert loaded_data["genome_status"][0]["original_id"] == "genome1" assert loaded_data["genome_status"][0]["resolved_id"] == "refseq1" - assert loaded_data["genome_status"][0]["resolve_attempted"] is True + assert loaded_data["genome_status"][0]["failed_previously"] is True assert loaded_data["genome_status"][0]["bgc_path"] == "/path/to/bgc1" assert loaded_data["genome_status"][1]["original_id"] == "genome2" assert loaded_data["genome_status"][1]["resolved_id"] == "" - assert loaded_data["genome_status"][1]["resolve_attempted"] is False + assert loaded_data["genome_status"][1]["failed_previously"] is False assert loaded_data["genome_status"][1]["bgc_path"] == "" @@ -106,9 +106,9 @@ def test_genome_status_to_json_nofile(): assert ( result == '{"genome_status": ' '[{"original_id": "genome1", "resolved_id": "refseq1", ' - '"resolve_attempted": true, "bgc_path": "/path/to/bgc1"}, ' + '"failed_previously": true, "bgc_path": "/path/to/bgc1"}, ' '{"original_id": "genome2", "resolved_id": "", ' - '"resolve_attempted": false, "bgc_path": ""}], "version": "1.0"}' + '"failed_previously": false, "bgc_path": ""}], "version": "1.0"}' ) @@ -215,7 +215,7 @@ def test_caching(download_root, extract_root, genome_status_file, caplog): genome_status_old = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status_old["GCF_000016425.1"] assert Path(genome_obj.bgc_path).exists() - assert genome_obj.resolve_attempted + assert genome_obj.failed_previously podp_download_and_extract_antismash_data(genome_records, download_root, extract_root) assert ( f"antiSMASH BGC data for genome ID {genome_obj.original_id} already downloaded to {genome_obj.bgc_path}" diff --git a/tests/unit/genomics/test_utils.py b/tests/unit/genomics/test_utils.py index 9d1ff784..7d68eb66 100644 --- a/tests/unit/genomics/test_utils.py +++ b/tests/unit/genomics/test_utils.py @@ -193,19 +193,19 @@ def 
test_extract_mappings_original_genome_id_resolved_genome_id(tmp_path): { "original_id": "id1", "resolved_id": "refseq1", - "resolve_attempted": True, + "failed_previously": True, "bgc_path": "", }, { "original_id": "id2", "resolved_id": "refseq2", - "resolve_attempted": True, + "failed_previously": True, "bgc_path": "", }, { "original_id": "id3", "resolved_id": "refseq3", - "resolve_attempted": True, + "failed_previously": True, "bgc_path": "", }, ], diff --git a/tests/unit/schemas/test_genome_status_schema.py b/tests/unit/schemas/test_genome_status_schema.py index 6bd3401b..ddde0fa9 100644 --- a/tests/unit/schemas/test_genome_status_schema.py +++ b/tests/unit/schemas/test_genome_status_schema.py @@ -10,7 +10,7 @@ data_empty_genome_status = {"genome_status": [], "version": "1.0"} data_no_original_id = { - "genome_status": [{"resolved_id": "id1_refseq", "resolve_attempted": True, "bgc_path": ""}], + "genome_status": [{"resolved_id": "id1_refseq", "failed_previously": True, "bgc_path": ""}], "version": "1.0", } @@ -19,7 +19,7 @@ { "original_id": "", "resolved_id": "id1_refseq", - "resolve_attempted": True, + "failed_previously": True, "bgc_path": "", } ], @@ -31,7 +31,7 @@ { "original_id": 1, "resolved_id": "id1_refseq", - "resolve_attempted": True, + "failed_previously": True, "bgc_path": "", } ], @@ -39,28 +39,28 @@ } data_no_resolved_id = { - "genome_status": [{"original_id": "id1", "resolve_attempted": True, "bgc_path": ""}], + "genome_status": [{"original_id": "id1", "failed_previously": True, "bgc_path": ""}], "version": "1.0", } data_invalid_resolved_id = { "genome_status": [ - {"original_id": "id1", "resolved_id": 1, "resolve_attempted": True, "bgc_path": ""} + {"original_id": "id1", "resolved_id": 1, "failed_previously": True, "bgc_path": ""} ], "version": "1.0", } -data_no_resolve_attempted = { +data_no_failed_previously = { "genome_status": [{"original_id": "id1", "resolved_id": "id1_refseq", "bgc_path": ""}], "version": "1.0", } 
-data_invalid_resolve_attempted = { +data_invalid_failed_previously = { "genome_status": [ { "original_id": "id1", "resolved_id": "id1_refseq", - "resolve_attempted": 1, + "failed_previously": 1, "bgc_path": "", } ], @@ -69,7 +69,7 @@ data_no_bgc_path = { "genome_status": [ - {"original_id": "id1", "resolved_id": "id1_refseq", "resolve_attempted": True} + {"original_id": "id1", "resolved_id": "id1_refseq", "failed_previously": True} ], "version": "1.0", } @@ -79,7 +79,7 @@ { "original_id": "id1", "resolved_id": "id1_refseq", - "resolve_attempted": True, + "failed_previously": True, "bgc_path": 1, } ], @@ -112,8 +112,8 @@ [data_invalid_original_id, "1 is not of type 'string'"], [data_no_resolved_id, "'resolved_id' is a required property"], [data_invalid_resolved_id, "1 is not of type 'string'"], - [data_no_resolve_attempted, "'resolve_attempted' is a required property"], - [data_invalid_resolve_attempted, "1 is not of type 'boolean'"], + [data_no_failed_previously, "'failed_previously' is a required property"], + [data_invalid_failed_previously, "1 is not of type 'boolean'"], [data_no_bgc_path, "'bgc_path' is a required property"], [data_invalid_bgc_path, "1 is not of type 'string'"], [data_no_version, "'version' is a required property"], @@ -134,13 +134,13 @@ def test_valid_data(): { "original_id": "id1", "resolved_id": "id1_refseq", - "resolve_attempted": True, + "failed_previously": True, "bgc_path": "", }, { "original_id": "id2", "resolved_id": "id2_refseq", - "resolve_attempted": False, + "failed_previously": False, "bgc_path": "", }, ], From 83876504df33244d0f044ee33fe394868a4a27bd Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 20 Mar 2025 21:00:01 +0100 Subject: [PATCH 41/48] feat: save updated genome status to json after each genome --- .../genomics/antismash/podp_antismash_downloader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py 
b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 7fa2db7c..0a63b843 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -187,6 +187,7 @@ def podp_download_and_extract_antismash_data( # Check if a previous attempt to get bgc data has failed if gs.failed_previously: logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt") + GenomeStatus.to_json(gs_dict, gs_file) continue # resolve genome ID @@ -195,6 +196,7 @@ def podp_download_and_extract_antismash_data( except Exception as e: logger.warning(f"Failed to resolve genome ID {gs.original_id}. Error: {e}") gs.failed_previously = True + GenomeStatus.to_json(gs_dict, gs_file) continue # retrieve antismash BGC data from antiSMASH-DB @@ -203,6 +205,7 @@ def podp_download_and_extract_antismash_data( logger.info( f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" ) + GenomeStatus.to_json(gs_dict, gs_file) continue except Exception as e: logger.info( @@ -227,6 +230,7 @@ def podp_download_and_extract_antismash_data( logger.info( f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" ) + GenomeStatus.to_json(gs_dict, gs_file) continue except Exception as e: logger.info( @@ -237,6 +241,7 @@ def podp_download_and_extract_antismash_data( if gs.bgc_path == "": logger.warning(f"Failed to retrieve BGC data for genome ID {gs.original_id}.") gs.failed_previously = True + GenomeStatus.to_json(gs_dict, gs_file) # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] @@ -247,9 +252,6 @@ def podp_download_and_extract_antismash_data( logger.warning(warning_message) warnings.warn(warning_message, UserWarning) - # save updated genome status to json file - GenomeStatus.to_json(gs_dict, gs_file) - if len(failed_ids) == len(genome_records): raise ValueError("No antiSMASH data found for any 
genome") From 377981500aeac7e8aa1c3549d499991993f65618 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 20 Mar 2025 21:11:20 +0100 Subject: [PATCH 42/48] fix: assert failed_previously is False in caching test --- tests/unit/genomics/test_podp_antismash_downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/genomics/test_podp_antismash_downloader.py b/tests/unit/genomics/test_podp_antismash_downloader.py index ff137b0a..6be23f7f 100644 --- a/tests/unit/genomics/test_podp_antismash_downloader.py +++ b/tests/unit/genomics/test_podp_antismash_downloader.py @@ -215,7 +215,7 @@ def test_caching(download_root, extract_root, genome_status_file, caplog): genome_status_old = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status_old["GCF_000016425.1"] assert Path(genome_obj.bgc_path).exists() - assert genome_obj.failed_previously + assert genome_obj.failed_previously is False podp_download_and_extract_antismash_data(genome_records, download_root, extract_root) assert ( f"antiSMASH BGC data for genome ID {genome_obj.original_id} already downloaded to {genome_obj.bgc_path}" From a57994562db78cdc36584a2286baf75c069c8388 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 20 Mar 2025 21:52:41 +0100 Subject: [PATCH 43/48] refactor: remove unneccessary if statement --- .../genomics/antismash/podp_antismash_downloader.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 0a63b843..18d8d6fb 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -238,10 +238,9 @@ def podp_download_and_extract_antismash_data( f"Error: {e}" ) - if gs.bgc_path == "": - logger.warning(f"Failed to retrieve BGC data for genome ID {gs.original_id}.") - gs.failed_previously = True - GenomeStatus.to_json(gs_dict, 
gs_file) + logger.warning(f"Failed to retrieve BGC data for genome ID {gs.original_id}.") + gs.failed_previously = True + GenomeStatus.to_json(gs_dict, gs_file) # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] From 092c94a8ff914f70824eea7d0115da25b3a92c4c Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Fri, 21 Mar 2025 12:05:01 +0100 Subject: [PATCH 44/48] update type hint for genome_id_data to comply with mypy --- src/nplinker/genomics/antismash/genome_accession_resolver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nplinker/genomics/antismash/genome_accession_resolver.py b/src/nplinker/genomics/antismash/genome_accession_resolver.py index 95fb1364..e30433c3 100644 --- a/src/nplinker/genomics/antismash/genome_accession_resolver.py +++ b/src/nplinker/genomics/antismash/genome_accession_resolver.py @@ -3,6 +3,7 @@ from typing import Any from typing import Callable from typing import Literal +from typing import Mapping import httpx from bs4 import BeautifulSoup @@ -45,7 +46,7 @@ def get_latest_assembly_accession(acc: str) -> str: raise ValueError("No valid genome accession found in assembly revision history") -def resolve_genome_accession(genome_id_data: dict) -> str: +def resolve_genome_accession(genome_id_data: Mapping[Any, Any]) -> str: """Gets the NCBI genome assembly accession. 
Gets the latest RefSeq genome assembly accession, or if not available, From 47f561885123496718da2ce3dc1de8166a16d970 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Fri, 21 Mar 2025 11:56:02 +0100 Subject: [PATCH 45/48] check if BGC data already downloaded --- .../antismash/podp_antismash_downloader.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 18d8d6fb..737bd2bd 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -199,6 +199,24 @@ def podp_download_and_extract_antismash_data( GenomeStatus.to_json(gs_dict, gs_file) continue + # check if the antismash BGC data is already in the downloads + bgc_path = Path(project_download_root, f"{gs.resolved_id}.zip").absolute() + if bgc_path.exists(): + logger.info( + f"antiSMASH BGC data for genome ID {original_genome_id} already downloaded to " + f"{bgc_path}" + ) + try: + process_existing_antismash_data(gs, project_extract_root) + gs.bgc_path = str(bgc_path) + GenomeStatus.to_json(gs_dict, gs_file) + continue + except Exception as e: + logger.warning( + "Failed to process existing antiSMASH BGC data for genome ID " + f"{original_genome_id}. 
Error: {e}" + ) + # retrieve antismash BGC data from antiSMASH-DB try: retrieve_antismash_db_data(gs, project_download_root, project_extract_root) From 91020c9a4e9b2f00c429e40e83c22a63eef1a267 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Fri, 21 Mar 2025 11:59:19 +0100 Subject: [PATCH 46/48] fix: correct spelling of antiSMASH in logging and comments --- .../genomics/antismash/podp_antismash_downloader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 737bd2bd..9e4ff5ac 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -157,7 +157,7 @@ def podp_download_and_extract_antismash_data( for i, genome_record in enumerate(genome_records): logger.info( - f"Getting antismash BGC data for genome record {i + 1} of {len(genome_records)}." + f"Getting antiSMASH BGC data for genome record {i + 1} of {len(genome_records)}." ) # get the best available genome ID from the dict @@ -199,7 +199,7 @@ def podp_download_and_extract_antismash_data( GenomeStatus.to_json(gs_dict, gs_file) continue - # check if the antismash BGC data is already in the downloads + # check if the antiSMASH BGC data is already in the downloads bgc_path = Path(project_download_root, f"{gs.resolved_id}.zip").absolute() if bgc_path.exists(): logger.info( @@ -217,7 +217,7 @@ def podp_download_and_extract_antismash_data( f"{original_genome_id}. 
Error: {e}" ) - # retrieve antismash BGC data from antiSMASH-DB + # retrieve antiSMASH BGC data from antiSMASH-DB try: retrieve_antismash_db_data(gs, project_download_root, project_extract_root) logger.info( From bf6b6c54cdab8dca915f8b30f3ddd0d6986cb3a3 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Fri, 21 Mar 2025 12:27:48 +0100 Subject: [PATCH 47/48] fix: add bgc_path to genome status to ensure correct extraction path --- src/nplinker/genomics/antismash/podp_antismash_downloader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 9e4ff5ac..d36b5a77 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -207,8 +207,8 @@ def podp_download_and_extract_antismash_data( f"{bgc_path}" ) try: - process_existing_antismash_data(gs, project_extract_root) gs.bgc_path = str(bgc_path) + process_existing_antismash_data(gs, project_extract_root) GenomeStatus.to_json(gs_dict, gs_file) continue except Exception as e: @@ -216,6 +216,7 @@ def podp_download_and_extract_antismash_data( "Failed to process existing antiSMASH BGC data for genome ID " f"{original_genome_id}. 
Error: {e}" ) + gs.bgc_path = "" # retrieve antiSMASH BGC data from antiSMASH-DB try: From f197fef272685d93bfbc22c2be8d61324b17b172 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Fri, 21 Mar 2025 12:29:52 +0100 Subject: [PATCH 48/48] refactor: use original genome ID from the genome status object for consistency --- .../genomics/antismash/podp_antismash_downloader.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index d36b5a77..b630b963 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -171,7 +171,7 @@ def podp_download_and_extract_antismash_data( # Check if genomes already have antiSMASH BGC data if gs.bgc_path and Path(gs.bgc_path).exists(): logger.info( - f"antiSMASH BGC data for genome ID {original_genome_id} already downloaded to " + f"antiSMASH BGC data for genome ID {gs.original_id} already downloaded to " f"{gs.bgc_path}" ) try: @@ -180,13 +180,13 @@ def podp_download_and_extract_antismash_data( except Exception as e: logger.warning( "Failed to process existing antiSMASH BGC data for genome ID " - f"{original_genome_id}. Error: {e}" + f"{gs.original_id}. 
Error: {e}" ) gs.bgc_path = "" # Reset bgc path # Check if a previous attempt to get bgc data has failed if gs.failed_previously: - logger.info(f"Genome ID {original_genome_id} skipped due to previous failed attempt") + logger.info(f"Genome ID {gs.original_id} skipped due to previous failed attempt") GenomeStatus.to_json(gs_dict, gs_file) continue @@ -203,7 +203,7 @@ def podp_download_and_extract_antismash_data( bgc_path = Path(project_download_root, f"{gs.resolved_id}.zip").absolute() if bgc_path.exists(): logger.info( - f"antiSMASH BGC data for genome ID {original_genome_id} already downloaded to " + f"antiSMASH BGC data for genome ID {gs.original_id} already downloaded to " f"{bgc_path}" ) try: @@ -214,7 +214,7 @@ def podp_download_and_extract_antismash_data( except Exception as e: logger.warning( "Failed to process existing antiSMASH BGC data for genome ID " - f"{original_genome_id}. Error: {e}" + f"{gs.original_id}. Error: {e}" ) gs.bgc_path = ""