Changes from all commits
50 commits
881e3eb
Merge pull request #4 from NPLinker/dev
liannette Mar 18, 2025
2430868
Refactor: Simplify Path handling
liannette Mar 18, 2025
1f07246
Refactor: Improve extract path handling to ensure that non-empty dirs…
liannette Mar 18, 2025
8d7238c
Refactor: Extract file cleanup logic into a separate function for bet…
liannette Mar 18, 2025
d5e64eb
Refactor: Move extract_path preparation logic into a seperate functio…
liannette Mar 18, 2025
66f13d6
Refactor: Separate genome assembly resolution and antiSMASH data retr…
liannette Mar 18, 2025
706f841
Refactor: Improve genome ID handling and logging
liannette Mar 18, 2025
b25506e
Refactor: Move logging for antiSMASH data retrieval errors and succes…
liannette Mar 18, 2025
2ed8ac6
simplify comment
liannette Mar 18, 2025
a1e4962
Enhance logging messages in antiSMASH data retrieval
liannette Mar 18, 2025
e8dd391
test: adapt test to changed logging info message
liannette Mar 18, 2025
ea159ca
Feat: Add antiSMASH API functionality
liannette Mar 19, 2025
a972a70
Feat: Add antiSMASH API functionality
liannette Mar 19, 2025
0569976
Merge branch 'feature/antismash-jobs' of https://github.com/liannette…
liannette Mar 19, 2025
f3a159f
fix: improve logging message for start of antiSMASH API process
liannette Mar 19, 2025
da4da3c
docs: improve docstring for download_and_extract_ncbi_genome function
liannette Mar 19, 2025
af094e4
fix: update logging messages for antiSMASH data retrieval failures
liannette Mar 19, 2025
99d99a2
add logging after antiSMASH job submission
liannette Mar 19, 2025
1c83934
refactor: rename refseq_id to genome_assembly_acc
liannette Mar 19, 2025
496db38
improve genome download process with validation and retry logic
liannette Mar 19, 2025
cc6573c
test: add unit tests for download_and_extract_ncbi_genome function
liannette Mar 19, 2025
51e4817
refactor: rename verify_ncbi_dataset_md5_sums to _verify_ncbi_dataset…
liannette Mar 19, 2025
6b564ea
refactor: move _verify_ncbi_dataset_md5_sums function to a new locati…
liannette Mar 19, 2025
7570852
feat: handle already download antiSMASH results
liannette Mar 19, 2025
9881254
fix mistake in docstring
liannette Mar 19, 2025
0e1ec54
fix: update return type of submit_antismash_job to str
liannette Mar 19, 2025
f53f1a7
fix: update return type of download_and_extract_ncbi_genome to Path
liannette Mar 19, 2025
5e5b2bd
update submit_antismash_job to return job ID as string and improve er…
liannette Mar 19, 2025
909ca63
chore: add types-requests to development dependencies
liannette Mar 19, 2025
252a177
fix: update return type of _verify_ncbi_dataset_md5_sums to None and …
liannette Mar 19, 2025
81a0102
fix: update _verify_ncbi_dataset_md5_sums to accept str or PathLike f…
liannette Mar 19, 2025
50890a7
fix: convert extract_path to Path in _prepare_extract_path for consis…
liannette Mar 19, 2025
b784476
chore: update typing dependencies in format-typing-check workflow
liannette Mar 19, 2025
bdebd96
fix: clarify return value documentation for submit_antismash_job func…
liannette Mar 19, 2025
7334c90
fix: enable postponed evaluation of type annotations in ncbi_download…
liannette Mar 19, 2025
2f0d074
feat: add genome accession resolver for NCBI assembly accessions
liannette Mar 20, 2025
5d13fea
test: add unit tests for genome accession resolver functions
liannette Mar 20, 2025
5a201cc
fix: ensure no mypy typing errors
liannette Mar 20, 2025
a8ed146
feat: use the new genome ID resolver
liannette Mar 20, 2025
4ba85d0
refactor: change resolved_refseq_id to resolved_id
liannette Mar 20, 2025
398d700
fix: skip antiSMASH DB retrieval for non refseq ids
liannette Mar 20, 2025
a2e6071
refactor: change resolve_attempted to failed_previously in GenomeStatus
liannette Mar 20, 2025
8387650
feat: save updated genome status to json after each genome
liannette Mar 20, 2025
3779815
fix: assert failed_previously is False in caching test
liannette Mar 20, 2025
a579945
refactor: remove unneccessary if statement
liannette Mar 20, 2025
092c94a
update type hint for genome_id_data to comply with mypy
liannette Mar 21, 2025
47f5618
check if BGC data already downloaded
liannette Mar 21, 2025
91020c9
fix: correct spelling of antiSMASH in logging and comments
liannette Mar 21, 2025
bf6b6c5
fix: add bgc_path to genome status to ensure correct extraction path
liannette Mar 21, 2025
f197fef
refactor: use original genome ID from the genome status object for co…
liannette Mar 21, 2025
4 changes: 2 additions & 2 deletions .github/workflows/format-typing-check.yml
@@ -37,8 +37,8 @@ jobs:
       - name: Install ruff and mypy
         run: |
           pip install ruff mypy typing_extensions \
-            types-Deprecated types-beautifulsoup4 types-jsonschema \
-            types-networkx types-tabulate types-PyYAML pandas-stubs
+            types-Deprecated types-beautifulsoup4 types-jsonschema types-requests \
+            types-networkx types-tabulate types-PyYAML pandas-stubs
       - name: Get all changed python files
         id: changed-python-files
         uses: tj-actions/changed-files@v44
1 change: 1 addition & 0 deletions pyproject.toml
@@ -58,6 +58,7 @@ dev = [
     "mypy",
     "typing_extensions",
     # stub packages. Update the `format-typing-check.yml` too if you add more.
+    "types-requests",
     "types-beautifulsoup4",
     "types-jsonschema",
     "types-networkx",
16 changes: 14 additions & 2 deletions src/nplinker/genomics/antismash/__init__.py
@@ -1,16 +1,28 @@
-from .antismash_downloader import download_and_extract_antismash_data
+from .antismash_api_client import antismash_job_is_done
+from .antismash_api_client import submit_antismash_job
+from .antismash_downloader import download_and_extract_from_antismash_api
+from .antismash_downloader import download_and_extract_from_antismash_db
+from .antismash_downloader import extract_antismash_data
 from .antismash_loader import AntismashBGCLoader
 from .antismash_loader import parse_bgc_genbank
+from .genome_accession_resolver import resolve_genome_accession
+from .ncbi_downloader import download_and_extract_ncbi_genome
 from .podp_antismash_downloader import GenomeStatus
 from .podp_antismash_downloader import get_best_available_genome_id
 from .podp_antismash_downloader import podp_download_and_extract_antismash_data


 __all__ = [
-    "download_and_extract_antismash_data",
+    "extract_antismash_data",
+    "resolve_genome_accession",
+    "download_and_extract_from_antismash_api",
+    "download_and_extract_from_antismash_db",
     "AntismashBGCLoader",
     "parse_bgc_genbank",
     "GenomeStatus",
     "get_best_available_genome_id",
     "podp_download_and_extract_antismash_data",
+    "download_and_extract_ncbi_genome",
+    "submit_antismash_job",
+    "antismash_job_is_done",
 ]
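To make the expanded public API concrete, here is a sketch of a DB-first, API-fallback flow built only from the names exported above. The real orchestration lives in podp_antismash_downloader.py, which this diff does not show, so the wiring, file paths, accession, and polling interval below are illustrative assumptions rather than the PR's actual logic.

```python
# Hypothetical wiring of the exported helpers; the actual orchestration is in
# podp_antismash_downloader.py, which is not part of this diff. The paths and
# accession are placeholders.
import time

from nplinker.genomics.antismash import (
    antismash_job_is_done,
    download_and_extract_from_antismash_api,
    download_and_extract_from_antismash_db,
    submit_antismash_job,
)

accession = "GCF_004339725.1"  # placeholder RefSeq assembly accession
genbank_file = "/data/genomes/GCF_004339725.1.gbk"  # placeholder GenBank input
download_root, extract_root = "/data/download", "/data/extracted"

try:
    # Fast path: precomputed results from the antiSMASH database.
    download_and_extract_from_antismash_db(accession, download_root, extract_root)
except RuntimeError:
    # Fallback: run antiSMASH via the public API and fetch the finished job.
    job_id = submit_antismash_job(genbank_file)
    while not antismash_job_is_done(job_id):
        time.sleep(60)  # arbitrary polling interval
    download_and_extract_from_antismash_api(job_id, accession, download_root, extract_root)
```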
81 changes: 81 additions & 0 deletions src/nplinker/genomics/antismash/antismash_api_client.py
@@ -0,0 +1,81 @@
from __future__ import annotations
import logging
from os import PathLike
from pathlib import Path
import requests


logger = logging.getLogger(__name__)


def submit_antismash_job(genbank_filepath: str | PathLike) -> str:
"""Submits an antiSMASH job using the provided GenBank file.

This function sends a GenBank file to the antiSMASH API
and retrieves the job ID if the submission is successful.

Args:
genbank_filepath (str | PathLike): The path to the GenBank file to be submitted.

Returns:
str: The job ID of the submitted antiSMASH job.

Raises:
requests.exceptions.RequestException: If there is an issue with the HTTP request.
RuntimeError: If the API response does not contain a job ID.
"""
url = "https://antismash.secondarymetabolites.org/api/v1.0/submit"
genbank_filepath = Path(genbank_filepath)

with open(genbank_filepath, "rb") as file:
files = {"seq": file}
response = requests.post(url, files=files)
response.raise_for_status() # Raise an exception for HTTP errors

data = response.json()
if "id" not in data:
raise RuntimeError("No antiSMASH job ID returned")
return str(data["id"])


def antismash_job_is_done(job_id: str) -> bool:
"""Determines if the antiSMASH job has completed by checking its status.

This function queries the antiSMASH API to retrieve the current state
of the job and determines whether it has finished successfully, is still
in progress, or has encountered an error.

Args:
job_id (str): The unique identifier of the antiSMASH job.

Returns:
bool: True if the job is completed successfully, False if it is still
running or queued.

Raises:
RuntimeError: If the job has failed or if the API response indicates an error.
ValueError: If the job state is missing or an unexpected state is encountered
in the API response.
requests.exceptions.HTTPError: If an HTTP error occurs during the API request.
"""
url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}"

response = requests.get(url, timeout=10)
response.raise_for_status() # Raise exception for HTTP errors
respose_data = response.json()

if "state" not in respose_data:
raise ValueError(f"Job state missing in response for job_id: {job_id}")

job_state = respose_data["state"]
Comment on lines +65 to +70

Copilot AI (Apr 1, 2025):

The variable 'respose_data' seems to be misspelled; consider renaming it to 'response_data' for clarity.

Suggested change:
-    respose_data = response.json()
-    if "state" not in respose_data:
-        raise ValueError(f"Job state missing in response for job_id: {job_id}")
-    job_state = respose_data["state"]
+    response_data = response.json()
+    if "state" not in response_data:
+        raise ValueError(f"Job state missing in response for job_id: {job_id}")
+    job_state = response_data["state"]
    if job_state in ("running", "queued"):
        return False
    if job_state == "done":
        return True
    if job_state == "failed":
        job_status = respose_data.get("status", "No error message provided")
        raise RuntimeError(f"AntiSMASH job {job_id} failed with an error: {job_status}")
    else:
        raise ValueError(
            f"Unexpected job state for antismash job ID {job_id}. Job state: {job_state}"
        )
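As a usage note for the two functions above, the sketch below shows the submit-and-poll cycle and how the documented error behaviour surfaces at the call site. Only what the docstrings state is assumed; the input file name and the 30-second interval are placeholders.

```python
# Minimal sketch of the submit/poll semantics documented above.
# "example.gbk" and the sleep interval are placeholders, not values from this PR.
import time

from nplinker.genomics.antismash import antismash_job_is_done, submit_antismash_job

job_id = submit_antismash_job("example.gbk")  # raises RuntimeError if no job ID is returned

try:
    while not antismash_job_is_done(job_id):  # False while the job is "running" or "queued"
        time.sleep(30)
except RuntimeError as err:
    # Raised once the job reaches the "failed" state; the message includes the
    # job's status field when the API provides one.
    print(f"antiSMASH job {job_id} failed: {err}")
else:
    print(f"antiSMASH job {job_id} is done and its results can be downloaded.")
```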
151 changes: 125 additions & 26 deletions src/nplinker/genomics/antismash/antismash_downloader.py
@@ -4,7 +4,9 @@
import shutil
from os import PathLike
from pathlib import Path
import requests
from nplinker.utils import download_and_extract_archive
from nplinker.utils import extract_archive
from nplinker.utils import list_dirs
from nplinker.utils import list_files

@@ -15,10 +17,75 @@
ANTISMASH_DB_DOWNLOAD_URL = "https://antismash-db.secondarymetabolites.org/output/{}/{}"
# The antiSMASH DBV2 is for the availability of the old version, better to keep it.
ANTISMASH_DBV2_DOWNLOAD_URL = "https://antismash-dbv2.secondarymetabolites.org/output/{}/{}"
# antismash api to download results from submitted jobs
ANTISMASH_API_DOWNLOAD_URL = "https://antismash.secondarymetabolites.org/upload/{}/{}"


def download_and_extract_antismash_data(
antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike
url: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike
) -> None:
"""Download and extract antiSMASH BGC archive for a specified genome.

This function downloads a BGC archive from the specified URL, extracts its contents,
and organizes the extracted files into a structured directory under the given `extract_root`.

Args:
url (str): The URL to download the BGC archive from.
antismash_id (str): The identifier for the antiSMASH genome, used to name the extraction directory.
download_root: Path to the directory where the downloaded archive will be stored.
extract_root: Path to the directory where the data files will be extracted.
Note that an `antismash` directory will be created in the specified `extract_root` if
it doesn't exist. The files will be extracted to `<extract_root>/antismash/<antismash_id>` directory.

Raises:
ValueError: if `<extract_root>/antismash/<antismash_id>` dir is not empty.
Exception: If any error occurs during the download or extraction process, the partially extracted
directory will be cleaned up, and the exception will be re-raised.

Examples:
>>> download_and_extract_antismash_data(
"https://antismash-db.secondarymetabolites.org/output/GCF_001.1/GCF_001.1.zip",
"GCF_001.1",
"/data/download",
"/data/extracted"
)
"""
extract_path = Path(extract_root) / "antismash" / antismash_id

_prepare_extract_path(extract_path)
try:
download_and_extract_archive(url, download_root, extract_path, f"{antismash_id}.zip")
_cleanup_extracted_files(extract_path)
except Exception as e:
shutil.rmtree(extract_path)
raise e


def download_and_extract_from_antismash_api(
job_id: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike
) -> None:
"""Downloads and extracts results from an antiSMASH API job.

This function constructs the download URL using the provided job ID then
downloads the results as a ZIP file and extracts its contents to the specified directories.

Args:
job_id (str): The job ID for the antiSMASH API job.
antismash_id (str): The unique identifier for the antiSMASH dataset.
download_root (str or PathLike): The root directory where the ZIP file will be downloaded.
extract_root (str or PathLike): The root directory where the contents of the ZIP file will be extracted.

Raises:
requests.exceptions.RequestException: If there is an issue with the HTTP request.
zipfile.BadZipFile: If the downloaded file is not a valid ZIP file.
OSError: If there is an issue with file operations such as writing or extracting.
"""
url = ANTISMASH_API_DOWNLOAD_URL.format(job_id, antismash_id + ".zip")
download_and_extract_antismash_data(url, antismash_id, download_root, extract_root)


def download_and_extract_from_antismash_db(
refseq_acc: str, download_root: str | PathLike, extract_root: str | PathLike
) -> None:
"""Download and extract antiSMASH BGC archive for a specified genome.

@@ -27,7 +94,7 @@ def download_and_extract_antismash_data(
of a genome as the id of the archive.

Args:
antismash_id: The id used to download BGC archive from antiSMASH database.
refseq_acc: The id used to download BGC archive from antiSMASH database.
If the id is versioned (e.g., "GCF_004339725.1") please be sure to
specify the version as well.
download_root: Path to the directory to place downloaded archive in.
@@ -36,45 +103,77 @@
it doesn't exist. The files will be extracted to `<extract_root>/antismash/<antismash_id>` directory.

Raises:
ValueError: if `<extract_root>/antismash/<refseq_assembly_id>` dir is not empty.
ValueError: if `<extract_root>/antismash/<refseq_acc>` dir is not empty.

Examples:
>>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted")
>>> download_and_extract_from_antismash_db("GCF_004339725.1", "/data/download", "/data/extracted")
"""
download_root = Path(download_root)
extract_root = Path(extract_root)
extract_path = extract_root / "antismash" / antismash_id
for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]:
url = base_url.format(refseq_acc, f"{refseq_acc}.zip")
if requests.head(url).status_code == 404: # not found
continue
download_and_extract_antismash_data(url, refseq_acc, download_root, extract_root)
return # Exit the loop once a valid URL is processed

try:
if extract_path.exists():
_check_extract_path(extract_path)
else:
extract_path.mkdir(parents=True, exist_ok=True)
# if both urls give 404 not found
raise RuntimeError(f"No results in antiSMASH DB for {refseq_acc}")

for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]:
url = base_url.format(antismash_id, antismash_id + ".zip")
download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip")
break

# delete subdirs
for subdir_path in list_dirs(extract_path):
shutil.rmtree(subdir_path)
def extract_antismash_data(
archive: str | PathLike, extract_root: str | PathLike, antimash_id: str
) -> None:
"""Extracts antiSMASH results from a given archive into a specified directory.

# delete unnecessary files
files_to_keep = list_files(extract_path, suffix=(".json", ".gbk"))
for file in list_files(extract_path):
if file not in files_to_keep:
os.remove(file)
This function handles the extraction of antiSMASH results by preparing the
extraction path, extracting the archive, and performing cleanup of
unnecessary files. If an error occurs during the process, the partially
extracted files are removed, and the exception is re-raised.

logger.info("antiSMASH BGC data of %s is downloaded and extracted.", antismash_id)
Args:
archive (str | PathLike): The path to the archive file containing antiSMASH results.
extract_root (str | PathLike): The root directory where the data should
be extracted.
antimash_id (str): A unique identifier for the antiSMASH data, used to
create a subdirectory for the extracted files.

Raises:
Exception: If any error occurs during the extraction process, the
exception is re-raised after cleaning up the extraction directory.
"""
extract_path = Path(extract_root) / "antismash" / antimash_id

_prepare_extract_path(extract_path)

try:
extract_archive(archive, extract_path, remove_finished=False)
_cleanup_extracted_files(extract_path)

except Exception as e:
shutil.rmtree(extract_path)
logger.warning(e)
raise e


def _check_extract_path(extract_path: Path):
# check if extract_path is empty
if any(extract_path.iterdir()):
raise ValueError(f'Nonempty directory: "{extract_path}"')


def _cleanup_extracted_files(extract_path: str | PathLike) -> None:
# delete subdirs
for subdir_path in list_dirs(extract_path):
shutil.rmtree(subdir_path)

# delete unnecessary files
files_to_keep = list_files(extract_path, suffix=(".json", ".gbk"))
for file in list_files(extract_path):
if file not in files_to_keep:
os.remove(file)


def _prepare_extract_path(extract_path: str | PathLike) -> None:
extract_path = Path(extract_path)
if extract_path.exists():
_check_extract_path(extract_path)
else:
extract_path.mkdir(parents=True, exist_ok=True)
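To round off the downloader changes, a short sketch of the two extraction entry points added in this file. The paths and job ID are invented; the only behaviour assumed is what the code above documents: results land in `<extract_root>/antismash/<antismash_id>/`, the target directory must be empty, subdirectories are deleted, and only `.json` and `.gbk` files are kept.

```python
# Placeholder paths and job ID throughout; behaviour follows the functions defined above.
from nplinker.genomics.antismash import (
    download_and_extract_from_antismash_api,
    extract_antismash_data,
)

# Results of a finished antiSMASH API job, fetched by job ID:
download_and_extract_from_antismash_api(
    "example-job-id",      # placeholder job ID returned by submit_antismash_job
    "GCF_000514775.1",     # antismash_id, used to name the extraction directory
    "/data/download",
    "/data/extracted",
)

# An archive that is already present locally, e.g. from an earlier run:
extract_antismash_data(
    "/data/download/GCF_000514775.1.zip",
    "/data/extracted",
    "GCF_000514775.1",
)

# Both calls leave only .json and .gbk files in /data/extracted/antismash/<antismash_id>/
# and raise ValueError if that directory already exists and is not empty.
```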