Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ampcombi 2.0.1 #427

Merged
merged 13 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### `Added`

- [#427](https://github.com/nf-core/funcscan/pull/427) Updated AMPcombi from v0.2.2 to v2.0.1. AMP can now use multiple other databases for classification. (by @darcy220606)

### `Fixed`

- [#427](https://github.com/nf-core/funcscan/pull/427) Fixed the AMP reference database issues reported by users that were caused by non-ASCII characters. (by @darcy220606)

### `Dependencies`

| Tool | Previous version | New version |
| -------- | ---------------- | ----------- |
| AMPcombi | 0.2.2 | 2.0.1 |

### `Deprecated`

## v2.0.0 - [2024-09-05]
Expand Down
184 changes: 125 additions & 59 deletions bin/ampcombi_download.py
Original file line number Diff line number Diff line change
@@ -1,78 +1,144 @@
#!/usr/bin/env python3

#########################################
# Authors: [Anan Ibrahim](https://github.com/brianjohnhaas), [Louisa Perelo](https://github.com/louperelo)
# Authors: [Anan Ibrahim](https://github.com/Darcy220606/AMPcombi), [Louisa Perelo](https://github.com/louperelo)
# File: amp_database.py
# Source: https://github.com/Darcy220606/AMPcombi/blob/main/ampcombi/amp_database.py
# Source+commit: https://github.com/Darcy220606/AMPcombi/commit/a75bc00c32ecf873a133b18cf01f172ad9cf0d2d/ampcombi/amp_database.py
# Download Date: 2023-03-08, commit: a75bc00c
# This source code is licensed under the MIT license
#########################################

# TITLE: Download the DRAMP database if the input db is empty and make the database compatible with diamond
# TITLE: Download the reference database specified by the user.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this script be wrapped up within AMPcombi itself, e.g. ampcombi download ...?

Copy link
Contributor Author

@Darcy220606 Darcy220606 Dec 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is, but do you remember the issue with it downloading the reference database every time for each sample? We did that so that it only runs once in the pipeline. I didn't add a submodule for downloading the database; it's wrapped in the parse_table submodule.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, as I wrote in the PR comment, for this PR to run as it should we still need to update the module to v2.0.1, which has a separate PR open in modules ;)


import pandas as pd
import requests
import os
from datetime import datetime
import re
import subprocess
from Bio import SeqIO
import tempfile
import shutil
import argparse

from datetime import datetime
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

########################################
# FUNCTION: DOWNLOAD DRAMP DATABASE AND CLEAN IT
# FUNCTION: DOWNLOAD DATABASES AND CLEAN DRAMP and APD
#########################################
def download_DRAMP(db):
    """
    Download the DRAMP general AMP table and FASTA file into an existing directory.

    Two files are left in `db`, both stamped with the download date:
    `general_amps_<date>.tsv` (the Excel table converted to a tab-separated file)
    and `general_amps_<date>_clean.fasta` (the FASTA file with every character
    that is not one of the 20 standard amino acids stripped from the sequences).
    The intermediate Excel and raw FASTA downloads are deleted.

    Parameters:
    ----------
    db : str
        Path of an existing directory in which the files are written.

    Returns:
    -------
    tuple
        Always `(None, None)`; kept only so the return shape matches earlier versions.

    Raises:
    ------
    requests.exceptions.RequestException
        If either download fails or the server answers with an HTTP error status.
    """
    date = datetime.now().strftime("%Y_%m_%d")
    xlsx_path = f"{db}/general_amps.xlsx"
    fasta_path = f"{db}/general_amps_{date}.fasta"
    clean_fasta_path = f"{db}/general_amps_{date}_clean.fasta"

    ## Download the (table) file and store it in the database directory
    url = "http://dramp.cpu-bioinfor.org/downloads/download.php?filename=download_data/DRAMP3.0_new/general_amps.xlsx"
    table_response = requests.get(url, allow_redirects=True)
    # Fail here rather than writing an HTML error page that read_excel would choke on later
    table_response.raise_for_status()
    with open(xlsx_path, "wb") as f:
        f.write(table_response.content)

    ## Convert excel to a tab-separated file named after the download date
    ref_amps = pd.read_excel(xlsx_path)
    ref_amps.to_csv(f"{db}/general_amps_{date}.tsv", index=None, header=True, sep="\t")

    ## Download the (fasta) file and store it in the database directory
    urlfasta = (
        "http://dramp.cpu-bioinfor.org/downloads/download.php?filename=download_data/DRAMP3.0_new/general_amps.fasta"
    )
    fasta_response = requests.get(urlfasta)
    fasta_response.raise_for_status()
    with open(fasta_path, "wb") as f:
        f.write(fasta_response.content)

    ## Cleaning step to remove ambiguous amino acids from sequences in the database (e.g. zeros and brackets)
    standard_residues = frozenset("ACDEFGHIKLMNPQRSTVWY")
    # Both handles are context-managed so the raw FASTA is closed before it is deleted
    with open(fasta_path) as raw_handle, open(clean_fasta_path, "w") as clean_handle:
        for record in SeqIO.parse(raw_handle, "fasta"):
            cleaned = "".join(residue for residue in str(record.seq) if residue in standard_residues)
            clean_handle.write(">" + record.id + "\n" + cleaned + "\n")

    ## Remove the intermediate downloads
    os.remove(fasta_path)
    os.remove(xlsx_path)
    return None, None
def download_ref_db(database, threads):
    """
    Downloads a specified AMP (antimicrobial peptide) reference database based on the
    provided database name and saves it in a directory created in the current working
    directory (`amp_DRAMP_database`, `amp_APD_database` or `amp_UniRef100_database`).
    This supports downloading databases only from DRAMP, APD, and UniRef100; any other
    name is ignored and nothing is downloaded.
    Download failures are reported on stdout and are not raised.

    Parameters:
    ----------
    database : str
        The name of the database to download. Must be one of 'DRAMP', 'APD', or 'UniRef100'.
    threads : int
        Number of threads to use when downloading the UniRef100 database with `mmseqs`.
        Ignored for 'DRAMP' and 'APD'.
    """
    if database == "DRAMP":
        _download_dramp_db()
    elif database == "APD":
        _download_apd_db()
    elif database == "UniRef100":
        _download_uniref100_db(threads)


# Sequences made up exclusively of the 20 standard amino acids
_VALID_AMP_SEQUENCE = re.compile(r"^[ACDEFGHIKLMNPQRSTVWY]+$")


def _is_valid_amp_sequence(sequence):
    """Return True if `sequence` is a string of standard amino acids only."""
    # Empty table cells are read by pandas as NaN (a float), which re.match cannot handle
    return isinstance(sequence, str) and _VALID_AMP_SEQUENCE.match(sequence) is not None


def _download_dramp_db():
    """Download the DRAMP general AMP table and derive a cleaned FASTA file from it."""
    db = "amp_DRAMP_database"
    os.makedirs(db, exist_ok=True)
    url = "http://dramp.cpu-bioinfor.org/downloads/download.php?filename=download_data/DRAMP3.0_new/general_amps.txt"
    try:
        response = requests.get(url, allow_redirects=True)
        response.raise_for_status()  # Check for any download errors
    except requests.exceptions.RequestException as e:
        print(f"Failed to download DRAMP AMP general database file: {e}")
        return

    date = datetime.now().strftime("%Y_%m_%d")
    table_path = f"{db}/general_amps_{date}.txt"
    with open(table_path, "wb") as file:
        file.write(response.content)
    print(f"File downloaded successfully and saved to {db}/general_amps_{date}.txt")

    # Create fasta version, keeping only entries with a clean sequence
    db_df = pd.read_csv(table_path, sep="\t")
    records = [
        SeqRecord(Seq(sequence), id=str(dramp_id), description="")
        for dramp_id, sequence in zip(db_df["DRAMP_ID"], db_df["Sequence"])
        if _is_valid_amp_sequence(sequence)
    ]
    SeqIO.write(records, f"{db}/general_amps_{date}.fasta", "fasta")


def _download_apd_db():
    """Download the APD FASTA release and write a cleaned table (APD.txt) and FASTA (APD.fasta)."""
    db = "amp_APD_database"
    os.makedirs(db, exist_ok=True)
    url = "https://aps.unmc.edu/assets/sequences/APD_sequence_release_09142020.fasta"
    try:
        # NOTE(security): SSL verification is disabled due to a site certificate issue,
        # so the downloaded content is not authenticated. Re-enable once the site is fixed.
        response = requests.get(url, allow_redirects=True, verify=False)
        response.raise_for_status()
        content = response.text
        print("APD AMP database downloaded successfully.")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download content: {e}")
        return

    orig_fasta = f"{db}/APD_orig.fasta"
    try:
        try:
            # Explicit UTF-8 so non-ASCII header characters do not depend on the locale
            with open(orig_fasta, "w", encoding="utf-8") as file:
                file.write(content)
            with open(orig_fasta, encoding="utf-8") as handle:
                valid_records = [
                    record
                    for record in SeqIO.parse(handle, "fasta")
                    if _is_valid_amp_sequence(str(record.seq))
                ]
        finally:
            # The raw download is only a parsing aid; never leave it behind
            if os.path.exists(orig_fasta):
                os.remove(orig_fasta)

        # Fasta to table: the ID is the part of the header before the first '|'
        db_df = pd.DataFrame({
            "APD_ID": [record.description.split("|")[0] for record in valid_records],
            "APD_Description": [record.description for record in valid_records],
            "APD_Sequence": [str(record.seq) for record in valid_records]})
        db_df.to_csv(f"{db}/APD.txt", sep="\t", index=False, header=True)

        # Table to fasta: headers carry the short APD_ID only
        records = [
            SeqRecord(Seq(sequence), id=str(apd_id), description="")
            for apd_id, sequence in zip(db_df["APD_ID"], db_df["APD_Sequence"])
        ]
        SeqIO.write(records, f"{db}/APD.fasta", "fasta")
        print(f"APD AMP database saved successfully to {db}/APD.fasta")
    except Exception as e:
        # Deliberate best-effort boundary: report the failure instead of crashing
        print(f"Failed to save APD AMP database: {e}")


def _download_uniref100_db(threads):
    """Download the UniRef100 protein database with `mmseqs databases`."""
    db = "amp_UniRef100_database"
    os.makedirs(f"{db}/mmseqs2", exist_ok=True)
    # Argument list instead of a shell string: no shell parsing of the interpolated values
    command = [
        "mmseqs", "databases", "UniRef100",
        f"{db}/mmseqs2/ref_DB", f"{db}/mmseqs2/tmp",
        "--remove-tmp-files", "true",
        "--threads", str(threads),
        "-v", "0",
    ]
    try:
        subprocess.run(command, check=True)
        print(f"UniRef100 protein database downloaded successfully and saved to {db}/mmseqs2/ref_DB")
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing `mmseqs` executable, which the shell used to report as exit code 127
        print(f"Failed to download UniRef100 protein database: {e}")

if __name__ == "__main__":
    # Command-line entry point: parse the requested database and download only that one.
    parser = argparse.ArgumentParser(
        description="Downloads a specified AMP (antimicrobial peptide) reference database based on the provided database name and saves it to the specified directory.")
    parser.add_argument("--database_id", dest="database", type=str, required=True, choices=["DRAMP", "APD", "UniRef100"],
                        help="Database ID - one of DRAMP, APD, or UniRef100. This parameter is required.")
    parser.add_argument("--threads", type=int, default=4,
                        help="Number of threads supplied to mmseqs databases. Only relevant in the case of 'UniRef100'. Default is 4.")

    # Arguments are parsed before any download starts, so `--help` and invalid
    # arguments never trigger network traffic.
    args = parser.parse_args()
    download_ref_db(args.database, args.threads)
4 changes: 2 additions & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -674,9 +674,9 @@ process {
]
}

withName: DRAMP_DOWNLOAD {
withName: AMP_DATABASE_DOWNLOAD {
publishDir = [
path: { "${params.outdir}/databases/dramp" },
path: { "${params.outdir}/databases/${params.amp_ampcombi_db}" },
mode: params.publish_dir_mode,
enabled: params.save_db,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
Expand Down
Loading
Loading