Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions microbetag/QS/HMMres_to_QS.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python3

import os
import json
import argparse
import csv

def parse_args():
    """Build and evaluate the command-line interface for this script.

    Returns:
        argparse.Namespace with input_dir, qs_json, evalue, and output.
    """
    cli = argparse.ArgumentParser(
        description="Annotate genomes with QS sending and receiving pathways based on HMMsearch results."
    )
    cli.add_argument("input_dir", type=str,
                     help="Directory containing HMMsearch .tbl result files")
    cli.add_argument("qs_json", type=str,
                     help="Path to QS pathway definition JSON file (map02024_parse_dict.json)")
    cli.add_argument("--evalue", type=float, default=1e-5,
                     help="E-value threshold for KO filtering (default: 1e-5)")
    cli.add_argument("--output", type=str, default="qs_summary.csv",
                     help="Output CSV file path (default: qs_summary.csv)")
    return cli.parse_args()

def load_qs_pathways(json_file):
    """Load and normalise the QS pathway definition JSON.

    The file maps each pathway id to a list of single-key dicts; this
    flattens each list into one dict (taking the first key of each entry,
    as the original format guarantees one key per dict) and converts the
    KO lists to sets so membership can be tested with subset operations.

    NOTE: "Biosysthesis" (sic) is the key spelling used inside the JSON
    file itself; it is renamed to "Biosynthesis" in the returned dict.

    Returns:
        dict: {int pathway_id: {"Mode species", "Signal Compound",
               "Signal Type", "Biosynthesis", "Transporter", "Sensing"}}
    """
    with open(json_file, "r") as handle:
        raw = json.load(handle)

    pathways = {}
    for pid, entries in raw.items():
        merged = {next(iter(d)): next(iter(d.values())) for d in entries}
        pathways[int(pid)] = {
            "Mode species": merged["Mode species"],
            "Signal Compound": merged["Signal Compound"],
            "Signal Type": merged["Signal Type"],
            "Biosynthesis": set(merged["Biosysthesis"]),
            "Transporter": set(merged["Transporter"]),
            "Sensing": set(merged["Sensing"]),
        }
    return pathways

def parse_tbl_file(file_path, evalue_threshold):
    """Collect KO identifiers from an hmmsearch --tblout result file.

    In tblout format, whitespace-separated column 3 is the query (KO) name
    and column 5 is the full-sequence E-value. A KO is kept whenever at
    least one hit scores strictly below *evalue_threshold*.

    Args:
        file_path: path to the .tbl file.
        evalue_threshold: maximum (exclusive) full-sequence E-value.

    Returns:
        set[str]: KO identifiers passing the threshold.
    """
    ko_set = set()
    with open(file_path, "r") as f:
        for line in f:
            # Skip comment lines and blanks.
            if line.startswith("#") or line.strip() == "":
                continue
            parts = line.strip().split()
            # Guard against truncated/malformed lines.
            if len(parts) < 5:
                continue
            ko = parts[2]
            # float() parses scientific notation with either "e" or "E",
            # so no case normalisation of the exponent marker is needed
            # (the original "e" -> "E" replace was dead code).
            evalue = float(parts[4])
            if evalue < evalue_threshold:
                ko_set.add(ko)
    return ko_set

def analyze_genome(ko_set, qs_pathways):
    """Classify one genome's KO set against every QS pathway.

    A pathway is "sent" when all of its biosynthesis KOs are present in
    *ko_set* (an empty biosynthesis set always qualifies), and "received"
    when both its transporter and sensing KO sets are fully covered.

    Returns:
        tuple[str, str]: '/'-joined sending and receiving pathway ids, in
        the iteration order of *qs_pathways*.
    """
    senders = [
        str(pid)
        for pid, info in qs_pathways.items()
        if not info["Biosynthesis"] or info["Biosynthesis"] <= ko_set
    ]
    receivers = [
        str(pid)
        for pid, info in qs_pathways.items()
        if info["Transporter"] <= ko_set and info["Sensing"] <= ko_set
    ]
    return "/".join(senders), "/".join(receivers)

def main():
    """Scan a directory of HMMsearch tables, classify each genome's QS
    sending/receiving pathways, and write the summary CSV."""
    args = parse_args()
    qs_pathways = load_qs_pathways(args.qs_json)

    rows = []
    for entry in os.listdir(args.input_dir):
        if not entry.endswith("_results.tbl"):
            continue
        # Genome name is everything before the "_results.tbl" suffix.
        genome = entry.split("_results.tbl")[0]
        kos = parse_tbl_file(os.path.join(args.input_dir, entry), args.evalue)
        send, receive = analyze_genome(kos, qs_pathways)
        rows.append((genome, send, receive))

    with open(args.output, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(["Genome", "Sending Pathway", "Receiving Pathway"])
        writer.writerows(rows)

    print(f"Summary file saved to: {args.output}")

if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions microbetag/QS/QS_annotation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Command for using QS database to annotate quorum sensing functions in users' genome

set -e

# hmmsearch does not create its output directory; make sure it exists so the
# --tblout/log redirections below don't fail on a fresh checkout.
mkdir -p kegg_hmm_results

for genome in ../genome_dir/*.faa; do
    genome_name=$(basename "$genome" .faa)
    hmmsearch --tblout "kegg_hmm_results/${genome_name}_results.tbl" m02024_genes.hmm "$genome" > "kegg_hmm_results/${genome_name}_log.txt"
done
105 changes: 105 additions & 0 deletions microbetag/QS/Readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# QS Function Annotation

## Step 0: Build Quorum Sensing Database

Script: build_QSdb.sh

Required file(s): **m02024_genes.txt** in the same directory.

Purpose: This script downloads the KEGG Orthology HMM database, extracts HMMs related to KEGG pathway m02024 (quorum sensing), builds an HMMsearch database, and keeps only the final outputs in the current directory.

Command: `bash /path/to/build_QSdb.sh`

Output(s):
`m02024_genes.hmm`
`m02024_genes.hmm.h3f`
`m02024_genes.hmm.h3i`
`m02024_genes.hmm.h3m`
`m02024_genes.hmm.h3p`

## Step 1: Annotate QS functions against QSdb in users' genomes

Script: QS_annotation.sh

Required file(s): 5 related HMM database files generated in Step 0, Users' genomes in protein sequences (.faa).

Purpose: Uses the QS database to annotate quorum sensing functions in users' genomes.

Command: `bash /path/to/QS_annotation.sh`

Output(s): HMM result tabular files.

## Step 2: Annotate genomes with QS sending and receiving pathways based on HMMsearch results

Script: HMMres_to_QS.py

Required file(s): **map02024_parse_dict.json** and HMM result tabular files generated in Step 1.

Command and help:

```
python HMMres_to_QS.py -h
usage: HMMres_to_QS.py [-h] [--evalue EVALUE] [--output OUTPUT]
input_dir qs_json

Annotate genomes with QS sending and receiving pathways based on HMMsearch
results.

positional arguments:
input_dir Directory containing HMMsearch .tbl result files
qs_json Path to QS pathway definition JSON file
(map02024_parse_dict.json)

optional arguments:
-h, --help show this help message and exit
--evalue EVALUE E-value threshold for KO filtering (default: 1e-5)
--output OUTPUT Output CSV file path (default: qs_summary.csv)
```

Result preview:

```
Genome,Sending Pathway,Receiving Pathway
GCF_900111605.1,3/6/7/8/9/10/11/12/13/14/15/17/18/19/20/22/25/27/28/29/30/34,2/3/4/5/6/7/8/9/10/11/12/13/14/15/17/18/20/21/22/23/24/25/26/27/28/29/30/31/32/34/36/37/38/39/40/41
GCA_001883705.1,3/4/5/7/8/9/10/11/12/13/14/15/17/18/19/20/21/22/23/24/25/26/27/28/29/30/31/34,1/2/3/4/5/6/7/8/9/10/11/12/13/14/15/16/17/18/19/20/21/22/23/24/25/26/27/28/29/30/31/32/34/36/37/39/40/41
GCF_014643695.1,1/9/10/11/12/13/14/15/16/27/28/34,4/5/7/8/9/10/11/12/13/14/15/17/18/19/20/21/22/23/24/25/26/27/28/29/30/31/34/37/39/40/41
```

## Step 3: Generate sender–receiver–pathway table from QS summary file

Script: Sender_Receiver_tab.py

Required file(s): **qs_summary.csv** generated in Step 2.

Command and help:

```
python Sender_Receiver_tab.py -h
usage: Sender_Receiver_tab.py [-h] -i INPUT -o OUTPUT [--force-parallel]
[--no-parallel] [--threshold THRESHOLD]

Generate sender–receiver–pathway table from QS summary file.

optional arguments:
-h, --help show this help message and exit
-i INPUT, --input INPUT
Input CSV: qs_sending_receiving_summary.csv
-o OUTPUT, --output OUTPUT
Output CSV: sender_receiver_pathway_table.csv
--force-parallel Force use of parallel processing
--no-parallel Disable parallel processing
--threshold THRESHOLD
Genome count threshold to enable parallel mode
```

Result preview:

```
Sender,Receiver,Pathways
GCF_900111605.1,GCF_900111605.1,3/6/7/8/9/10/11/12/13/14/15/17/18/20/22/25/27/28/29/30/34
GCF_900111605.1,GCA_001883705.1,3/6/7/8/9/10/11/12/13/14/15/17/18/19/20/22/25/27/28/29/30/34
GCF_900111605.1,GCF_014643695.1,7/8/9/10/11/12/13/14/15/17/18/19/20/22/25/27/28/29/30/34
GCF_900111605.1,GCA_017542125.1,3/7/8/9/10/11/12/13/14/15/17/18/20/22/25/27/28/29/30/34
GCF_900111605.1,GCF_018881715.1,6/7/8/9/10/11/12/13/15/17/18/20/22/25/27/28/29/30/34
GCF_900111605.1,GCA_006226495.1,3/6/7/8/9/10/11/12/13/14/15/17/18/19/20/22/25/27/28/29/30/34
```
86 changes: 86 additions & 0 deletions microbetag/QS/Sender_Receiver_tab.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import csv
import argparse
from multiprocessing import Pool, cpu_count
from collections import defaultdict

def parse_pathway_string(pw_str):
    """Split a '/'-separated pathway-id string into a set of id strings.

    An empty or falsy string yields an empty set.
    """
    if not pw_str:
        return set()
    return set(pw_str.split("/"))

def load_genome_data(input_csv):
    """Read the QS summary CSV produced by HMMres_to_QS.py.

    Expects columns "Genome", "Sending Pathway", "Receiving Pathway".

    Returns:
        dict: {genome: {"send": set of pathway ids, "recv": set of ids}}
    """
    data = {}
    with open(input_csv, newline='') as handle:
        for record in csv.DictReader(handle):
            data[record["Genome"]] = {
                "send": parse_pathway_string(record["Sending Pathway"]),
                "recv": parse_pathway_string(record["Receiving Pathway"]),
            }
    return data

def compare_sender_with_all(args):
    """Pool worker: pair one sender against every genome's receiving set.

    *args* is a (sender_name, sender_send_set, genome_data) tuple — packed
    into one argument because Pool.map passes a single item per task.

    Returns:
        list of (sender, receiver, "p1/p2/...") rows, one per receiver that
        shares at least one pathway; shared ids are numerically sorted.
    """
    sender, send_set, genome_data = args
    return [
        (sender, receiver, "/".join(sorted(shared, key=int)))
        for receiver, info in genome_data.items()
        if (shared := send_set & info["recv"])
    ]

def generate_pathway_pairs_parallel(genome_data, output_csv, n_processes=None):
    """Compute sender/receiver pathway pairs with a worker pool and write CSV.

    One task is dispatched per genome that sends at least one pathway; each
    worker compares that sender against every genome's receiving set.

    Args:
        genome_data: mapping from load_genome_data().
        output_csv: destination CSV path.
        n_processes: pool size; defaults to cpu_count() when falsy.
    """
    tasks = [
        (name, info["send"], genome_data)
        for name, info in genome_data.items()
        if info["send"]
    ]
    with Pool(processes=n_processes or cpu_count()) as pool:
        per_sender = pool.map(compare_sender_with_all, tasks)

    with open(output_csv, "w", newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(["Sender", "Receiver", "Pathways"])
        for sender_rows in per_sender:
            writer.writerows(sender_rows)

def generate_pathway_pairs_serial(genome_data, output_csv):
    """Compute sender/receiver pathway pairs in a single process, writing
    each row to *output_csv* as it is found.

    For every (sender, receiver) pair of genomes whose sending/receiving
    sets intersect, one row is emitted with the shared pathway ids
    numerically sorted and '/'-joined. Senders with no pathways are skipped.
    """
    with open(output_csv, "w", newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(["Sender", "Receiver", "Pathways"])
        for sender, sender_info in genome_data.items():
            send_set = sender_info["send"]
            if not send_set:
                continue
            for receiver, receiver_info in genome_data.items():
                shared = send_set & receiver_info["recv"]
                if shared:
                    writer.writerow(
                        (sender, receiver, "/".join(sorted(shared, key=int)))
                    )

def main():
    """CLI entry point: choose serial or parallel mode and build the table.

    Parallel mode is used when forced via --force-parallel or when the
    genome count exceeds --threshold; --no-parallel always wins.
    """
    parser = argparse.ArgumentParser(
        description="Generate sender–receiver–pathway table from QS summary file."
    )
    parser.add_argument("-i", "--input", required=True, help="Input CSV: qs_sending_receiving_summary.csv")
    parser.add_argument("-o", "--output", required=True, help="Output CSV: sender_receiver_pathway_table.csv")
    parser.add_argument("--force-parallel", action="store_true", help="Force use of parallel processing")
    parser.add_argument("--no-parallel", action="store_true", help="Disable parallel processing")
    parser.add_argument("--threshold", type=int, default=1000, help="Genome count threshold to enable parallel mode")
    args = parser.parse_args()

    genome_data = load_genome_data(args.input)
    genome_count = len(genome_data)

    if args.no_parallel:
        # Manual override beats every other mode decision.
        print("Running in serial mode (manual override).")
        generate_pathway_pairs_serial(genome_data, args.output)
    elif args.force_parallel or genome_count > args.threshold:
        print(f"Running in parallel mode (genome count = {genome_count}).")
        generate_pathway_pairs_parallel(genome_data, args.output)
    else:
        print(f"Running in serial mode (genome count = {genome_count}).")
        generate_pathway_pairs_serial(genome_data, args.output)

if __name__ == "__main__":
    main()
39 changes: 39 additions & 0 deletions microbetag/QS/build_QSdb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash

## This script downloads the KEGG Orthology HMM database,
## extracts HMMs related to KEGG pathway m02024 (quorum sensing),
## builds an HMMsearch database, and keeps only the final outputs in the current directory.

# Exit on any error
set -e

# Step 0: Check for m02024_genes.txt in current directory
if [[ ! -f m02024_genes.txt ]]; then
    echo "[ERROR] File 'm02024_genes.txt' not found in the current directory."
    echo "Please make sure 'm02024_genes.txt' is placed in the same directory as this script."
    exit 1
fi

# Step 1: Create temporary working directory.
# The EXIT trap guarantees cleanup even if a later step fails under set -e.
workdir=$(mktemp -d)
trap 'rm -rf "$workdir"' EXIT

# Step 2: Download KEGG HMM database
# -f makes curl exit non-zero on HTTP errors (otherwise an HTML error page
# would be saved and tar would fail later with a confusing message);
# -L follows redirects.
echo "[INFO] Downloading KEGG HMM profiles (around 1Gb, this may take a while) ..."
curl -fL -o "$workdir/profiles.tar.gz" https://www.genome.jp/ftp/db/kofam/profiles.tar.gz
tar -xzf "$workdir/profiles.tar.gz" -C "$workdir"
rm "$workdir/profiles.tar.gz"

# Step 3: Copy required HMMs
# A while-read loop is portable (BSD xargs lacks -a) and avoids the quoting
# pitfalls of the previous xargs + sh -c construction.
mkdir "$workdir/required_hmm"
while IFS= read -r ko; do
    [[ -z "$ko" ]] && continue
    if [[ -f "$workdir/profiles/$ko.hmm" ]]; then
        cp "$workdir/profiles/$ko.hmm" "$workdir/required_hmm/"
    else
        echo "[WARNING] File not found: $ko.hmm"
    fi
done < m02024_genes.txt

# Step 4: Build HMM database
cat "$workdir"/required_hmm/*.hmm > m02024_genes.hmm
hmmpress m02024_genes.hmm

# Step 5: Temporary files are removed by the EXIT trap above.

echo "[INFO] HMM database successfully built:"
ls m02024_genes.hmm*
Loading