Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions microbetag/QS/HMMres_to_QS.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python3

import os
import json
import argparse
import csv

def parse_args():
    """Build and evaluate the command-line interface for this script.

    Returns:
        argparse.Namespace with input_dir, qs_json, evalue, and output.
    """
    cli = argparse.ArgumentParser(
        description="Annotate genomes with QS sending and receiving pathways based on HMMsearch results."
    )
    cli.add_argument("input_dir", type=str,
                     help="Directory containing HMMsearch .tbl result files")
    cli.add_argument("qs_json", type=str,
                     help="Path to QS pathway definition JSON file (map02024_parse_dict.json)")
    cli.add_argument("--evalue", type=float, default=1e-5,
                     help="E-value threshold for KO filtering (default: 1e-5)")
    cli.add_argument("--output", type=str, default="qs_summary.csv",
                     help="Output CSV file path (default: qs_summary.csv)")
    return cli.parse_args()

def load_qs_pathways(json_file):
    """Load and normalise the QS pathway definition JSON.

    The file maps each pathway id to a list of single-key dicts; this
    flattens each list into one dict (taking the first key of each entry,
    as the original format guarantees one key per dict) and converts the
    KO lists to sets so membership can be tested with subset operations.

    NOTE: "Biosysthesis" (sic) is the key spelling used inside the JSON
    file itself; it is renamed to "Biosynthesis" in the returned dict.

    Returns:
        dict: {int pathway_id: {"Mode species", "Signal Compound",
               "Signal Type", "Biosynthesis", "Transporter", "Sensing"}}
    """
    with open(json_file, "r") as handle:
        raw = json.load(handle)

    pathways = {}
    for pid, entries in raw.items():
        merged = {next(iter(d)): next(iter(d.values())) for d in entries}
        pathways[int(pid)] = {
            "Mode species": merged["Mode species"],
            "Signal Compound": merged["Signal Compound"],
            "Signal Type": merged["Signal Type"],
            "Biosynthesis": set(merged["Biosysthesis"]),
            "Transporter": set(merged["Transporter"]),
            "Sensing": set(merged["Sensing"]),
        }
    return pathways

def parse_tbl_file(file_path, evalue_threshold):
    """Collect KO identifiers from an hmmsearch --tblout result file.

    In tblout format, whitespace-separated column 3 is the query (KO) name
    and column 5 is the full-sequence E-value. A KO is kept whenever at
    least one hit scores strictly below *evalue_threshold*.

    Args:
        file_path: path to the .tbl file.
        evalue_threshold: maximum (exclusive) full-sequence E-value.

    Returns:
        set[str]: KO identifiers passing the threshold.
    """
    ko_set = set()
    with open(file_path, "r") as f:
        for line in f:
            # Skip comment lines and blanks.
            if line.startswith("#") or line.strip() == "":
                continue
            parts = line.strip().split()
            # Guard against truncated/malformed lines.
            if len(parts) < 5:
                continue
            ko = parts[2]
            # float() parses scientific notation with either "e" or "E",
            # so no case normalisation of the exponent marker is needed
            # (the original "e" -> "E" replace was dead code).
            evalue = float(parts[4])
            if evalue < evalue_threshold:
                ko_set.add(ko)
    return ko_set

def analyze_genome(ko_set, qs_pathways):
    """Classify one genome's KO set against every QS pathway.

    A pathway is "sent" when all of its biosynthesis KOs are present in
    *ko_set* (an empty biosynthesis set always qualifies), and "received"
    when both its transporter and sensing KO sets are fully covered.

    Returns:
        tuple[str, str]: '/'-joined sending and receiving pathway ids, in
        the iteration order of *qs_pathways*.
    """
    senders = [
        str(pid)
        for pid, info in qs_pathways.items()
        if not info["Biosynthesis"] or info["Biosynthesis"] <= ko_set
    ]
    receivers = [
        str(pid)
        for pid, info in qs_pathways.items()
        if info["Transporter"] <= ko_set and info["Sensing"] <= ko_set
    ]
    return "/".join(senders), "/".join(receivers)

def main():
    """Scan a directory of HMMsearch tables, classify each genome's QS
    sending/receiving pathways, and write the summary CSV."""
    args = parse_args()
    qs_pathways = load_qs_pathways(args.qs_json)

    rows = []
    for entry in os.listdir(args.input_dir):
        if not entry.endswith("_results.tbl"):
            continue
        # Genome name is everything before the "_results.tbl" suffix.
        genome = entry.split("_results.tbl")[0]
        kos = parse_tbl_file(os.path.join(args.input_dir, entry), args.evalue)
        send, receive = analyze_genome(kos, qs_pathways)
        rows.append((genome, send, receive))

    with open(args.output, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(["Genome", "Sending Pathway", "Receiving Pathway"])
        writer.writerows(rows)

    print(f"Summary file saved to: {args.output}")

if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions microbetag/QS/QS_annotation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Command for using QS database to annotate quorum sensing functions in users' genome

set -e

# hmmsearch does not create its output directory; make sure it exists so the
# --tblout/log redirections below don't fail on a fresh checkout.
mkdir -p kegg_hmm_results

for genome in ../genome_dir/*.faa; do
    genome_name=$(basename "$genome" .faa)
    hmmsearch --tblout "kegg_hmm_results/${genome_name}_results.tbl" m02024_genes.hmm "$genome" > "kegg_hmm_results/${genome_name}_log.txt"
done
105 changes: 105 additions & 0 deletions microbetag/QS/Readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# QS Function Annotation

## Step 0: Build Quorum Sensing Database

Script: build_QSdb.sh

Required file(s): **m02024_genes.txt** in the same directory.

Purpose: This script downloads the KEGG Orthology HMM database, extracts HMMs related to KEGG pathway m02024 (quorum sensing), builds an HMMsearch database, and keeps only the final outputs in the current directory.

Command: `bash /path/to/build_QSdb.sh`

Output(s):
`m02024_genes.hmm`
`m02024_genes.hmm.h3f`
`m02024_genes.hmm.h3i`
`m02024_genes.hmm.h3m`
`m02024_genes.hmm.h3p`

## Step 1: Annotate QS functions against QSdb in users' genomes

Script: QS_annotation.sh

Required file(s): 5 related HMM database files generated in Step 0, Users' genomes in protein sequences (.faa).

Purpose: Uses the QS database to annotate quorum sensing functions in users' genomes.

Command: `bash /path/to/QS_annotation.sh`

Output(s): HMM result tabular files.

## Step 2: Annotate genomes with QS sending and receiving pathways based on HMMsearch results

Script: HMMres_to_QS.py

Required file(s): **map02024_parse_dict.json** and HMM result tabular files generated in Step 1.

Command and help:

```
python HMMres_to_QS.py -h
usage: HMMres_to_QS.py [-h] [--evalue EVALUE] [--output OUTPUT]
input_dir qs_json

Annotate genomes with QS sending and receiving pathways based on HMMsearch
results.

positional arguments:
input_dir Directory containing HMMsearch .tbl result files
qs_json Path to QS pathway definition JSON file
(map02024_parse_dict.json)

optional arguments:
-h, --help show this help message and exit
--evalue EVALUE E-value threshold for KO filtering (default: 1e-5)
--output OUTPUT Output CSV file path (default: qs_summary.csv)
```

Result preview:

```
Genome,Sending Pathway,Receiving Pathway
GCF_900111605.1,3/6/7/8/9/10/11/12/13/14/15/17/18/19/20/22/25/27/28/29/30/34,2/3/4/5/6/7/8/9/10/11/12/13/14/15/17/18/20/21/22/23/24/25/26/27/28/29/30/31/32/34/36/37/38/39/40/41
GCA_001883705.1,3/4/5/7/8/9/10/11/12/13/14/15/17/18/19/20/21/22/23/24/25/26/27/28/29/30/31/34,1/2/3/4/5/6/7/8/9/10/11/12/13/14/15/16/17/18/19/20/21/22/23/24/25/26/27/28/29/30/31/32/34/36/37/39/40/41
GCF_014643695.1,1/9/10/11/12/13/14/15/16/27/28/34,4/5/7/8/9/10/11/12/13/14/15/17/18/19/20/21/22/23/24/25/26/27/28/29/30/31/34/37/39/40/41
```

## Step 3: Generate sender–receiver–pathway table from QS summary file

Script: Sender_Receiver_tab.py

Required file(s): **qs_summary.csv** generated in Step 2.

Command and help:

```
python Sender_Receiver_tab.py -h
usage: Sender_Receiver_tab.py [-h] -i INPUT -o OUTPUT [--force-parallel]
[--no-parallel] [--threshold THRESHOLD]

Generate sender–receiver–pathway table from QS summary file.

optional arguments:
-h, --help show this help message and exit
-i INPUT, --input INPUT
Input CSV: qs_sending_receiving_summary.csv
-o OUTPUT, --output OUTPUT
Output CSV: sender_receiver_pathway_table.csv
--force-parallel Force use of parallel processing
--no-parallel Disable parallel processing
--threshold THRESHOLD
Genome count threshold to enable parallel mode
```

Result preview:

```
Sender,Receiver,Pathways
GCF_900111605.1,GCF_900111605.1,3/6/7/8/9/10/11/12/13/14/15/17/18/20/22/25/27/28/29/30/34
GCF_900111605.1,GCA_001883705.1,3/6/7/8/9/10/11/12/13/14/15/17/18/19/20/22/25/27/28/29/30/34
GCF_900111605.1,GCF_014643695.1,7/8/9/10/11/12/13/14/15/17/18/19/20/22/25/27/28/29/30/34
GCF_900111605.1,GCA_017542125.1,3/7/8/9/10/11/12/13/14/15/17/18/20/22/25/27/28/29/30/34
GCF_900111605.1,GCF_018881715.1,6/7/8/9/10/11/12/13/15/17/18/20/22/25/27/28/29/30/34
GCF_900111605.1,GCA_006226495.1,3/6/7/8/9/10/11/12/13/14/15/17/18/19/20/22/25/27/28/29/30/34
```
86 changes: 86 additions & 0 deletions microbetag/QS/Sender_Receiver_tab.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import csv
import argparse
from multiprocessing import Pool, cpu_count
from collections import defaultdict

def parse_pathway_string(pw_str):
    """Split a '/'-separated pathway-id string into a set of id strings.

    An empty or falsy string yields an empty set.
    """
    if not pw_str:
        return set()
    return set(pw_str.split("/"))

def load_genome_data(input_csv):
    """Read the QS summary CSV produced by HMMres_to_QS.py.

    Expects columns "Genome", "Sending Pathway", "Receiving Pathway".

    Returns:
        dict: {genome: {"send": set of pathway ids, "recv": set of ids}}
    """
    data = {}
    with open(input_csv, newline='') as handle:
        for record in csv.DictReader(handle):
            data[record["Genome"]] = {
                "send": parse_pathway_string(record["Sending Pathway"]),
                "recv": parse_pathway_string(record["Receiving Pathway"]),
            }
    return data

def compare_sender_with_all(args):
    """Pool worker: pair one sender against every genome's receiving set.

    *args* is a (sender_name, sender_send_set, genome_data) tuple — packed
    into one argument because Pool.map passes a single item per task.

    Returns:
        list of (sender, receiver, "p1/p2/...") rows, one per receiver that
        shares at least one pathway; shared ids are numerically sorted.
    """
    sender, send_set, genome_data = args
    return [
        (sender, receiver, "/".join(sorted(shared, key=int)))
        for receiver, info in genome_data.items()
        if (shared := send_set & info["recv"])
    ]

def generate_pathway_pairs_parallel(genome_data, output_csv, n_processes=None):
    """Compute sender/receiver pathway pairs with a worker pool and write CSV.

    One task is dispatched per genome that sends at least one pathway; each
    worker compares that sender against every genome's receiving set.

    Args:
        genome_data: mapping from load_genome_data().
        output_csv: destination CSV path.
        n_processes: pool size; defaults to cpu_count() when falsy.
    """
    tasks = [
        (name, info["send"], genome_data)
        for name, info in genome_data.items()
        if info["send"]
    ]
    with Pool(processes=n_processes or cpu_count()) as pool:
        per_sender = pool.map(compare_sender_with_all, tasks)

    with open(output_csv, "w", newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(["Sender", "Receiver", "Pathways"])
        for sender_rows in per_sender:
            writer.writerows(sender_rows)

def generate_pathway_pairs_serial(genome_data, output_csv):
    """Compute sender/receiver pathway pairs in a single process, writing
    each row to *output_csv* as it is found.

    For every (sender, receiver) pair of genomes whose sending/receiving
    sets intersect, one row is emitted with the shared pathway ids
    numerically sorted and '/'-joined. Senders with no pathways are skipped.
    """
    with open(output_csv, "w", newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(["Sender", "Receiver", "Pathways"])
        for sender, sender_info in genome_data.items():
            send_set = sender_info["send"]
            if not send_set:
                continue
            for receiver, receiver_info in genome_data.items():
                shared = send_set & receiver_info["recv"]
                if shared:
                    writer.writerow(
                        (sender, receiver, "/".join(sorted(shared, key=int)))
                    )

def main():
    """CLI entry point: choose serial or parallel mode and build the table.

    Parallel mode is used when forced via --force-parallel or when the
    genome count exceeds --threshold; --no-parallel always wins.
    """
    parser = argparse.ArgumentParser(
        description="Generate sender–receiver–pathway table from QS summary file."
    )
    parser.add_argument("-i", "--input", required=True, help="Input CSV: qs_sending_receiving_summary.csv")
    parser.add_argument("-o", "--output", required=True, help="Output CSV: sender_receiver_pathway_table.csv")
    parser.add_argument("--force-parallel", action="store_true", help="Force use of parallel processing")
    parser.add_argument("--no-parallel", action="store_true", help="Disable parallel processing")
    parser.add_argument("--threshold", type=int, default=1000, help="Genome count threshold to enable parallel mode")
    args = parser.parse_args()

    genome_data = load_genome_data(args.input)
    genome_count = len(genome_data)

    if args.no_parallel:
        # Manual override beats every other mode decision.
        print("Running in serial mode (manual override).")
        generate_pathway_pairs_serial(genome_data, args.output)
    elif args.force_parallel or genome_count > args.threshold:
        print(f"Running in parallel mode (genome count = {genome_count}).")
        generate_pathway_pairs_parallel(genome_data, args.output)
    else:
        print(f"Running in serial mode (genome count = {genome_count}).")
        generate_pathway_pairs_serial(genome_data, args.output)

if __name__ == "__main__":
    main()
39 changes: 39 additions & 0 deletions microbetag/QS/build_QSdb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash

## This script downloads the KEGG Orthology HMM database,
## extracts HMMs related to KEGG pathway m02024 (quorum sensing),
## builds an HMMsearch database, and keeps only the final outputs in the current directory.

# Exit on any error
set -e

# Step 0: Check for m02024_genes.txt in current directory
if [[ ! -f m02024_genes.txt ]]; then
    echo "[ERROR] File 'm02024_genes.txt' not found in the current directory."
    echo "Please make sure 'm02024_genes.txt' is placed in the same directory as this script."
    exit 1
fi

# Step 1: Create temporary working directory.
# The EXIT trap guarantees cleanup even if a later step fails under set -e.
workdir=$(mktemp -d)
trap 'rm -rf "$workdir"' EXIT

# Step 2: Download KEGG HMM database
# -f makes curl exit non-zero on HTTP errors (otherwise an HTML error page
# would be saved and tar would fail later with a confusing message);
# -L follows redirects.
echo "[INFO] Downloading KEGG HMM profiles (around 1Gb, this may take a while) ..."
curl -fL -o "$workdir/profiles.tar.gz" https://www.genome.jp/ftp/db/kofam/profiles.tar.gz
tar -xzf "$workdir/profiles.tar.gz" -C "$workdir"
rm "$workdir/profiles.tar.gz"

# Step 3: Copy required HMMs
# A while-read loop is portable (BSD xargs lacks -a) and avoids the quoting
# pitfalls of the previous xargs + sh -c construction.
mkdir "$workdir/required_hmm"
while IFS= read -r ko; do
    [[ -z "$ko" ]] && continue
    if [[ -f "$workdir/profiles/$ko.hmm" ]]; then
        cp "$workdir/profiles/$ko.hmm" "$workdir/required_hmm/"
    else
        echo "[WARNING] File not found: $ko.hmm"
    fi
done < m02024_genes.txt

# Step 4: Build HMM database
cat "$workdir"/required_hmm/*.hmm > m02024_genes.hmm
hmmpress m02024_genes.hmm

# Step 5: Temporary files are removed by the EXIT trap above.

echo "[INFO] HMM database successfully built:"
ls m02024_genes.hmm*
Loading