Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5,228 changes: 5,228 additions & 0 deletions .github/data/card_db/aro_index.tsv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions .github/test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ card:
num_parts: 1 # number of chunks the fastqs are split into
max_threads: 4

github_action_test: True
similarity_search_mode: "test" # Put here "test" or "full" for strand/s to be included in the similarity search

seq_tech: "Illumina" # Put here "Illumina" or "ONT"
14 changes: 9 additions & 5 deletions .test_steps/test_ERMA.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -242,12 +242,15 @@
"# === Filter Script ===\n",
"dtype_dict = {\n",
" \"query_id\": \"string\",\n",
" \"subject_id\": \"string\",\n",
" \"perc_identity\": \"float\",\n",
" \"align_length\": \"int\",\n",
" \"evalue\": \"float\",\n",
" \"part\": \"string\",\n",
" \"genus\": \"string\",\n",
" \"AMR Gene Family\": \"string\",\n",
" \"Drug Class\": \"string\",\n",
" \"ARO Name\": \"string\"\n",
"}\n",
"\n",
"\n",
Expand Down Expand Up @@ -307,6 +310,7 @@
"def rename_for_merge(df,part):\n",
" df_renamed = df.rename(columns={\n",
" \"perc_identity\": \"perc_identity_\"+part,\n",
" \"subject_id\": \"subject_id_\"+part,\n",
" \"align_length\": \"align_length_\"+part,\n",
" \"evalue\": \"evalue_\"+part,\n",
" })\n",
Expand Down Expand Up @@ -339,8 +343,8 @@
"\n",
" # Merge side-by-side on query_id\n",
" merged = pd.merge(\n",
" abr_final[[\"query_id\", \"AMR Gene Family\", \"perc_identity_ABR\", \"align_length_ABR\", \"evalue_ABR\"]],\n",
" s16_final[[\"query_id\", \"genus\", \"perc_identity_16S\", \"align_length_16S\", \"evalue_16S\"]],\n",
" abr_final[[\"query_id\", \"AMR Gene Family\", \"perc_identity_ABR\", \"align_length_ABR\", \"evalue_ABR\", \"Drug Class\", \"ARO Name\"]],\n",
" s16_final[[\"query_id\", \"subject_id_16S\",\"genus\", \"perc_identity_16S\", \"align_length_16S\", \"evalue_16S\"]],\n",
" on=\"query_id\",\n",
" how=\"inner\",\n",
" )\n",
Expand Down Expand Up @@ -1635,9 +1639,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "sm",
"display_name": "ipynb",
"language": "python",
"name": "python3"
"name": "ipynb"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -1649,7 +1653,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.9.19"
}
},
"nbformat": 4,
Expand Down
14 changes: 11 additions & 3 deletions config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,24 +1,32 @@
runname: "Rozman_24_07_25"
runname: "test"

# Setting up base directory and location of fastq files. Generally, no changes needed here.

base_dir: "."
fastq_dir: "data/fastq" # Copy target fastq.gz files in ERMA/data/fastq or change this path
outdir: "results" # Output directory of the final report

min_similarity: "0.85" # threshold to pre-filter blast hits by percentage identity
min_similarity: "0.80" # threshold to pre-filter blast hits by percentage identity
min_abundance: "0.01" # genera with lower abundance will be binned as "Other" in stacked bar abundance plot

silva:
download_path_seq: "https://www.arb-silva.de/fileadmin/silva_databases/release_138_2/Exports/SILVA_138.2_SSURef_NR99_tax_silva.fasta.gz"
#download_path_seq: "https://www.arb-silva.de/fileadmin/silva_databases/release_138_1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz"
#download_path_seq: "https://www.arb-silva.de/fileadmin/silva_databases/release_138/Exports/SILVA_138_SSURef_NR99_tax_silva.fasta.gz"
#download_path_seq: "https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva.fasta.gz"
download_path_tax: "https://www.arb-silva.de/fileadmin/silva_databases/release_138_2/Exports/taxonomy/taxmap_slv_ssu_ref_nr_138.2.txt.gz"
#download_path_tax: "https://www.arb-silva.de/fileadmin/silva_databases/release_138_1/Exports/taxonomy/taxmap_slv_ssu_ref_nr_138.1.txt.gz"
#download_path_tax: "https://www.arb-silva.de/fileadmin/silva_databases/release_138/Exports/taxonomy/taxmap_slv_ssu_ref_nr_138.txt.gz"
#download_path_tax: "https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/taxonomy/taxmap_slv_ssu_ref_nr_128.txt.gz"

card:
download_path: "https://card.mcmaster.ca/download/0/broadstreet-v3.3.0.tar.bz2"
#download_path: "https://card.mcmaster.ca/download/0/broadstreet-v3.3.0.tar.bz2"
download_path: "https://card.mcmaster.ca/download/0/broadstreet-v4.0.1.tar.bz2"

num_parts: 1 # number of chunks the fastqs are split into
max_threads: 16

github_action_test: False
similarity_search_mode: "full" # Put here "test" or "full" for strand/s to be included in the similarity search

input_validation: "yes"
Expand Down
59 changes: 33 additions & 26 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ configfile: "config/config.yaml"
include: "rules/common.smk"
include: "rules/load_db.smk"
include: "rules/preprocess.smk"
include: "rules/postprocess.smk"
include: "rules/qc.smk"
include: "rules/simsearch_and_process.smk"
include: "rules/boxplots.smk"
Expand Down Expand Up @@ -40,34 +41,40 @@ print(
f"Configured sequencing technology in preprocessing (for merging/demultiplexing):\t{seq_tech}\n"
)

if not config["github_action_test"]:
rule all:
input:
local(f"{outdir}/{runname}_report.zip"),
local("results/single_sample_similarity_search_data.tar.gz"),

rule all:
input:
local(f"{outdir}/{runname}_report.zip"),

report: "../report/workflow.rst"

report: "../report/workflow.rst"

rule snakemake_report:
input:
local("results/single_sample_similarity_search_data.tar.gz"),
local("logs/logs.json"),
log:
local("logs/report/report.log"),
conda:
"envs/snakemake.yaml"
output:
local(f"{outdir}/{runname}_report.zip"),
shell:
"""
snakemake --nolock --report {output} --report-stylesheet config/custom-stylesheet.css 2>> {log}
"""

rule snakemake_report:
input:
local("results/abundance/stacked_bar_abundance_plot.html"),
local("results/abundance/combined_genus_abundance_bubbleplot.html"),
local("results/abundance/abundance_data.html"),
local("results/boxplots/combined_allength_boxplot.png"),
local("results/boxplots/combined_evalue_boxplot.png"),
local("results/boxplots/combined_percidt_boxplot.png"),
local("results/qc/multiqc.html"),
local("results/qc/attrition_plot.png"),
local("results/qc/overview_table.html"),
local(expand("results/{sample}/genus_idt_per_genus_plot.png", sample=samples)),
log:
local("logs/report/report.log"),
conda:
"envs/snakemake.yaml"
output:
local(f"{outdir}/{runname}_report.zip"),
shell:
"""
snakemake --nolock --report {output} --report-stylesheet config/custom-stylesheet.css 2>> {log}
"""
elif config["github_action_test"]:
rule all:
input:
local("results/abundance/stacked_bar_abundance_plot.html"),
local("results/abundance/combined_genus_abundance_bubbleplot.html"),
local("results/abundance/abundance_data.html"),
local("results/boxplots/combined_allength_boxplot.png"),
local("results/boxplots/combined_evalue_boxplot.png"),
local("results/boxplots/combined_percidt_boxplot.png"),
local("results/qc/multiqc.html"),
local("results/qc/attrition_plot.png"),
local("results/qc/overview_table.html"),
2 changes: 1 addition & 1 deletion workflow/rules/boxplots.smk
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ rule generate_percidt_genus:
),
output:
report(
local("results/{sample}/genus_idt_per_genus_plot.png"),
local("results/boxplots/{sample}_idt_genus_plot.png"),
caption="../../report/identity_read_count_per_genus.rst",
category="2. Samplewise Genus per Percentage Identity",
subcategory="{sample}",
Expand Down
38 changes: 38 additions & 0 deletions workflow/rules/postprocess.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright 2024 Adrian Dörr.
# Licensed under the MIT License (https://opensource.org/license/mit)
# This file may not be copied, modified, or distributed
# except according to those terms.


rule tar_single_sample_dirs:
input:
local("results/abundance/stacked_bar_abundance_plot.html"),
local("results/abundance/combined_genus_abundance_bubbleplot.html"),
local("results/abundance/abundance_data.html"),
local("results/boxplots/combined_allength_boxplot.png"),
local("results/boxplots/combined_evalue_boxplot.png"),
local("results/boxplots/combined_percidt_boxplot.png"),
local("results/qc/multiqc.html"),
local("results/qc/attrition_plot.png"),
local("results/qc/overview_table.html"),
output:
local("results/single_sample_similarity_search_data.tar.gz")
log:
local("logs/tar/tar.log")
conda:
"../envs/snakemake.yaml"
script:
"../scripts/postprocess_tar_intermediates.py"


rule concatenate_logs:
input:
local("results/single_sample_similarity_search_data.tar.gz")
output:
local("logs/logs.json")
log:
local("logs/tar/concat.log")
conda:
"../envs/snakemake.yaml"
script:
"../scripts/postprocess_concat_logs.py"
6 changes: 3 additions & 3 deletions workflow/rules/simsearch_and_process.smk
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ rule diamond_card:
card=local("data/card_db/card_db.dmnd"),
output:
card_results=local("results/{sample}/{part}/card_results.txt"),
overview_table=local("results/{sample}/{part}/overview_table.txt"),
overview_table=local(temp("results/{sample}/{part}/overview_table.txt")),
params:
internal_threads=config["max_threads"],
log:
Expand Down Expand Up @@ -54,7 +54,7 @@ if config["similarity_search_mode"] == "full":

rule usearch_silva:
input:
overview_table=local("results/{sample}/{part}/overview_table.txt"),
overview_table=local( "results/{sample}/{part}/overview_table.txt"),
fasta=local("results/fastq/split/{sample}.part_{part}.fasta"),
silva=local("data/silva_db/silva_seq.fasta"),
output:
Expand Down Expand Up @@ -153,7 +153,7 @@ rule table_combined_genera_abundance:
params:
sample_name=samples,
log:
local("logs/genera_abundance_table.log"),
local("logs/genera_abundance/genera_abundance_table.log"),
conda:
"../envs/python.yaml"
threads: config["max_threads"]
Expand Down
6 changes: 3 additions & 3 deletions workflow/rules/visualizing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ rule plot_abundance_data:
labels={"HTML": "Abundance data"},
),
log:
local("logs/genera_abundance_plot.log"),
local("logs/genera_abundance/genera_abundance_plot.log"),
conda:
"../envs/python.yaml"
threads: config["max_threads"]
Expand All @@ -33,7 +33,7 @@ rule plot_abundance_bubble:
labels={"figure": "Abundance Bubble Plot"},
),
log:
local("logs/genera_abundance_plot.log"),
local("logs/genera_abundance/genera_abundance_plot.log"),
conda:
"../envs/python.yaml"
threads: config["max_threads"]
Expand All @@ -52,7 +52,7 @@ rule plot_stacked_bar_abundance:
labels={"figure": "Stacked Bar Abundance Plot"},
),
log:
local("logs/stacked_bar_abundance_plot.log"),
local("logs/stacked_bar_abundance/stacked_bar_abundance_plot.log"),
params:
min_abundance = config["min_abundance"]
conda:
Expand Down
18 changes: 17 additions & 1 deletion workflow/scripts/filter_blast_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,15 @@

dtype_dict = {
"query_id": "string",
"subject_id": "string",
"perc_identity": "float",
"align_length": "int",
"evalue": "float",
"part": "string",
"genus": "string",
"AMR Gene Family": "string",
"Drug Class": "string",
"ARO Name": "string",
}


Expand All @@ -34,7 +37,10 @@ def write_dummy_line(output_file):
print("Detected only a dummy 16S line — generating merged dummy output.")
dummy_line = {
"query_id": "dummy",
"subject_id_16S": "dummy",
"AMR Gene Family": "NA",
"Drug Class": "string",
"ARO Name": "string",
"perc_identity_ABR": 0,
"align_length_ABR": 0,
"evalue_ABR": 0,
Expand Down Expand Up @@ -96,6 +102,7 @@ def rename_for_merge(df, part):
"perc_identity": "perc_identity_" + part,
"align_length": "align_length_" + part,
"evalue": "evalue_" + part,
"subject_id": "subject_id_" + part,
}
)
return df_renamed
Expand Down Expand Up @@ -174,10 +181,19 @@ def filter_blast_results(input_file, output_file, min_similarity, overview_table
"perc_identity_ABR",
"align_length_ABR",
"evalue_ABR",
"Drug Class",
"ARO Name",
]
],
s16_final[
["query_id", "genus", "perc_identity_16S", "align_length_16S", "evalue_16S"]
[
"query_id",
"genus",
"perc_identity_16S",
"align_length_16S",
"evalue_16S",
"subject_id_16S",
]
],
on="query_id",
how="inner",
Expand Down
18 changes: 16 additions & 2 deletions workflow/scripts/integrate_blast_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,23 @@ def write_dummy_line(output_file, part):
dummy_row = ["dummy.dummy", "dummy", "100"] + ["0"] * 9
if part == "16S":
header = blast_columns + ["part", "primaryAccession", "genus"]
dummy_row = dummy_row + ["16S", "dummy", "0", "dummy", "dummy"]
dummy_row = dummy_row + ["16S", "dummy", "dummy"]
elif part == "ABR":
header = blast_columns + ["part","ARO Accession", "CVTERM ID","Model Sequence ID","Model ID","Model Name","ARO Name","Protein Accession","DNA Accession","AMR Gene Family","Drug Class","Resistance Mechanism","CARD Short Name"]
header = blast_columns + [
"part",
"ARO Accession",
"CVTERM ID",
"Model Sequence ID",
"Model ID",
"Model Name",
"ARO Name",
"Protein Accession",
"DNA Accession",
"AMR Gene Family",
"Drug Class",
"Resistance Mechanism",
"CARD Short Name",
]
dummy_row = dummy_row + ["ABR", "dummy"] + ["0"] * 3 + ["dummy"] * 8
else:
raise ValueError("Invalid part specified. Must be 'ABR' or '16S'.")
Expand Down
Loading