IKIM-Essen · adriandrr · Jul 14, 2025 · Jun 12, 2025 · Jun 19, 2025 · Jul 2, 2025
diff --git a/.test_steps/test_ERMA.ipynb b/.test_steps/test_ERMA.ipynb
diff --git a/config/config.yaml b/config/config.yaml
@@ -7,6 +7,7 @@ fastq_dir: "data/fastq" # Copy target fastq.gz files in ERMA/data/fastq or chang
 outdir: "results" # Output directory of the final report
 
 min_similarity: "0.8" # threshold to pre-filter blast hits by percentage identity
+min_abundance: "0.01" # genera with lower abundance will be binned as "Other" in stacked bar abundance plot
 
 silva:
   download_path_seq: "https://www.arb-silva.de/fileadmin/silva_databases/release_138_2/Exports/SILVA_138.2_SSURef_NR99_tax_silva.fasta.gz"
@@ -18,7 +19,7 @@ card:
 num_parts: 1 # number of chunks the fastqs are split into
 max_threads: 16
 
-similarity_search_mode: "test" # Put here "test" or "full" for strand/s to be included in the similarity search
+similarity_search_mode: "full" # Put here "test" or "full" for strand/s to be included in the similarity search
 
 ### Preprocessing ###
 # if data is already in format 'one fastq.gz per sample', this section can be ignored

diff --git a/report/abundance_bubble_plot.rst b/report/abundance_bubble_plot.rst
@@ -1,7 +1,8 @@
 Abundance Bubble Plot
 
 The Abundance Bubble Plot visualizes the distribution and abundance of genera linked to antimicrobial resistance genes.
-Each bubble represents a genus-AMR gene pair, with the bubble size indicating the relative count of similarity search hits found for this genus-AMR pair per sample. The colour of the bubble indicates the total counts of AMR-genus hits after filtering.
+Each bubble represents a genus-AMR gene pair, with the bubble size indicating the relative count of similarity search hits found for this genus-AMR pair per sample. 
+The colour of the bubble indicates the number of assigned fusion reads after filtering.
 The position of bubbles allows comparisons of genera and resistance genes across samples.
 
 The bubble plot is generated once for every AMR with more than 100 total respective similarity search hits. For representability, only 20 genera are shown.

diff --git a/report/abundance_data.rst b/report/abundance_data.rst
@@ -0,0 +1,7 @@
+Abundance data
+
+The Abundance data table shows a breakdown of the number of Fusion Reads per Genus and the assignment to AMR Gene Family and Sample.
+Additionally, a relative distribution is shown related to the Total Fusion Read sum per sample. 
+The Total Fusion Read sum per AMR Gene Family is shown beneath the AMR name.
+
+This data is the basis for the abundance bubble plot and the Stacked bar abundance plot.
diff --git a/report/attrition_plot.rst b/report/attrition_plot.rst
@@ -0,0 +1,23 @@
+Attrition Plot
+
+The Plot shows the number for:
+
+- all fasta files (Number of FastQ input reads), 
+
+- the sum of generated hits through similarity search before (Merged similarity hits),
+
+- and after filtering (Filtered fusion reads)
+
+In front of the second plot, there is a thinner stacked bar showing the count discrepancy between unfiltered and filtered hits.
+The reasons for filtering are explained as following:
+- "Diamond hits < similarity threshold": All hits that have a smaller query/subject - percentage identity as the minimum threshold defined in the config file
+
+- "Diamond hits NOT highest percentage identity per query":  Only the hits with the maximum percentage identity per query ID are accepted. The number of hits filtered for this reason can be seen here
+
+- "Usearch hits > similarity threshold": analogous to "ABR < similarity threshold"
+
+- "Usearch hits NOT highest percentage identity per query": analogous to "ABR Hit not max identity for query ID"
+
+- "Query hit in only one of two databases": In the final filtering step, only those hits gets accepted for which querys can be found in both databases after the previous filterings
+
+Raw data used for this plot can be found in "4. QC/Count Overview Table"
diff --git a/report/count_overview_per_sample.rst b/report/count_overview_per_sample.rst
diff --git a/report/reads_per_AMR.rst b/report/reads_per_AMR.rst
diff --git a/report/stacked_bar_abundance_plot.rst b/report/stacked_bar_abundance_plot.rst
@@ -0,0 +1,7 @@
+Stacked bar abundance plot
+
+The Interactive plot visualizes the relative abundance of genera per sample according to assigned fusion reads after filtration.
+
+According to the 'min_abundance'-parameter in the config file, genera with a lower abundance than this threshold are binned to "Other".
+
+If several AMR's are found with a Fusion Read count of more than 1% to the total, one plot is created per AMR.
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -7,12 +7,12 @@ configfile: "config/config.yaml"
 
 
 include: "rules/common.smk"
-include: "rules/qc.smk"
 include: "rules/load_db.smk"
+include: "rules/preprocess.smk"
+include: "rules/qc.smk"
 include: "rules/simsearch_and_process.smk"
 include: "rules/boxplots.smk"
-include: "rules/abundance.smk"
-include: "rules/preprocess.smk"
+include: "rules/visualizing.smk"
 
 # Build a list of all "fastq.gz" files in "ERMA/data/fastq" which will be processed subsequently
 samples = [
@@ -48,15 +48,15 @@ report: "../report/workflow.rst"
 
 rule snakemake_report:
     input:
+        local("results/abundance/stacked_bar_abundance_plot.html"),
         local("results/abundance/combined_genus_abundance_bubbleplot.html"),
-        local("results/abundance/reads_per_found_AMR.html"),
+        local("results/abundance/abundance_data.html"),
         local("results/boxplots/combined_allength_boxplot.png"),
         local("results/boxplots/combined_evalue_boxplot.png"),
         local("results/boxplots/combined_percidt_boxplot.png"),
         local("results/qc/multiqc.html"),
-        local("results/qc/overview_plot.png"),
-        local(expand("results/{sample}/overview_table.html", sample=samples)),
-        local(expand("results/{sample}/genus_abundance.html", sample=samples)),
+        local("results/qc/attrition_plot.png"),
+        local("results/qc/overview_table.html"),        
         local(expand("results/{sample}/genus_idt_per_genus_plot.png", sample=samples)),
     log:
         local("logs/report/report.log"),

diff --git a/workflow/rules/abundance.smk b/workflow/rules/abundance.smk
diff --git a/workflow/rules/boxplots.smk b/workflow/rules/boxplots.smk
@@ -16,9 +16,9 @@ rule generate_percidt_genus:
         report(
             local("results/{sample}/genus_idt_per_genus_plot.png"),
             caption="../../report/identity_read_count_per_genus.rst",
-            category="2. Single Sample Abundance Data",
+            category="2. Samplewise Genus per Percentage Identity",
             subcategory="{sample}",
-            labels={"sample": "{sample}", "figure": "Identity/Read Count per Genus"},
+            labels={"sample": "{sample}", "Plot": "Identity/Read Count per Genus"},
         ),
     log:
         local("logs/generate_percidt_genus/{sample}.log"),

diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
@@ -37,47 +37,15 @@ rule multiqc_report:
             local("results/qc/multiqc.html"),
             caption="../../report/multiqc.rst",
             category="4. QC",
-            labels={"File": "MultiQC Report"},
+            labels={"HTML": "MultiQC Report"},
         ),
     log:
         local("logs/multiqc/multiqc.log"),
     wrapper:
         "v6.0.1/bio/multiqc"
 
 
-rule merge_overview_per_sample:
-    input:
-        checkpoint=local(
-            expand(
-                "results/{sample}/{part}/checkpoint.txt",
-                sample=samples,
-                part=get_numpart_list(),
-            )
-        ),
-        overview_tables=local(
-            expand(
-                "results/{{sample}}/{part}/overview_table.txt", part=get_numpart_list()
-            )
-        ),
-    output:
-        report(
-            local("results/{sample}/overview_table.html"),
-            caption="../../report/count_overview_per_sample.rst",
-            category="2. Single Sample Abundance Data",
-            subcategory="{sample}",
-            labels={"sample": "{sample}", "table": "Count Overview"},
-        ),
-    params:
-        sample_name="{sample}",
-    log:
-        local("logs/merge_overview/{sample}/combined.log"),
-    conda:
-        "../envs/python.yaml"
-    script:
-        "../scripts/merge_overview_smpl.py"
-
-
-rule merge_overview_to_one:
+rule table_overview_to_one:
     input:
         checkpoint=local(
             expand(
@@ -98,28 +66,8 @@ rule merge_overview_to_one:
     params:
         sample_name=samples,
     log:
-        local("logs/merge_overview/combined.log"),
-    conda:
-        "../envs/python.yaml"
-    script:
-        "../scripts/merge_overview_all.py"
-
-
-rule plot_overview:
-    input:
-        overview_table=local("results/qc/overview_table.txt"),
-    output:
-        report(
-            local("results/qc/overview_plot.png"),
-            caption="../../report/count_overview.rst",
-            category="4. QC",
-            labels={"File": "Count Overview"},
-        ),
-    params:
-        sample_name=samples,
-    log:
-        local("logs/plot_overview/combined.log"),
+        local("logs/table_overview/combined.log"),
     conda:
         "../envs/python.yaml"
     script:
-        "../scripts/plot_overview.py"
+        "../scripts/table_overview_all.py"
diff --git a/workflow/rules/simsearch_and_process.smk b/workflow/rules/simsearch_and_process.smk
@@ -21,8 +21,8 @@ rule diamond_card:
     shell:
         """
         diamond blastx -d {input.card} -q {input.fasta} -o {output.card_results} --outfmt 6 --evalue 1e-5 --quiet --threads {params.internal_threads} 2> {log}
-        echo -ne "fastq input reads,{wildcards.sample},{wildcards.part},$(cat {input.fasta}|grep -c '^>')\n" >> {output.overview_table}
-        echo -ne "diamond output hits,{wildcards.sample},{wildcards.part},$(cat {output.card_results}|wc -l)\n" >> {output.overview_table}
+        echo -ne "Number of FastQ input reads,{wildcards.sample},{wildcards.part},$(cat {input.fasta}|grep -c '^>')\n" >> {output.overview_table}
+        echo -ne "Diamond output hits,{wildcards.sample},{wildcards.part},$(cat {output.card_results}|wc -l)\n" >> {output.overview_table}
         """
 
 
@@ -46,7 +46,7 @@ if config["similarity_search_mode"] == "test":
         shell:
             """
             usearch -usearch_local {input.fasta} -db {input.silva} -blast6out {output.silva_results} -evalue 1e-5 -threads {params.internal_threads} -strand plus -mincols 200 > {log} 2>&1
-            echo -ne "usearch output hits,{wildcards.sample},{wildcards.part},$(cat {output.silva_results}|wc -l)\n" >> {input.overview_table}
+            echo -ne "Usearch output hits,{wildcards.sample},{wildcards.part},$(cat {output.silva_results}|wc -l)\n" >> {input.overview_table}
             """
 
 
@@ -70,7 +70,7 @@ if config["similarity_search_mode"] == "full":
         shell:
             """
             usearch -usearch_local {input.fasta} -db {input.silva} -blast6out {output.silva_results} -evalue 1e-5 -threads {params.internal_threads} -strand both -mincols 200 2> {log}
-            echo -ne "usearch output hits,{wildcards.sample},{wildcards.part},$(cat {output.silva_results}|wc -l)\n" >> {input.overview_table}
+            echo -ne "Usearch output hits,{wildcards.sample},{wildcards.part},$(cat {output.silva_results}|wc -l)\n" >> {input.overview_table}
             """
 
 
@@ -80,8 +80,6 @@ rule integrate_blast_data:
         card_results=local("results/{sample}/{part}/card_results.txt"),
         silva_results=local("results/{sample}/{part}/SILVA_results.txt"),
         aro_mapping=local("data/card_db/aro_index.tsv"),
-        dummy_ABR=local("data/dummy/ABR.dummy"),
-        dummy_16S=local("data/dummy/16S.dummy"),
     output:
         intermed_card_results=local(
             temp("results/{sample}/{part}/intermed_card_results.csv")
@@ -139,3 +137,25 @@ rule gzip_intermediates:
         gzip {input.filt_data} 2>> {log}
         touch {output.checkpoint}
         """
+
+
+rule table_combined_genera_abundance:
+    input:
+        filtered_data=local(
+            expand(
+                "results/{sample}/{part}/filtered_results.csv.gz",
+                sample=samples,
+                part=get_numpart_list(),
+            )
+        ),
+    output:
+        csv=local("results/abundance/combined_genus_abundance.csv"),
+    params:
+        sample_name=samples,
+    log:
+        local("logs/genera_abundance_table.log"),
+    conda:
+        "../envs/python.yaml"
+    threads: config["max_threads"]
+    script:
+        "../scripts/table_combined_genera_abundance.py"