IKIM-Essen · adriandrr · May 20, 2025 · May 20, 2025 · May 20, 2025 · May 20, 2025
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![Snakemake](https://img.shields.io/badge/snakemake-≥9.0-brightgreen.svg)](https://snakemake.bitbucket.io)
 [![Snakemake CI](https://github.com/IKIM-Essen/ERMA/actions/workflows/snakemake-ci.yml/badge.svg)](https://github.com/IKIM-Essen/ERMA/actions/workflows/snakemake-ci.yml)
 
-This pipeline is designed to process sequencing reads obtained from epicPCR experiments, linking antimicrobial resistance (AMR) genes with 16S rRNA genes. The pipeline uses **Snakemake** to manage the workflow and integrates tools for downloading necessary databases, running sequence alignments, filtering, and generating visual reports.
+This **Snakemake**-based pipeline processes sequencing reads from epicPCR experiments to link antimicrobial resistance (AMR) genes with genus-specific 16S rRNA gene sequences from prokaryotic cells. It orchestrates all workflow steps, including database downloads, sequence alignment, read filtering, and the generation of visual reports, ensuring reproducibility and streamlined analysis.
 
 ## Features
 - Downloads and prepares SILVA and CARD databases for use in the analysis.
@@ -17,10 +17,11 @@ This pipeline is designed to process sequencing reads obtained from epicPCR expe
 ## Prerequisites
 
 ### Install Snakemake
-The pipeline requires **Snakemake** with support for conda environments. You can install it using conda (via Miniconda or Anaconda):
+The pipeline requires **Snakemake** (≥ Version 9.0) with support for conda environments. You can install it using conda (via Miniconda or Anaconda):
 
 ```bash
-mamba install -c conda-forge -c bioconda snakemake --yes; mamba install -c bioconda snakemake-storage-plugin-fs --yes
+mamba create -n snakemake9 -c conda-forge -c bioconda snakemake=9 --yes; mamba install -n snakemake9 -c bioconda snakemake-storage-plugin-fs --yes
+mamba activate snakemake9
 ```
 
 Alternatively, follow the official [Snakemake installation](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html) guide for more documentation and options.
@@ -29,10 +30,10 @@ Install Dependencies: This pipeline uses conda environments to manage its depend
 
 ## Usage Instructions
 
-Clone the repository: First, clone the pipeline repository to your local machine:
+First, clone the pipeline repository to your local machine:
 
 ```bash
-git clone https://github.com/your-username/ERMA.git
+git clone https://github.com/IKIM-essen/ERMA.git
 cd ERMA
 ```
 Prepare Data Folder: You need to place your raw sequencing files (fastq.gz format) in the data/fastq/ directory or change this path to the desired directory.
@@ -111,6 +112,14 @@ To run the pipeline in the default mode, a profile with snakemake specific param
 snakemake --profile profile
 ```
 
+### Testing
+
+For testing the workflow you can copy the provided dummy data:
+
+```
+cp .github/data/fastq/test_epic_data.fastq.gz data/fastq/
+```
+
 ## Additional Notes
 
 The pipeline is designed to handle large sequencing datasets in parallel, so it's recommended to run it on a machine with sufficient computational resources. However, to run the pipeline on machines with less resources, it is recommended to split the fastq files or the tables in smaller chunks to prevent RAM  overflow. This can be done by increase num_parts in the config file. However, the higher the number of parts per FASTQ file, the higher the chance some blast results for the same read will be split and lost.

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -1,6 +1,9 @@
 import glob, os
+
+
 configfile: "config/config.yaml"
 
+
 include: "rules/common.smk"
 include: "rules/qc.smk"
 include: "rules/load_db.smk"
@@ -9,50 +12,56 @@ include: "rules/boxplots.smk"
 include: "rules/abundance.smk"
 include: "rules/preprocess.smk"
 
+
 # Build a list of all "fastq.gz" files in "ERMA/data/fastq" which will be processed subsequently
-samples = [os.path.basename(f).replace(".fastq.gz", "") for f in glob.glob(os.path.join(config["base_dir"], config["fastq_dir"], "*.fastq.gz"))]
+samples = [
+    os.path.basename(f).replace(".fastq.gz", "")
+    for f in glob.glob(
+        os.path.join(config["base_dir"], config["fastq_dir"], "*.fastq.gz")
+    )
+]
 runname = "".join(config["runname"])
 seq_tech = "".join(config["seq_tech"])
 outdir = "".join(config["outdir"])
 
-print("Important Run Parameter:\n")
+print("Run Parameters:\n")
 print(f"Base directory: \t{os.path.abspath(config['base_dir'])}")
-print(f"Sample location:\t{os.path.abspath(config['fastq_dir'])}")
-print(f"Report out location:\t{os.path.abspath(config['outdir'])}")
+print(f"Input location:\t{os.path.abspath(config['fastq_dir'])}")
+print(f"Output location:\t{os.path.abspath(config['outdir'])}")
 print(f"Detected samples:\t{samples}")
-print(f"Detected Runname:\t{runname}")
-print(f"Detected Seqtech:\t{seq_tech} (This Parameter is for demultiplexing/merging preprocess only!)\n")
+print(f"Detected runname:\t{runname}")
+print(
+    f"Configured sequencing technology in preprocessing (for merging/demultiplexing):\t{seq_tech}\n"
+)
+
 
 rule all:
     input:
-        local(f"{outdir}/{runname}_report.zip")
+        local(f"{outdir}/{runname}_report.zip"),
+
 
 report: "../report/workflow.rst"
 
+
 rule snakemake_report:
-    input:              
+    input:
         local("results/abundance/combined_genus_abundance_bubbleplot.html"),
         local("results/abundance/reads_per_found_AMR.html"),
-        local("results/boxplots/combined_allength_boxplot.png"),        
-        local("results/boxplots/combined_evalue_boxplot.png"),        
-        local("results/boxplots/combined_percidt_boxplot.png"),        
+        local("results/boxplots/combined_allength_boxplot.png"),
+        local("results/boxplots/combined_evalue_boxplot.png"),
+        local("results/boxplots/combined_percidt_boxplot.png"),
         local("results/qc/multiqc.html"),
         local("results/qc/overview_plot.png"),
-        local(expand("results/{sample}/overview_table.html",sample=samples)),
-        local(expand("results/{sample}/genus_abundance.html", sample=samples)),        
-        local(expand("results/{sample}/genus_idt_per_genus_plot.png", sample=samples)),        
+        local(expand("results/{sample}/overview_table.html", sample=samples)),
+        local(expand("results/{sample}/genus_abundance.html", sample=samples)),
+        local(expand("results/{sample}/genus_idt_per_genus_plot.png", sample=samples)),
     log:
         local("logs/report/report.log"),
     conda:
         "envs/snakemake.yaml"
     output:
-        local(f"{outdir}/{runname}_report.zip")
+        local(f"{outdir}/{runname}_report.zip"),
     shell:
         """        
         snakemake --nolock --report {output} --report-stylesheet config/custom-stylesheet.css 2>> {log}
         """
-
-# Prevents wildcard confusion by preventing dots or slashes in sample and keeps part numerical
-wildcard_constraints:
-    sample = "[a-zA-Z0-9_]+",
-    part = "\\d+"
diff --git a/workflow/envs/diamond.yaml b/workflow/envs/diamond.yaml
@@ -2,6 +2,7 @@ channels:
   - bioconda
   - conda-forge
   - anaconda
+  - nodefaults
 dependencies:
   - diamond=2.1.10
   - libzlib=1.2.13

diff --git a/workflow/envs/python.yaml b/workflow/envs/python.yaml
@@ -2,11 +2,12 @@ channels:
   - conda-forge
   - bioconda
   - anaconda
+  - nodefaults
 dependencies:
-  - pandas
-  - python
+  - pandas=2.2.3
+  - python=3.12
   - samtools=1.3
-  - matplotlib
+  - matplotlib=3.10
   - seaborn=0.13.2
   - altair=5.5.0
   - seqtk=1.3

diff --git a/workflow/envs/snakemake.yaml b/workflow/envs/snakemake.yaml
@@ -1,5 +1,6 @@
 channels:
   - conda-forge
   - bioconda
+  - nodefaults
 dependencies:
   - snakemake=7.32
diff --git a/workflow/envs/usearch.yaml b/workflow/envs/usearch.yaml
@@ -1,4 +1,5 @@
 channels:                                                                                                                                                            
-  - bioconda                                                                                                                                                           
+  - bioconda
+  - nodefaults                                                                                                                                                           
 dependencies:
   - usearch=12.0
diff --git a/workflow/rules/abundance.smk b/workflow/rules/abundance.smk
@@ -1,75 +1,92 @@
+# Copyright 2024 Adrian Dörr.
+# Licensed under the MIT License (https://opensource.org/license/mit)
+# This file may not be copied, modified, or distributed
+# except according to those terms.
+
+
 rule single_genera_abundance_table:
     input:
-        filtered_data = local(expand("results/{{sample}}/{part}/filtered_results.csv.gz",part=get_numpart_list())),
+        filtered_data=local(
+            expand(
+                "results/{{sample}}/{part}/filtered_results.csv.gz",
+                part=get_numpart_list(),
+            )
+        ),
     output:
         report(
             local("results/{sample}/genus_abundance.html"),
-            caption = "../../report/genus_abundance_table.rst",
+            caption="../../report/genus_abundance_table.rst",
             category="2. Single Sample Abundance Data",
             subcategory="{sample}",
-            labels={
-                "sample": "{sample}",
-                "table": "Genera Abundance"
-            }
-        )        
+            labels={"sample": "{sample}", "table": "Genera Abundance"},
+        ),
     params:
-        sample_name = "{sample}",
-        parts = get_numpart_list()
+        sample_name="{sample}",
+        parts=get_numpart_list(),
     log:
-        local("logs/{sample}/genera_abundance_table.log")
+        local("logs/{sample}/genera_abundance_table.log"),
     conda:
         "../envs/python.yaml"
     threads: config["max_threads"]
     script:
         "../scripts/single_genera_abundance_table.py"
 
+
 rule combined_genera_abundance_table:
     input:
-        filtered_data = local(expand("results/{sample}/{part}/filtered_results.csv.gz",sample=samples,part=get_numpart_list())),
+        filtered_data=local(
+            expand(
+                "results/{sample}/{part}/filtered_results.csv.gz",
+                sample=samples,
+                part=get_numpart_list(),
+            )
+        ),
     output:
-        csv = local("results/abundance/combined_genus_abundance.csv"),
+        csv=local("results/abundance/combined_genus_abundance.csv"),
     params:
-        sample_name = samples,
+        sample_name=samples,
     log:
-        local("logs/genera_abundance_table.log")
+        local("logs/genera_abundance_table.log"),
     conda:
         "../envs/python.yaml"
     threads: config["max_threads"]
     script:
         "../scripts/combined_genera_abundance_table.py"
 
+
 rule abundance_bubble_plot:
     input:
-        abundance_data = local("results/abundance/combined_genus_abundance.csv"),
+        abundance_data=local("results/abundance/combined_genus_abundance.csv"),
     output:
         report(
             local("results/abundance/combined_genus_abundance_bubbleplot.html"),
-            caption = "../../report/abundance_bubble_plot.rst",
+            caption="../../report/abundance_bubble_plot.rst",
             category="1. Combined Abundance Data",
-            labels={"figure":"Abundance Bubble Plot"}
-        ),    
+            labels={"figure": "Abundance Bubble Plot"},
+        ),
     log:
-        local("logs/genera_abundance_plot.log")
+        local("logs/genera_abundance_plot.log"),
     conda:
         "../envs/python.yaml"
     threads: config["max_threads"]
     script:
         "../scripts/combined_genera_abundance_plot.py"
 
+
 rule reads_per_AMR:
     input:
-        abundance_data = local("results/abundance/combined_genus_abundance.csv"),
+        abundance_data=local("results/abundance/combined_genus_abundance.csv"),
     output:
         report(
             local("results/abundance/reads_per_found_AMR.html"),
-            caption = "../../report/reads_per_AMR.rst",
+            caption="../../report/reads_per_AMR.rst",
             category="1. Combined Abundance Data",
-            labels={"table":"Reads per AMR"}
-        ),    
+            labels={"table": "Reads per AMR"},
+        ),
     log:
-        local("logs/genera_abundance_plot.log")
+        local("logs/genera_abundance_plot.log"),
     conda:
         "../envs/python.yaml"
     threads: config["max_threads"]
     script:
-        "../scripts/combined_reads_per_amr.py"
+        "../scripts/combined_reads_per_amr.py"