Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![Snakemake](https://img.shields.io/badge/snakemake-≥9.0-brightgreen.svg)](https://snakemake.bitbucket.io)
[![Snakemake CI](https://github.com/IKIM-Essen/ERMA/actions/workflows/snakemake-ci.yml/badge.svg)](https://github.com/IKIM-Essen/ERMA/actions/workflows/snakemake-ci.yml)

This pipeline is designed to process sequencing reads obtained from epicPCR experiments, linking antimicrobial resistance (AMR) genes with 16S rRNA genes. The pipeline uses **Snakemake** to manage the workflow and integrates tools for downloading necessary databases, running sequence alignments, filtering, and generating visual reports.
This **Snakemake**-based pipeline processes sequencing reads from epicPCR experiments to link antimicrobial resistance (AMR) genes with genus-specific 16S rRNA gene sequences from prokaryotic cells. It orchestrates all workflow steps, including database downloads, sequence alignment, read filtering, and the generation of visual reports, ensuring reproducibility and streamlined analysis.

## Features
- Downloads and prepares SILVA and CARD databases for use in the analysis.
Expand All @@ -17,10 +17,11 @@ This pipeline is designed to process sequencing reads obtained from epicPCR expe
## Prerequisites

### Install Snakemake
The pipeline requires **Snakemake** with support for conda environments. You can install it using conda (via Miniconda or Anaconda):
The pipeline requires **Snakemake** (≥ Version 9.0) with support for conda environments. You can install it using conda (via Miniconda or Anaconda):

```bash
mamba install -c conda-forge -c bioconda snakemake --yes; mamba install -c bioconda snakemake-storage-plugin-fs --yes
mamba create -n snakemake9 -c conda-forge -c bioconda snakemake=9 --yes; mamba install -n snakemake9 -c bioconda snakemake-storage-plugin-fs --yes
mamba activate snakemake9
```

Alternatively, follow the official [Snakemake installation](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html) guide for more documentation and options.
Expand All @@ -29,10 +30,10 @@ Install Dependencies: This pipeline uses conda environments to manage its depend

## Usage Instructions

Clone the repository: First, clone the pipeline repository to your local machine:
First, clone the pipeline repository to your local machine:

```bash
git clone https://github.com/your-username/ERMA.git
git clone https://github.com/IKIM-essen/ERMA.git
cd ERMA
```
Prepare Data Folder: You need to place your raw sequencing files (fastq.gz format) in the data/fastq/ directory or change this path to the desired directory.
Expand Down Expand Up @@ -111,6 +112,14 @@ To run the pipeline in the default mode, a profile with snakemake specific param
snakemake --profile profile
```

### Testing

For testing the workflow you can copy the provided dummy data:

```
cp .github/data/fastq/test_epic_data.fastq.gz data/fastq/
```

## Additional Notes

The pipeline is designed to handle large sequencing datasets in parallel, so it's recommended to run it on a machine with sufficient computational resources. However, to run the pipeline on machines with less resources, it is recommended to split the fastq files or the tables in smaller chunks to prevent RAM overflow. This can be done by increase num_parts in the config file. However, the higher the number of parts per FASTQ file, the higher the chance some blast results for the same read will be split and lost.
Expand Down
49 changes: 29 additions & 20 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import glob, os


configfile: "config/config.yaml"


include: "rules/common.smk"
include: "rules/qc.smk"
include: "rules/load_db.smk"
Expand All @@ -9,50 +12,56 @@ include: "rules/boxplots.smk"
include: "rules/abundance.smk"
include: "rules/preprocess.smk"


# Build a list of all "fastq.gz" files in "ERMA/data/fastq" which will be processed subsequently
samples = [os.path.basename(f).replace(".fastq.gz", "") for f in glob.glob(os.path.join(config["base_dir"], config["fastq_dir"], "*.fastq.gz"))]
samples = [
os.path.basename(f).replace(".fastq.gz", "")
for f in glob.glob(
os.path.join(config["base_dir"], config["fastq_dir"], "*.fastq.gz")
)
]
runname = "".join(config["runname"])
seq_tech = "".join(config["seq_tech"])
outdir = "".join(config["outdir"])

print("Important Run Parameter:\n")
print("Run Parameters:\n")
print(f"Base directory: \t{os.path.abspath(config['base_dir'])}")
print(f"Sample location:\t{os.path.abspath(config['fastq_dir'])}")
print(f"Report out location:\t{os.path.abspath(config['outdir'])}")
print(f"Input location:\t{os.path.abspath(config['fastq_dir'])}")
print(f"Output location:\t{os.path.abspath(config['outdir'])}")
print(f"Detected samples:\t{samples}")
print(f"Detected Runname:\t{runname}")
print(f"Detected Seqtech:\t{seq_tech} (This Parameter is for demultiplexing/merging preprocess only!)\n")
print(f"Detected runname:\t{runname}")
print(
f"Configured sequencing technology in preprocessing (for merging/demultiplexing):\t{seq_tech}\n"
)


rule all:
input:
local(f"{outdir}/{runname}_report.zip")
local(f"{outdir}/{runname}_report.zip"),


report: "../report/workflow.rst"


rule snakemake_report:
input:
input:
local("results/abundance/combined_genus_abundance_bubbleplot.html"),
local("results/abundance/reads_per_found_AMR.html"),
local("results/boxplots/combined_allength_boxplot.png"),
local("results/boxplots/combined_evalue_boxplot.png"),
local("results/boxplots/combined_percidt_boxplot.png"),
local("results/boxplots/combined_allength_boxplot.png"),
local("results/boxplots/combined_evalue_boxplot.png"),
local("results/boxplots/combined_percidt_boxplot.png"),
local("results/qc/multiqc.html"),
local("results/qc/overview_plot.png"),
local(expand("results/{sample}/overview_table.html",sample=samples)),
local(expand("results/{sample}/genus_abundance.html", sample=samples)),
local(expand("results/{sample}/genus_idt_per_genus_plot.png", sample=samples)),
local(expand("results/{sample}/overview_table.html", sample=samples)),
local(expand("results/{sample}/genus_abundance.html", sample=samples)),
local(expand("results/{sample}/genus_idt_per_genus_plot.png", sample=samples)),
log:
local("logs/report/report.log"),
conda:
"envs/snakemake.yaml"
output:
local(f"{outdir}/{runname}_report.zip")
local(f"{outdir}/{runname}_report.zip"),
shell:
"""
snakemake --nolock --report {output} --report-stylesheet config/custom-stylesheet.css 2>> {log}
"""

# Prevents wildcard confusion by preventing dots or slashes in sample and keeps part numerical
wildcard_constraints:
sample = "[a-zA-Z0-9_]+",
part = "\\d+"
1 change: 1 addition & 0 deletions workflow/envs/diamond.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ channels:
- bioconda
- conda-forge
- anaconda
- nodefaults
dependencies:
- diamond=2.1.10
- libzlib=1.2.13
Expand Down
7 changes: 4 additions & 3 deletions workflow/envs/python.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@ channels:
- conda-forge
- bioconda
- anaconda
- nodefaults
dependencies:
- pandas
- python
- pandas=2.2.3
- python=3.12
- samtools=1.3
- matplotlib
- matplotlib=3.10
- seaborn=0.13.2
- altair=5.5.0
- seqtk=1.3
Expand Down
1 change: 1 addition & 0 deletions workflow/envs/snakemake.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- snakemake=7.32
3 changes: 2 additions & 1 deletion workflow/envs/usearch.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
channels:
- bioconda
- bioconda
- nodefaults
dependencies:
- usearch=12.0
67 changes: 42 additions & 25 deletions workflow/rules/abundance.smk
Original file line number Diff line number Diff line change
@@ -1,75 +1,92 @@
# Copyright 2024 Adrian Dörr.
# Licensed under the MIT License (https://opensource.org/license/mit)
# This file may not be copied, modified, or distributed
# except according to those terms.


rule single_genera_abundance_table:
input:
filtered_data = local(expand("results/{{sample}}/{part}/filtered_results.csv.gz",part=get_numpart_list())),
filtered_data=local(
expand(
"results/{{sample}}/{part}/filtered_results.csv.gz",
part=get_numpart_list(),
)
),
output:
report(
local("results/{sample}/genus_abundance.html"),
caption = "../../report/genus_abundance_table.rst",
caption="../../report/genus_abundance_table.rst",
category="2. Single Sample Abundance Data",
subcategory="{sample}",
labels={
"sample": "{sample}",
"table": "Genera Abundance"
}
)
labels={"sample": "{sample}", "table": "Genera Abundance"},
),
params:
sample_name = "{sample}",
parts = get_numpart_list()
sample_name="{sample}",
parts=get_numpart_list(),
log:
local("logs/{sample}/genera_abundance_table.log")
local("logs/{sample}/genera_abundance_table.log"),
conda:
"../envs/python.yaml"
threads: config["max_threads"]
script:
"../scripts/single_genera_abundance_table.py"


rule combined_genera_abundance_table:
input:
filtered_data = local(expand("results/{sample}/{part}/filtered_results.csv.gz",sample=samples,part=get_numpart_list())),
filtered_data=local(
expand(
"results/{sample}/{part}/filtered_results.csv.gz",
sample=samples,
part=get_numpart_list(),
)
),
output:
csv = local("results/abundance/combined_genus_abundance.csv"),
csv=local("results/abundance/combined_genus_abundance.csv"),
params:
sample_name = samples,
sample_name=samples,
log:
local("logs/genera_abundance_table.log")
local("logs/genera_abundance_table.log"),
conda:
"../envs/python.yaml"
threads: config["max_threads"]
script:
"../scripts/combined_genera_abundance_table.py"


rule abundance_bubble_plot:
input:
abundance_data = local("results/abundance/combined_genus_abundance.csv"),
abundance_data=local("results/abundance/combined_genus_abundance.csv"),
output:
report(
local("results/abundance/combined_genus_abundance_bubbleplot.html"),
caption = "../../report/abundance_bubble_plot.rst",
caption="../../report/abundance_bubble_plot.rst",
category="1. Combined Abundance Data",
labels={"figure":"Abundance Bubble Plot"}
),
labels={"figure": "Abundance Bubble Plot"},
),
log:
local("logs/genera_abundance_plot.log")
local("logs/genera_abundance_plot.log"),
conda:
"../envs/python.yaml"
threads: config["max_threads"]
script:
"../scripts/combined_genera_abundance_plot.py"


rule reads_per_AMR:
input:
abundance_data = local("results/abundance/combined_genus_abundance.csv"),
abundance_data=local("results/abundance/combined_genus_abundance.csv"),
output:
report(
local("results/abundance/reads_per_found_AMR.html"),
caption = "../../report/reads_per_AMR.rst",
caption="../../report/reads_per_AMR.rst",
category="1. Combined Abundance Data",
labels={"table":"Reads per AMR"}
),
labels={"table": "Reads per AMR"},
),
log:
local("logs/genera_abundance_plot.log")
local("logs/genera_abundance_plot.log"),
conda:
"../envs/python.yaml"
threads: config["max_threads"]
script:
"../scripts/combined_reads_per_amr.py"
"../scripts/combined_reads_per_amr.py"
Loading