From 6ba7b4843da6884e81009e0eaaff36bef5c9b713 Mon Sep 17 00:00:00 2001 From: Thomas Cokelaer Date: Wed, 24 Jan 2024 23:32:52 +0100 Subject: [PATCH] add minimap2 --- .github/workflows/apptainer.yml | 1 + .github/workflows/main.yml | 1 + .pre-commit-config.yaml | 33 ++++++ README.rst | 27 +++-- environment.yml | 1 + pyproject.toml | 2 +- sequana_pipelines/variant_calling/config.yaml | 33 ++++-- sequana_pipelines/variant_calling/main.py | 95 ++++++++++++--- sequana_pipelines/variant_calling/schema.yaml | 35 +++++- .../variant_calling/variant_calling.rules | 110 +++++++++++------- 10 files changed, 253 insertions(+), 85 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.github/workflows/apptainer.yml b/.github/workflows/apptainer.yml index 841eff1..a8cf6be 100644 --- a/.github/workflows/apptainer.yml +++ b/.github/workflows/apptainer.yml @@ -49,6 +49,7 @@ jobs: - name: install package itself run: | pip install . + pip install "pulp==2.7.0" --no-deps - name: testing run: | diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 908a638..aa207f0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -41,6 +41,7 @@ jobs: shell: bash -l {0} run: | pip install . + pip install "pulp==2.7.0" --no-deps - name: Install dependencies diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..6707eac --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,33 @@ + +files: '\.(py|rst|sh)$' +fail_fast: false + +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.2.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + #- id: check-executables-have-shebangs + - id: check-ast + +- repo: https://github.com/pycqa/flake8 + rev: 6.1.0 + hooks: + - id: flake8 + args: ["-j8", "--ignore=E203,E501,W503,E722", "--max-line-length=120", "--exit-zero"] + +- repo: https://github.com/psf/black + rev: 22.10.0 + hooks: + - id: black + args: ["--line-length=120"] + exclude: E501 + +- repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + args: ["--profile", "black"] # solves conflicts between black and isort + diff --git a/README.rst b/README.rst index 5775b5d..ffd8127 100644 --- a/README.rst +++ b/README.rst @@ -80,7 +80,7 @@ if you decide to use snakemake manually, do not forget to add singularity option snakemake -s variant_calling.rules -c config.yaml --cores 4 --stats stats.txt --use-singularity --singularity-prefix ~/.sequana/apptainers --singularity-args "-B /home:/home" - + Requirements ~~~~~~~~~~~~ @@ -91,6 +91,7 @@ This pipelines requires the following executable(s): - freebayes - picard (picard-tools) - sambamba +- minimap2 - samtools - snpEff you will need 5.0 or 5.1d (note the d); 5.1 does not work. @@ -105,7 +106,7 @@ written by Erik Garrison. Input reads (paired or single) are mapped using `bwa `_ and sorted with `sambamba-sort `_. PCR duplicates are marked with -`sambamba-markdup `_. +`sambamba-markdup `_. `Freebayes `_ is used to detect SNPs and short INDELs. The INDEL realignment and base quality recalibration are not necessary with Freebayes. For more information, please refer to a post by Brad Chapman on @@ -130,15 +131,22 @@ Changelog ========= ====================================================================== Version Description ========= ====================================================================== -1.1.2 * add -Xmx8g option in snpeff rule at the build stage. +1.2.0 * -Xmx8g option previously added is not robust. Does not work with + snpEff 5.1 for instance. + * add minimap aligner + * add --nanopore and --pacbio to automatically set minimap2 as the + aligner and the minimap options (map-pb or map-ont) + * add minimap2 container. + * add missing resources in snpeff section +1.1.2 * add -Xmx8g option in snpeff rule at the build stage. * add resources (8G) in the snpeff rule at run stage * fix missing output_directory in sequana_coverage rule * fix joint calling (regression) input function and inputs 1.1.1 * Fix regression in coverage rule 1.1.0 * add specific apptainer for freebayes (v1.2.0) - * Update API to use click + * Update API to use click 1.0.2 * Fixed failure in multiqc if coverage and snpeff are off -1.0.1 * automatically fill the bwa index algorithm and fix bwa_index rule to +1.0.1 * automatically fill the bwa index algorithm and fix bwa_index rule to use the options in the config file (not the harcoded one) 1.0.0 * use last warppers and graphviz apptainer 0.12.0 * set all apptainers containers and add vcf to bcf conversions @@ -149,7 +157,7 @@ Version Description 0.9.5 * fix typo in the onsuccess and update sequana requirements to use most up-to-date snakemake rules 0.9.4 * fix typo related to the reference-file option new name not changed - everyhere in the pipeline. + everyhere in the pipeline. 0.9.3 * use new framework (faster --help, --from-project option) * rename --reference into --reference-file and --annotation to --annotation-file @@ -159,7 +167,7 @@ Version Description samplesnpeff) * add multiqc to show sequana_coverage and snpeff summary sections * cleanup onsuccess section - * more options sanity checks and options (e.g., + * more options sanity checks and options (e.g., * genbank_file renamed into annotation_file in the config * use --legacy in freebayes options * fix coverage section to use new sequana api @@ -172,7 +180,6 @@ Version Description Contribute & Code of Conduct ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To contribute to this project, please take a look at the -`Contributing Guidelines `_ first. Please note that this project is released with a +To contribute to this project, please take a look at the +`Contributing Guidelines `_ first. Please note that this project is released with a `Code of Conduct `_. By contributing to this project, you agree to abide by its terms. - diff --git a/environment.yml b/environment.yml index 641180d..7d6560e 100644 --- a/environment.yml +++ b/environment.yml @@ -13,6 +13,7 @@ dependencies: - picard>2.26 - samtools>=1.15 - bamtools + - minimap2 - pip - pip: - sequana diff --git a/pyproject.toml b/pyproject.toml index 2749a92..d11b8e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "sequana-variant-calling" -version = "1.1.2" +version = "1.2.0" description = "A multi-sample variant calling pipeline" authors = ["Sequana Team"] license = "BSD-3" diff --git a/sequana_pipelines/variant_calling/config.yaml b/sequana_pipelines/variant_calling/config.yaml index b6b275b..955621f 100644 --- a/sequana_pipelines/variant_calling/config.yaml +++ b/sequana_pipelines/variant_calling/config.yaml @@ -8,20 +8,33 @@ # use it, otherwise use input_samples. # ============================================================================ # -sequana_wrappers: "v23.12.5" +sequana_wrappers: "v24.1.14" # Mandatory fields input_directory: input_readtag: _R[12]_ input_pattern: '*fastq.gz' -annotation_file: -reference_file: + + +########################################################################### +# General section: +# +# :Parameters: +# +# +# - aligner_choice: either bwa or minimap2 +# +general: + aligner_choice: bwa + annotation_file: + reference_file: apptainers: sequana_coverage: "https://zenodo.org/record/10209929/files/sequana_0.16.1.img" sequana_tools: "https://zenodo.org/record/7963917/files/sequana_tools_0.15.1.img" graphviz: "https://zenodo.org/record/7928262/files/graphviz_7.0.5.img" + minimap2: "https://zenodo.org/record/7341710/files/sequana_tools_0.14.5.img" multiqc: "https://zenodo.org/record/10205070/files/multiqc_1.16.0.img" freebayes: "https://zenodo.org/record/10259279/files/freebayes_1.2.0.img" @@ -32,6 +45,12 @@ add_read_group: options: '' +minimap2: + options: "-x map-ont" + threads: 4 + resources: + mem: 8G + ############################################################################## # samtools_depth @@ -78,12 +97,12 @@ bwa_mem: # - do: if unchecked, this rule is ignored. # - reference: genbank file. # - options: any options recognised by snpEff. -# - build_options: use at the build stage of snpEff. +# - build_options: use at the build stage of snpEff. # depending on the GFF, you may need to add options to skip some sanity checks # such as noCheckProtein or -noCheckCds. Note also that newest version of snpeff # may need more memory. To do so, you must set the -XmxNg option where N is to be # replaced by the required memory in Gb. Version of snpEff 5.1 does not seem to accept -# that option. +# that option though. # Requires the annotation file # # Results filter options: @@ -95,8 +114,8 @@ bwa_mem: # snpeff: do: true - build_options: " -Xmx8g " # -noCheckCds -noCheckProtein - options: -no-downstream -no-upstream + build_options: # -noCheckCds -noCheckProtein + options: -no-downstream -no-upstream resources: mem: 8G diff --git a/sequana_pipelines/variant_calling/main.py b/sequana_pipelines/variant_calling/main.py index 00a177c..5fc1f81 100755 --- a/sequana_pipelines/variant_calling/main.py +++ b/sequana_pipelines/variant_calling/main.py @@ -10,17 +10,16 @@ # documentation: http://sequana.readthedocs.io # ############################################################################## -import sys import os +import sys -import rich_click as click import click_completion +import rich_click as click click_completion.init() -from sequana_pipetools.options import * from sequana_pipetools import SequanaManager - +from sequana_pipetools.options import * NAME = "variant_calling" @@ -30,11 +29,14 @@ groups={ "Pipeline Specific": [ "--reference-file", + "--aligner-choice", "--annotation-file", "--circular", "--do-coverage", "--do-joint-calling", "--freebayes-ploidy", + "--nanopore", + "--pacbio", ], }, ) @@ -45,11 +47,58 @@ @include_options_from(ClickSlurmOptions) @include_options_from(ClickInputOptions) @include_options_from(ClickGeneralOptions) -@click.option("--annotation-file", "annotation", default=None, help="The annotation for snpeff. This is optional but highly recommended to obtain meaningful HTML report.") -@click.option("--do-coverage", "do_coverage", is_flag=True, default=False, show_default=True, help="perform the coverage analysis using sequana_coverage.") +@click.option( + "--aligner-choice", + "aligner", + type=click.Choice(["bwa", "minimap2"]), + default="bwa", + show_default=True, + help="The aligner in bwa / minimap2 .", +) +@click.option( + "--do-coverage", + "do_coverage", + is_flag=True, + default=False, + show_default=True, + help="perform the coverage analysis using sequana_coverage.", +) +@click.option( + "--annotation-file", + "annotation", + default=None, + help="The annotation for snpeff. This is optional but highly recommended to obtain meaningful HTML report.", +) +@click.option( + "--do-coverage", + "do_coverage", + is_flag=True, + default=False, + show_default=True, + help="perform the coverage analysis using sequana_coverage.", +) +@click.option( + "--nanopore", + is_flag=True, + default=False, + show_default=True, + help="if set, fix minimap2 as the aligner, and uses -x map-ont for the minimap2 options", +) +@click.option( + "--pacbio", + is_flag=True, + default=False, + show_default=True, + help="if set, fix minimap2 as the aligner, and uses -x map-pb for the minimap2 options", +) @click.option("--do-joint-calling", "do_joint_calling", is_flag=True, help="do the joint calling analysis") -@click.option("--freebayes-ploidy", type=int, default=1, show_default=True, - help="""For population, or eukaryotes, change the ploidy to the correct values. For population, you may set it to 10.""") +@click.option( + "--freebayes-ploidy", + type=int, + default=1, + show_default=True, + help="""For population, or eukaryotes, change the ploidy to the correct values. For population, you may set it to 10.""", +) @click.option("-o", "--circular", is_flag=True, help="Recommended for bacteria genomes and circularised genomes") @click.option("--reference-file", "reference", required=True, help="The input reference to mapped reads onto") def main(**options): @@ -66,11 +115,17 @@ def main(**options): def fill_annotation_file(): if options.annotation: cfg.snpeff.do = True - cfg.annotation_file = os.path.abspath(options.annotation) + cfg.general.annotation_file = os.path.abspath(options.annotation) # manager.exists(cfg.annotation_file) # print("." in cfg.annotation_file, cfg.annotation_file.split(".")[-1]) - if "." not in cfg.annotation_file or cfg.annotation_file.split(".")[-1] not in ["gbk", "gff", "gff3"]: - click.echo("The annotation file must in in .gbk or .gff or .gff3. You provided {cfg.annotation_file}") + if "." not in cfg.general.annotation_file or cfg.general.annotation_file.split(".")[-1] not in [ + "gbk", + "gff", + "gff3", + ]: + click.echo( + "The annotation file must in in .gbk or .gff or .gff3. You provided {cfg.general.annotation_file}" + ) sys.exit(1) else: cfg.snpeff.do = False @@ -90,12 +145,18 @@ def fill_ploidy_freebayes(): def fill_reference_file(): # required argument - cfg.reference_file = os.path.abspath(options.reference) + cfg.general.reference_file = os.path.abspath(options.reference) + + # first if option --long-read is used (overwritten by other options) + if options["nanopore"]: + cfg.general.aligner_choice = "minimap2" + cfg.minimap2.options = "-x map-ont" + elif options["pacbio"]: + cfg.general.aligner_choice = "minimap2" + cfg.minimap2.options = "-x map-pb" if options["from_project"]: - if "--reference-file" in options: - fill_reference_file() - + raise NotImplementedError else: fill_annotation_file() fill_do_coverage() @@ -104,10 +165,10 @@ def fill_reference_file(): fill_ploidy_freebayes() fill_reference_file() - # Given the reference, let us compute its length and st the index algorithm + # Given the reference, let us compute its length and the index algorithm from sequana import FastA - f = FastA(cfg.reference_file) + f = FastA(cfg.general.reference_file) N = f.get_stats()["total_length"] # seems to be a hardcoded values in bwa according to the documentation diff --git a/sequana_pipelines/variant_calling/schema.yaml b/sequana_pipelines/variant_calling/schema.yaml index dc29e5a..20cffd6 100644 --- a/sequana_pipelines/variant_calling/schema.yaml +++ b/sequana_pipelines/variant_calling/schema.yaml @@ -14,12 +14,20 @@ mapping: "input_pattern": type: str required: True - "annotation_file": - type: str - required: False - "reference_file": - type: str - required: True + + + "general": + type: map + mapping: + "annotation_file": + type: str + required: False + "reference_file": + type: str + required: True + "aligner_choice": + type: str + enum: [bwa, minimap2] "apptainers": type: any @@ -66,6 +74,21 @@ mapping: "input_directory": type: str + "minimap2": + type: map + mapping: + "options": + type: str + "threads": + type: int + range: { min: 1 } + "resources": + type: map + mapping: + "mem": + type: str + + "snpeff": type: map mapping: diff --git a/sequana_pipelines/variant_calling/variant_calling.rules b/sequana_pipelines/variant_calling/variant_calling.rules index 23f77dc..4b0fd71 100644 --- a/sequana_pipelines/variant_calling/variant_calling.rules +++ b/sequana_pipelines/variant_calling/variant_calling.rules @@ -60,8 +60,8 @@ rule all: others -reference_file = config["reference_file"] -annotation_file = config["annotation_file"] +reference_file = config["general"]["reference_file"] +annotation_file = config["general"]["annotation_file"] new_reference = f"reference/{os.path.basename(reference_file)}" @@ -71,8 +71,8 @@ if config["snpeff"]["do"]: rule snpeff_add_locus_in_fasta: input: - config["reference_file"], - config["annotation_file"] + reference_file, + annotation_file output: new_reference params: @@ -104,45 +104,67 @@ else: # ========================================================= BWA # The pipeline can be started with sorted BAM files # +aligner = config["general"]["aligner_choice"] if not config["input_pattern"].endswith("bam"): - # ========================================================= bwa indexing - rule bwa_index: - input: - reference=new_reference - output: - bwa_bwt=new_reference + ".bwt", - fai=new_reference + ".fai" - log: - "reference/build.log" - params: - options=config['bwa_mem'].get('index_options', ""), - index_algorithm=config['bwa_mem'].get('index_algorithm', "is") - container: - config['apptainers']['sequana_tools'] - threads: 2 - wrapper: - f"{manager.wrappers}/wrappers/bwa/build" + if aligner == "bwa": + # ========================================================= bwa indexing + rule bwa_index: + input: + reference=new_reference + output: + bwa_bwt=new_reference + ".bwt", + fai=new_reference + ".fai" + log: + "reference/build.log" + params: + options=config['bwa_mem'].get('index_options', ""), + index_algorithm=config['bwa_mem'].get('index_algorithm', "is") + container: + config['apptainers']['sequana_tools'] + threads: 2 + wrapper: + f"{manager.wrappers}/wrappers/bwa/build" - # ========================================================= bwa mapping - rule bwa: - input: - fastq=manager.getrawdata(), - bwa_bwt=new_reference + ".bwt", - fai=new_reference + ".fai", - reference=new_reference - output: - sorted="{sample}/bwa/{sample}.sorted.bam" - log: - "{sample}/bwa/{sample}.log" - params: - options=config["bwa_mem"]["options"], - tmp_directory=config["bwa_mem"]["tmp_directory"] - container: - config['apptainers']['sequana_tools'] - threads: 2 - wrapper: - f"{manager.wrappers}/wrappers/bwa/align" + # ========================================================= bwa mapping + rule bwa: + input: + fastq=manager.getrawdata(), + bwa_bwt=new_reference + ".bwt", + fai=new_reference + ".fai", + reference=new_reference + output: + sorted="{sample}/bwa/{sample}.sorted.bam" + log: + "{sample}/bwa/{sample}.log" + params: + options=config["bwa_mem"]["options"], + tmp_directory=config["bwa_mem"]["tmp_directory"] + container: + config['apptainers']['sequana_tools'] + threads: 2 + wrapper: + f"{manager.wrappers}/wrappers/bwa/align" + + elif aligner == "minimap2": + rule minimap2: + input: + fastq=manager.getrawdata(), + reference=new_reference + output: + sorted="{sample}/minimap2/{sample}.sorted.bam" + threads: + config["minimap2"]["threads"] + params: + options=config['minimap2']['options'] + container: + config['apptainers']['minimap2'] + resources: + **config["minimap2"]["resources"] + wrapper: + f"{manager.wrappers}/wrappers/minimap2" + else: + raise ValueError(f"aligner_choice must be set to bwa or minimap2. You set {aligner}") # ========================================================= add read group @@ -150,7 +172,7 @@ if not config["input_pattern"].endswith("bam"): rule add_read_group: input: manager.getrawdata() if config["input_pattern"].endswith(".bam") - else "{sample}/bwa/{sample}.sorted.bam" + else "{sample}/" + aligner + "/{sample}.sorted.bam" output: bam="{sample}/add_read_group/{sample}.sorted.bam" log: @@ -274,8 +296,8 @@ if config["sequana_coverage"]["do"]: "bed": "{sample}/double_bed/{sample}.bed", "fasta": reference_file } - if config["annotation_file"]: - input_file['gbk'] = config["annotation_file"] + if config["general"]["annotation_file"]: + input_file['gbk'] = config["general"]["annotation_file"] return input_file @@ -285,7 +307,7 @@ if config["sequana_coverage"]["do"]: output: "{sample}/sequana_coverage/sequana_coverage.html" params: - annotation=config["annotation_file"], + annotation=annotation_file, circular=config["sequana_coverage"]["circular"], chunksize=config["sequana_coverage"]["chunksize"], double_threshold=config["sequana_coverage"]["double_threshold"],