diff --git a/CHANGELOG.md b/CHANGELOG.md index c682d313..f7a33065 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Initial release of nf-core/seqinspector, created with the [nf-core](https://nf-c - [#20](https://github.com/nf-core/seqinspector/pull/20) Use tags to generate group reports - [#13](https://github.com/nf-core/seqinspector/pull/13) Generate reports per run, per project and per lane. - [#49](https://github.com/nf-core/seqinspector/pull/49) Merge with template 3.0.2. +- [#50](https://github.com/nf-core/seqinspector/pull/50) Add an optional subsampling step. - [#51](https://github.com/nf-core/seqinspector/pull/51) Add nf-test to CI. ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index ecbfb16e..8a4e350b 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -18,6 +18,8 @@ > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +- [Seqtk](https://github.com/lh3/seqtk) + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index f647af08..6cf36dcc 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,9 @@ workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples. --> -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +1. Subsample reads ([`Seqtk`](https://github.com/lh3/seqtk)) +2. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +3. 
Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) ## Usage diff --git a/conf/modules.config b/conf/modules.config index c8838224..d3c597b3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,6 +18,10 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + withName: SEQTK_SAMPLE { + ext.args = '-s100' + } + withName: FASTQC { ext.args = '--quiet' } diff --git a/docs/output.md b/docs/output.md index e14c3ad6..2d4efb02 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,10 +10,23 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +- [Seqtk](#seqtk) - Subsample a specific number of reads per sample - [FastQC](#fastqc) - Raw read QC - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +### Seqtk + +
+Output files + +- `seqtk/` + - `*.fastq.gz`: FastQ files subsampled to `sample_size` reads. + +
+ +[Seqtk](https://github.com/lh3/seqtk) randomly subsamples each FastQ file down to the number of reads given by the `sample_size` parameter. + ### FastQC
diff --git a/docs/usage.md b/docs/usage.md index d75e0fbd..31ab91ef 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -93,6 +93,12 @@ genome: 'GRCh37' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). +Optionally, the `sample_size` parameter lets you analyse only a random subset of the reads in each FastQ file. The value is an absolute number of reads, not a fraction; the default `0` disables subsampling. + +```bash +nextflow run nf-core/seqinspector --input ./samplesheet.csv --outdir ./results --sample_size 1000000 -profile docker +``` + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: diff --git a/modules.json b/modules.json index a27c7506..7e57ea15 100644 --- a/modules.json +++ b/modules.json @@ -14,6 +14,11 @@ "branch": "master", "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", "installed_by": ["modules"] + }, + "seqtk/sample": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] } } }, diff --git a/modules/nf-core/seqtk/sample/environment.yml b/modules/nf-core/seqtk/sample/environment.yml new file mode 100644 index 00000000..693aa5c1 --- /dev/null +++ b/modules/nf-core/seqtk/sample/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::seqtk=1.4 diff --git a/modules/nf-core/seqtk/sample/main.nf b/modules/nf-core/seqtk/sample/main.nf new file mode 100644 index 00000000..ea9b839e --- /dev/null +++ b/modules/nf-core/seqtk/sample/main.nf @@ -0,0 +1,58 @@ +process SEQTK_SAMPLE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine
== 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/seqtk:1.4--he4a0461_1' : + 'biocontainers/seqtk:1.4--he4a0461_1' }" + + input: + tuple val(meta), path(reads), val(sample_size) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (!(args ==~ /.*-s[0-9]+.*/)) { + args += " -s100" + } + if ( !sample_size ) { + error "SEQTK/SAMPLE must have a sample_size value included" + } + """ + printf "%s\\n" $reads | while read f; + do + seqtk \\ + sample \\ + $args \\ + \$f \\ + $sample_size \\ + | gzip --no-name > ${prefix}_\$(basename \$f) + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + echo "" | gzip > ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + +} diff --git a/modules/nf-core/seqtk/sample/meta.yml b/modules/nf-core/seqtk/sample/meta.yml new file mode 100644 index 00000000..42f67d8f --- /dev/null +++ b/modules/nf-core/seqtk/sample/meta.yml @@ -0,0 +1,52 @@ +name: seqtk_sample +description: Subsample reads from FASTQ files +keywords: + - sample + - fastx + - reads +tools: + - seqtk: + description: Seqtk is a fast and lightweight tool for processing sequences in + the FASTA or FASTQ format. Seqtk sample command subsamples sequences. + homepage: https://github.com/lh3/seqtk + documentation: https://docs.csc.fi/apps/seqtk/ + tool_dev_url: https://github.com/lh3/seqtk + licence: ["MIT"] + identifier: biotools:seqtk +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: List of input FastQ files + pattern: "*.{fastq.gz}" + - sample_size: + type: integer + description: Number of reads to sample. +output: + - reads: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.fastq.gz": + type: file + description: Subsampled FastQ files + pattern: "*.{fastq.gz}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@kaurravneet4123" + - "@sidorov-si" + - "@adamrtalbot" +maintainers: + - "@kaurravneet4123" + - "@sidorov-si" + - "@adamrtalbot" diff --git a/modules/nf-core/seqtk/sample/tests/main.nf.test b/modules/nf-core/seqtk/sample/tests/main.nf.test new file mode 100644 index 00000000..c121c9d9 --- /dev/null +++ b/modules/nf-core/seqtk/sample/tests/main.nf.test @@ -0,0 +1,80 @@ +nextflow_process { + + name "Test Process SEQTK_SAMPLE" + script "modules/nf-core/seqtk/sample/main.nf" + process "SEQTK_SAMPLE" + config "./standard.config" + + tag "modules" + tag "modules_nfcore" + tag "seqtk" + tag "seqtk/sample" + + test("sarscov2_sample_singleend_fqgz") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + 50 + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2_sample_pairedend_fqgz") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + 50 + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2_sample_singlend_fqgz_stub") { + options "-stub" + + when { 
+ process { + """ + input[0] = [ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + 50 + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + +} diff --git a/modules/nf-core/seqtk/sample/tests/main.nf.test.snap b/modules/nf-core/seqtk/sample/tests/main.nf.test.snap new file mode 100644 index 00000000..a9fec3c4 --- /dev/null +++ b/modules/nf-core/seqtk/sample/tests/main.nf.test.snap @@ -0,0 +1,95 @@ +{ + "sarscov2_sample_singlend_fqgz_stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ] + } + ], + "timestamp": "2024-02-22T15:58:45.902956" + }, + "sarscov2_sample_pairedend_fqgz": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled_test_1.fastq.gz:md5,e5f44fafd7351c5abb9925a075132941" + ] + ], + "1": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled_test_1.fastq.gz:md5,e5f44fafd7351c5abb9925a075132941" + ] + ], + "versions": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ] + } + ], + "timestamp": "2024-02-22T15:58:37.679954" + }, + "sarscov2_sample_singleend_fqgz": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled_test_1.fastq.gz:md5,e5f44fafd7351c5abb9925a075132941" + ] + ], + "1": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + 
"test.sampled_test_1.fastq.gz:md5,e5f44fafd7351c5abb9925a075132941" + ] + ], + "versions": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ] + } + ], + "timestamp": "2024-02-22T15:58:29.474491" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqtk/sample/tests/standard.config b/modules/nf-core/seqtk/sample/tests/standard.config new file mode 100644 index 00000000..b2dd4b9f --- /dev/null +++ b/modules/nf-core/seqtk/sample/tests/standard.config @@ -0,0 +1,6 @@ +process { + withName: SEQTK_SAMPLE { + ext.args = '-s100' + ext.prefix = { "${meta.id}.sampled" } + } +} \ No newline at end of file diff --git a/modules/nf-core/seqtk/sample/tests/tags.yml b/modules/nf-core/seqtk/sample/tests/tags.yml new file mode 100644 index 00000000..e5d113b8 --- /dev/null +++ b/modules/nf-core/seqtk/sample/tests/tags.yml @@ -0,0 +1,2 @@ +seqtk/sample: + - "modules/nf-core/seqtk/sample/**" diff --git a/nextflow.config b/nextflow.config index 50c1ecbb..38eb3127 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,7 +12,7 @@ params { // TODO nf-core: Specify your pipeline's command line flags // Input options input = null - + sample_size = 0 // References genome = null fasta = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 88fd607b..49742b28 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -23,8 +23,15 @@ "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/seqinspector/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, + "sample_size": { + "type": "integer", + "default": 0, + "description": "Take this number of reads as a subset.", + "help_text": "Choose the size of the subset or 0, if no subsampling shall be performed. Note that it refers to an absolute number." 
+ }, "outdir": { "type": "string", + "default": null, "format": "directory-path", "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" diff --git a/tests/MiSeq.main.nf.test.snap b/tests/MiSeq.main.nf.test.snap index 0742b5a8..de0afa29 100644 --- a/tests/MiSeq.main.nf.test.snap +++ b/tests/MiSeq.main.nf.test.snap @@ -6,9 +6,9 @@ "multiqc_general_stats.txt:md5,5b28a83b14cb2fe88d084d08900ebdbf" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.10.0" + "nf-test": "0.9.1", + "nextflow": "24.04.4" }, - "timestamp": "2024-10-28T13:18:10.3675973" + "timestamp": "2024-10-30T09:08:29.692511055" } -} +} \ No newline at end of file diff --git a/tests/NovaSeq6000.main.nf.test.snap b/tests/NovaSeq6000.main.nf.test.snap index fc38ac52..62ccd4aa 100644 --- a/tests/NovaSeq6000.main.nf.test.snap +++ b/tests/NovaSeq6000.main.nf.test.snap @@ -18,9 +18,9 @@ "multiqc_general_stats.txt:md5,6a1c16f068d7ba3a9225a17eb570ed9a" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.10.0" + "nf-test": "0.9.1", + "nextflow": "24.04.4" }, - "timestamp": "2024-10-28T13:19:13.226135825" + "timestamp": "2024-10-30T09:09:57.158871165" } -} +} \ No newline at end of file diff --git a/tests/NovaSeq6000.main_subsample.nf.test b/tests/NovaSeq6000.main_subsample.nf.test new file mode 100644 index 00000000..fe2b0685 --- /dev/null +++ b/tests/NovaSeq6000.main_subsample.nf.test @@ -0,0 +1,45 @@ +nextflow_pipeline { + + name "Test Workflow main.nf on NovaSeq6000 data sample size 90" + script "../main.nf" + tag "seqinspector" + tag "PIPELINE" + + test("NovaSeq6000 data test sample size") { + + when { + config "./NovaSeq6000.main_subsample.nf.test.config" + params { + outdir = "$outputDir" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_citations.txt"), + 
path("$outputDir/multiqc/global_report/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_general_stats.txt"), + + path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_general_stats.txt"), + + path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_general_stats.txt"), + + path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_general_stats.txt"), + + path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_general_stats.txt"), + ).match() + }, + ) + } + } +} diff --git a/tests/NovaSeq6000.main_subsample.nf.test.config b/tests/NovaSeq6000.main_subsample.nf.test.config new file mode 100644 index 00000000..acda74dd --- /dev/null +++ b/tests/NovaSeq6000.main_subsample.nf.test.config @@ -0,0 +1,8 @@ +// Load the basic test config +includeConfig 'nextflow.config' + +// Load the correct samplesheet for that test +params { + input = params.pipelines_testdata_base_path + 'seqinspector/testdata/NovaSeq6000/samplesheet.csv' + sample_size = 90 +} diff --git a/tests/NovaSeq6000.main_subsample.nf.test.snap b/tests/NovaSeq6000.main_subsample.nf.test.snap new file mode 100644 index 00000000..651973b1 --- /dev/null +++ b/tests/NovaSeq6000.main_subsample.nf.test.snap @@ -0,0 +1,26 @@ +{ + "NovaSeq6000 data test sample size": { + 
"content": [ + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,aba942d1e6996b579f19798e5673f514", + "multiqc_general_stats.txt:md5,ad1ec9c64cbdb1131a26aeb6de51e31c", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,aa1b8d6adae86005ea7a8b2e901099b8", + "multiqc_general_stats.txt:md5,c73c8d10568a56f6534d280fff701e60", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,ff996e1d3dc4a46e0c9535e54d51ccab", + "multiqc_general_stats.txt:md5,834e1868b887171cfda72029bbbe2d3f", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,3df36ecfe76b25b0c22dcda84bce2b3b", + "multiqc_general_stats.txt:md5,274a001b007521970f14d68bd176e5be", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,ce61b4ce4b1d76ec3f20de3bf0c9ec7f", + "multiqc_general_stats.txt:md5,d476ad59458a035a329605d5284b6012" + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-30T09:37:46.182191597" + } +} \ No newline at end of file diff --git a/tests/PromethION.main.nf.test.snap b/tests/PromethION.main.nf.test.snap index 875673c0..dfa4eb6f 100644 --- a/tests/PromethION.main.nf.test.snap +++ b/tests/PromethION.main.nf.test.snap @@ -6,9 +6,9 @@ "multiqc_general_stats.txt:md5,409cefc7f17f95d176ced6032bf8fb32" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.10.0" + "nf-test": "0.9.1", + "nextflow": "24.04.4" }, - "timestamp": "2024-10-28T13:19:57.261730412" + "timestamp": "2024-10-30T09:12:03.048502046" } -} +} \ No newline at end of file diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf index ab6cda41..7a2dfae5 100644 --- a/workflows/seqinspector.nf +++ b/workflows/seqinspector.nf @@ -3,6 +3,8 @@ IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ + +include { SEQTK_SAMPLE } from 
'../modules/nf-core/seqtk/sample/main' include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC as MULTIQC_GLOBAL } from '../modules/nf-core/multiqc/main' @@ -30,11 +32,28 @@ workflow SEQINSPECTOR { ch_multiqc_extra_files = Channel.empty() ch_multiqc_reports = Channel.empty() + // + // MODULE: Run Seqtk sample to perform subsampling + // + if (params.sample_size > 0 ) { + ch_sample_sized = SEQTK_SAMPLE( + ch_samplesheet.map { + meta, reads -> [meta, reads, params.sample_size] + } + ).reads + ch_versions = ch_versions.mix(SEQTK_SAMPLE.out.versions.first()) + } else { + // Subsampling disabled: pass the reads through unchanged + ch_sample_sized = ch_samplesheet + } + // // MODULE: Run FastQC // FASTQC ( - ch_samplesheet + ch_sample_sized.map { + meta, subsampled -> [meta, subsampled] + } ) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip) ch_versions = ch_versions.mix(FASTQC.out.versions.first())