Skip to content

Commit d4dd5c3

Browse files
committed
Kate's stuff copy pasted + pre-commit
1 parent a81ae16 commit d4dd5c3

10 files changed

Lines changed: 482 additions & 15 deletions

File tree

README.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,27 @@
2121

2222
## Introduction
2323

24-
**nf-core/seqsubmit** is a bioinformatics pipeline that ...
24+
**nf-core/seqsubmit** is a bioinformatics pipeline that submits data to public archives such as [ENA](https://www.ebi.ac.uk/ena/browser/home)
25+
26+
The pipeline will have several modes:
27+
28+
- `mags` for MAGs submission with **genome_submitter** wf
29+
- `bins` for bins submission with **genome_submitter** wf
30+
- `assemblies` for assembly submission with **assembly_submitter** wf
31+
32+
## Requirements
33+
34+
- A Webin account, registered at https://www.ebi.ac.uk/ena/submit/webin/login
35+
- Raw reads submitted into [INSDC](https://www.insdc.org/)
36+
37+
## genome_submitter
38+
39+
Workflow to submit MAGs and/or bins to ENA.
40+
41+
It takes an input `samplesheet.csv` containing the fields required by [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader). The fields are described in the [docs](https://github.com/EBI-Metagenomics/genome_uploader/blob/main/README.md#input-tsv-and-fields).
42+
For now, the workflow converts the CSV into the required TSV.
43+
44+
_A future implementation will detect missing fields (for example, completeness and contamination) and run additional steps to fill in the gaps._
2545

2646
<!-- TODO nf-core:
2747
Complete this sentence with a 2-3 sentence summary of what types of data the pipeline ingests, a brief overview of the

assets/schema_input.json

Lines changed: 61 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,73 @@
1313
"errorMessage": "Sample name must be provided and cannot contain spaces",
1414
"meta": ["id"]
1515
},
16-
"fastq_1": {
16+
"fasta": {
1717
"type": "string",
1818
"format": "file-path",
1919
"exists": true,
2020
"pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?q\\.gz$",
21-
"errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
21+
"errorMessage": "FASTA file for sequences 1 must be provided, cannot contain spaces and must have extension '.fa.gz' or '.fasta.gz'",
22+
"description": "MAG/bin sequence file"
2223
},
23-
"fastq_2": {
24+
"accession": {
2425
"type": "string",
25-
"format": "file-path",
26-
"exists": true,
27-
"pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?q\\.gz$",
28-
"errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
26+
"description": "Run or assembly ENA accession"
27+
},
28+
"assembly_software": {
29+
"type": "string",
30+
"description": "Tool name and version that was used to assemble data"
31+
},
32+
"binning_software": {
33+
"type": "string",
34+
"description": "Tool name and version that was used to bin data"
35+
},
36+
"binning_parameters": {
37+
"type": "string",
38+
"description": "Arguments used to bin data different from default"
39+
},
40+
"stats_generation_software": {
41+
"type": "string",
42+
"description": "Tool(s) used to estimate completeness and contamination"
43+
},
44+
"completeness": {
45+
"type": "number",
46+
"description": "MAG/bin completeness value"
47+
},
48+
"contamination": {
49+
"type": "number",
50+
"description": "MAG/bin contamination value"
51+
},
52+
"genome_coverage": {
53+
"type": "number",
54+
"description": "MAG/bin coverage value"
55+
},
56+
"metagenome": {
57+
"type": "string",
58+
"description": "ENA accepted metagenome name"
59+
},
60+
"co-assembly": {
61+
"type": "boolean",
62+
"description": "True if data was co-assembled, False otherwise"
63+
},
64+
"broad_environment": {
65+
"type": "string",
66+
"description": "broad ecological context of a sample"
67+
},
68+
"local_environment": {
69+
"type": "string",
70+
"description": "local ecological context"
71+
},
72+
"environmental_medium": {
73+
"type": "string",
74+
"description": "material displaced by the sample"
75+
},
76+
"rRNA_presence": {
77+
"type": "boolean",
78+
"description": "True/False if rRNA genes detected"
79+
},
80+
"NCBI_lineage": {
81+
"type": "string",
82+
"description": "full NCBI lineage - format: x;y;z"
2983
}
3084
},
3185
"required": ["sample", "fastq_1"]

conf/test.config

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,12 @@ params {
2525
// Input data
2626
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
2727
// TODO nf-core: Give any required params for the test so that command line flags are not needed
28-
input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv'
28+
input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheet/sampleshee.csv'
29+
30+
mode = "mags"
31+
ena_genome_study_accession = "PRJEB98843"
32+
centre_name = "TEST_CENTER"
33+
webin_account = "Webin-47019"
34+
webin_password = 'ws2017'
35+
2936
}

main.nf

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1616
*/
1717

18-
include { SEQSUBMIT } from './workflows/seqsubmit'
18+
include { SEQSUBMIT } from './workflows/seqsubmit'
19+
include { GENOMESUBMIT } from './workflows/genomesubmit'
1920
include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_seqsubmit_pipeline'
2021
include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_seqsubmit_pipeline'
2122
/*
@@ -33,15 +34,27 @@ workflow NFCORE_SEQSUBMIT {
3334
samplesheet // channel: samplesheet read in from --input
3435

3536
main:
36-
37+
ch_multiqc_report = Channel.empty()
3738
//
3839
// WORKFLOW: Run pipeline
3940
//
40-
SEQSUBMIT (
41-
samplesheet
42-
)
41+
if ((params.mode == "mags") || (params.mode == "bins")) {
42+
GENOMESUBMIT (
43+
samplesheet,
44+
params.mode
45+
)
46+
ch_multiqc_report = GENOMESUBMIT.out.multiqc_report
47+
} else {
48+
SEQSUBMIT (
49+
samplesheet
50+
)
51+
ch_multiqc_report = SEQSUBMIT.out.multiqc_report
52+
}
53+
54+
55+
4356
emit:
44-
multiqc_report = SEQSUBMIT.out.multiqc_report // channel: /path/to/multiqc_report.html
57+
multiqc_report = ch_multiqc_report // channel: /path/to/multiqc_report.html
4558
}
4659
/*
4760
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
process ENA_WEBIN_CLI {
    // Validates or submits a single genome FASTA to ENA with ena-webin-cli.
    //
    // Inputs : id (used for tagging and output naming), the FASTA to submit and
    //          the manifest describing it.
    // Outputs: the renamed webin-cli report, a STATUS value ("success"/"failed")
    //          exported by the script, and versions.yml.
    // Params : params.test_upload      -> adds "-test" to the webin-cli call
    //          params.webincli_submit  -> "-submit" when true, "-validate" otherwise
    // Secrets: WEBIN_ACCOUNT / WEBIN_PASSWORD are exposed to the task as
    //          environment variables by Nextflow.

    label 'process_low'
    tag "${id}"
    stageInMode 'copy'
    container "quay.io/biocontainers/ena-webin-cli:9.0.1--hdfd78af_1"

    secret 'WEBIN_ACCOUNT'
    secret 'WEBIN_PASSWORD'

    input:
    tuple val(id), path(submission_item), path(manifest)

    output:
    tuple val(id), path("*webin-cli.report") , emit: webin_report
    tuple val(id), env('STATUS')             , emit: upload_status
    path "versions.yml"                      , emit: versions

    script:

    def mode = params.test_upload ? "-test" : ""
    def submit_or_validate = params.webincli_submit ? "-submit": "-validate"

    """
    # change FASTA path in manifest to current workdir
    export ITEM_FULL_PATH=\$(readlink -f ${submission_item})
    sed 's|^FASTA\t.*|FASTA\t'"\${ITEM_FULL_PATH}"'|g' ${manifest} > ${id}_updated_manifest.manifest

    # Credentials must be double-quoted: inside single quotes bash does not expand
    # the secret variables and webin-cli would receive the literal variable names.
    #
    # The trailing '|| true' keeps the script alive when webin-cli exits non-zero
    # (the task shell runs with -e), so the report-based status check below can
    # still run - in particular the "already exists" re-submission case.
    # NOTE(review): assumes webin-cli still writes webin-cli.report on such errors.
    ena-webin-cli \\
        -context=genome \\
        -manifest=${id}_updated_manifest.manifest \\
        -userName="\$WEBIN_ACCOUNT" \\
        -password="\$WEBIN_PASSWORD" \\
        ${submit_or_validate} \\
        ${mode} || true

    mv webin-cli.report "${id}_webin-cli.report"

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        ena-webin-cli: \$(ena-webin-cli -version 2>&1 )
    END_VERSIONS

    # status check: the last command decides the exit code of the task
    if grep -q "submission has been completed successfully" "${id}_webin-cli.report"; then
        # first time submission completed successfully
        export STATUS="success"
        true
    elif grep -q "object being added already exists in the submission account with accession" "${id}_webin-cli.report"; then
        # there was attempt to re-submit already submitted genome
        export STATUS="success"
        true
    else
        export STATUS="failed"
        false
    fi
    """
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
name: ena_webin_cli
2+
description: ENA data submission tool using Webin account details
3+
keywords:
4+
- ena
5+
- submission
6+
- upload
7+
- webin
8+
tools:
9+
- ena_webin_cli:
10+
description: |
11+
Data submissions to ENA can be made using the Webin command line submission interface (Webin-CLI). Webin submission account credentials are required to use the program.
12+
documentation: https://github.com/enasequence/webin-cli
13+
licence: ["Apache License"]
14+
input:
15+
- meta:
16+
type: map
17+
description: |
18+
Groovy Map containing sample information.
19+
- submission_item:
20+
type: file
21+
description: |
22+
Target FASTA file for submission (mag/bin/assembly)
23+
- manifest:
24+
type: file
25+
description: |
26+
Submission manifest
27+
28+
output:
29+
- meta:
30+
type: map
31+
description: |
32+
Groovy Map containing sample information
33+
e.g. [ id:'test', single_end:false ]
34+
- webin_report:
35+
type: file
36+
description: Submission report
37+
pattern: "*webin-cli.report"
38+
- STATUS:
39+
type: bool
40+
description: Submission status success(true) or failed(false)
41+
- versions:
42+
type: file
43+
description: File containing software versions
44+
pattern: "versions.yml"
45+
46+
authors:
47+
- "@KateSakharova"
48+
- "@ochkalova"
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
process GENOME_UPLOAD {
    // Runs genome_upload to register MAGs/bins with ENA and to generate the
    // manifests and XML files collected under results/{MAG,bin}_upload/.
    //
    // Inputs : table_for_upload  - table passed to --genome_info
    //          meta, mags        - staged into the work dir; not referenced on the
    //                              command line (presumably located via the table -
    //                              verify against the caller)
    //          mags_or_bins_flag - flag name, emitted as "--<flag>"
    // Params : params.upload_tpa, params.upload_force, params.test_upload,
    //          params.ena_genome_study_accession, params.centre_name
    // Secrets: WEBIN_ACCOUNT / WEBIN_PASSWORD, re-exported under the names
    //          ENA_WEBIN / ENA_WEBIN_PASSWORD.

    label 'process_low'

    container "community.wave.seqera.io/library/pip_genome-uploader:e2815984bcdc3e83"

    secret 'WEBIN_ACCOUNT'
    secret 'WEBIN_PASSWORD'

    input:
    path(table_for_upload)
    tuple val(meta), path(mags)
    val(mags_or_bins_flag)

    output:
    path "results/{MAG,bin}_upload/manifests*/*.manifest"       , emit: manifests
    path "results/{MAG,bin}_upload/ENA_backup.json"             , emit: ena_upload_backup_json
    path "results/{MAG,bin}_upload/genome_samples.xml"          , emit: upload_genome_samples
    path "results/{MAG,bin}_upload/registered_{MAGs,bins}*.tsv" , emit: upload_registered_mags
    path "results/{MAG,bin}_upload/submission.xml"              , emit: upload_submission_xml
    path "versions.yml"                                         , emit: versions

    script:
    def tpa   = params.upload_tpa     ? "--tpa"   : ""
    def force = params.upload_force   ? "--force" : ""
    // "--live" is only added when this is NOT a test upload
    def mode  = (!params.test_upload) ? "--live"  : ""
    def args  = task.ext.args ?: ''

    """
    export ENA_WEBIN="\$WEBIN_ACCOUNT"
    export ENA_WEBIN_PASSWORD="\$WEBIN_PASSWORD"

    # User-supplied values are quoted so that a centre name containing spaces
    # is passed to genome_upload as a single argument.
    genome_upload \\
        -u "${params.ena_genome_study_accession}" \\
        --genome_info ${table_for_upload} \\
        --centre_name "${params.centre_name}" \\
        --${mags_or_bins_flag} \\
        ${tpa} \\
        ${force} \\
        ${mode} \\
        --out results \\
        ${args}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        genome_uploader: \$(genome_upload --version 2>&1 | sed 's/genome_uploader //g')
    END_VERSIONS
    """
}
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
name: genome_upload
2+
description: Python package to upload bins and MAGs to ENA (European Nucleotide Archive)
3+
keywords:
4+
- ena upload
5+
- ena submission
6+
- mags submission
7+
- bins submission
8+
tools:
9+
- genome_upload:
10+
description: |
11+
Generate xmls and manifests necessary for genome submission
12+
Link the genomes you want to submit with the samples/runs used to generate them
13+
documentation: https://github.com/EBI-Metagenomics/genome_uploader
14+
licence: ["Apache License"]
15+
input:
16+
- meta:
17+
type: map
18+
description: |
19+
Groovy Map containing sample information.
20+
e.g. [ id:'test', single_end:false ]
21+
- table_for_upload:
22+
type: file
23+
description: |
24+
Tab-separated table with required information for submission
25+
- mags:
26+
type: file
27+
description: File in FASTA format containing targeted mag/bin/assembly for submission
28+
pattern: "*.{fasta,fna,fas,fa}*"
29+
30+
output:
31+
- meta:
32+
type: map
33+
description: |
34+
Groovy Map containing sample information
35+
e.g. [ id:'test', single_end:false ]
36+
- manifests:
37+
type: file
38+
description: Manifest file required for submission with webin-cli
39+
pattern: "*.manifest"
40+
- ena_upload_backup_json:
41+
type: file
42+
description: JSON file with ENA metadata
43+
pattern: "*ENA_backup.json"
44+
- upload_genome_samples:
45+
type: file
46+
description: Genome samples
47+
pattern: "*genome_samples.xml"
48+
- upload_registered_mags:
49+
type: file
50+
description: Table with uploaded data
51+
pattern: "registered_{MAGs,bins}*.tsv"
52+
- upload_submission_xml:
53+
type: file
54+
description: XML file required for ENA submission
55+
pattern: "*submission.xml"
56+
- versions:
57+
type: file
58+
description: File containing software versions
59+
pattern: "versions.yml"
60+
61+
authors:
62+
- "@KateSakharova"
63+
- "@ochkalova"

0 commit comments

Comments
 (0)