Kate's stuff copy pasted + pre-commit

vagkaratzas · vagkaratzas · commit d4dd5c357fd5 · 2025-10-21T15:42:06.000+01:00
diff --git a/README.md b/README.md
@@ -21,7 +21,27 @@
 
 ## Introduction
 
-**nf-core/seqsubmit** is a bioinformatics pipeline that ...
+**nf-core/seqsubmit** is a bioinformatics pipeline that submits data to public archives such as [ENA](https://www.ebi.ac.uk/ena/browser/home).
+
+The pipeline will have several modes:
+
+- `mags` for MAG submission using the **genome_submitter** workflow
+- `bins` for bin submission using the **genome_submitter** workflow
+- `assemblies` for assembly submission using the **assembly_submitter** workflow
+
+## Requirements
+
+- A Webin account, registered at https://www.ebi.ac.uk/ena/submit/webin/login
+- Raw reads already submitted to [INSDC](https://www.insdc.org/)
+
+## genome_submitter
+
+Workflow to submit MAGs and/or bins to ENA.
+
+It takes an input `samplesheet.csv` with the fields required by [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader). The fields are described in the [docs](https://github.com/EBI-Metagenomics/genome_uploader/blob/main/README.md#input-tsv-and-fields).
+For now, the workflow converts the CSV into the required TSV.
+
+_A future implementation will detect missing fields (for example, completeness and contamination) and run steps to fill in the gaps._
 
 <!-- TODO nf-core:
    Complete this sentence with a 2-3 sentence summary of what types of data the pipeline ingests, a brief overview of the
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -13,19 +13,73 @@
                 "errorMessage": "Sample name must be provided and cannot contain spaces",
                 "meta": ["id"]
             },
-            "fastq_1": {
+            "fasta": {
                 "type": "string",
                 "format": "file-path",
                 "exists": true,
-                "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?q\\.gz$",
-                "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+                "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?a\\.gz$",
+                "errorMessage": "FASTA file must be provided, cannot contain spaces and must have extension '.fa.gz' or '.fasta.gz'",
+                "description": "MAG/bin sequence file"
             },
-            "fastq_2": {
+            "accession": {
                 "type": "string",
-                "format": "file-path",
-                "exists": true,
-                "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?q\\.gz$",
-                "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+                "description": "Run or assembly ENA accession"
+            },
+            "assembly_software": {
+                "type": "string",
+                "description": "Tool name and version that was used to assemble data"
+            },
+            "binning_software": {
+                "type": "string",
+                "description": "Tool name and version that was used to bin data"
+            },
+            "binning_parameters": {
+                "type": "string",
+                "description": "Arguments used to bin data different from default"
+            },
+            "stats_generation_software": {
+                "type": "string",
+                "description": "Tool(s) used to estimate completeness and contamination"
+            },
+            "completeness": {
+                "type": "number",
+                "description": "MAG/bin completeness value"
+            },
+            "contamination": {
+                "type": "number",
+                "description": "MAG/bin contamination value"
+            },
+            "genome_coverage": {
+                "type": "number",
+                "description": "MAG/bin coverage value"
+            },
+            "metagenome": {
+                "type": "string",
+                "description": "ENA accepted metagenome name"
+            },
+            "co-assembly": {
+                "type": "boolean",
+                "description": "True if data was co-assembled, False otherwise"
+            },
+            "broad_environment": {
+                "type": "string",
+                "description": "broad ecological context of a sample"
+            },
+            "local_environment": {
+                "type": "string",
+                "description": "local ecological context"
+            },
+            "environmental_medium": {
+                "type": "string",
+                "description": "material displaced by the sample"
+            },
+            "rRNA_presence": {
+                "type": "boolean",
+                "description": "True/False if rRNA genes detected"
+            },
+            "NCBI_lineage": {
+                "type": "string",
+                "description": "full NCBI lineage - format: x;y;z"
             }
         },
         "required": ["sample", "fastq_1"]
diff --git a/conf/test.config b/conf/test.config
@@ -25,5 +25,12 @@ params {
     // Input data
     // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
     // TODO nf-core: Give any required params for the test so that command line flags are not needed
-    input  = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv'
+    input  = params.pipelines_testdata_base_path + 'seqsubmit/samplesheet/sampleshee.csv'
+
+    mode                       = "mags"
+    ena_genome_study_accession = "PRJEB98843"
+    centre_name                = "TEST_CENTER"
+    webin_account              = "Webin-47019"
+    webin_password             = 'ws2017'
+
 }
diff --git a/main.nf b/main.nf
@@ -15,7 +15,8 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
-include { SEQSUBMIT  } from './workflows/seqsubmit'
+include { SEQSUBMIT               } from './workflows/seqsubmit'
+include { GENOMESUBMIT            } from './workflows/genomesubmit'
 include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_seqsubmit_pipeline'
 include { PIPELINE_COMPLETION     } from './subworkflows/local/utils_nfcore_seqsubmit_pipeline'
 /*
@@ -33,15 +34,27 @@ workflow NFCORE_SEQSUBMIT {
     samplesheet // channel: samplesheet read in from --input
 
     main:
-
+    ch_multiqc_report = Channel.empty()
     //
     // WORKFLOW: Run pipeline
     //
-    SEQSUBMIT (
-        samplesheet
-    )
+    if ((params.mode == "mags") || (params.mode == "bins")) {
+        GENOMESUBMIT (
+            samplesheet,
+            params.mode
+        )
+        ch_multiqc_report = GENOMESUBMIT.out.multiqc_report
+    } else {
+        SEQSUBMIT (
+            samplesheet
+        )
+        ch_multiqc_report = SEQSUBMIT.out.multiqc_report
+    }
+
+
+
     emit:
-    multiqc_report = SEQSUBMIT.out.multiqc_report // channel: /path/to/multiqc_report.html
+    multiqc_report = ch_multiqc_report // channel: /path/to/multiqc_report.html
 }
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/modules/local/ena_webin_cli/main.nf b/modules/local/ena_webin_cli/main.nf
@@ -0,0 +1,68 @@
+/*
+ * ENA_WEBIN_CLI: validate or submit one genome FASTA to ENA with ena-webin-cli.
+ *
+ * Input : tuple of (id, FASTA file to submit, Webin manifest describing it).
+ * Output: the report renamed to <id>_webin-cli.report, the STATUS env value
+ *         ("success" or "failed"; the script exits non-zero on "failed") and versions.yml.
+ * Credentials come from the Nextflow secrets WEBIN_ACCOUNT / WEBIN_PASSWORD.
+ */
+process ENA_WEBIN_CLI {
+
+    label 'process_low'
+    tag "${id}"
+    stageInMode 'copy'
+    container "quay.io/biocontainers/ena-webin-cli:9.0.1--hdfd78af_1"
+
+    secret 'WEBIN_ACCOUNT'
+    secret 'WEBIN_PASSWORD'
+
+    input:
+    tuple val(id), path(submission_item), path(manifest)
+
+    output:
+    tuple val(id), path("*webin-cli.report") , emit: webin_report
+    tuple val(id), env('STATUS')             , emit: upload_status
+    path "versions.yml"                      , emit: versions
+
+    script:
+    // params.test_upload adds -test; params.webincli_submit selects -submit instead of -validate
+    def mode               = params.test_upload     ? "-test" : ""
+    def submit_or_validate = params.webincli_submit ? "-submit": "-validate"
+
+    """
+    # change FASTA path in manifest to current workdir
+    export ITEM_FULL_PATH=\$(readlink -f ${submission_item})
+    sed 's|^FASTA\t.*|FASTA\t'"\${ITEM_FULL_PATH}"'|g' ${manifest} > ${id}_updated_manifest.manifest
+
+    # credentials are double-quoted so the shell expands the secrets; single quotes
+    # would pass the literal text \$WEBIN_ACCOUNT / \$WEBIN_PASSWORD to webin-cli
+    ena-webin-cli \\
+        -context=genome \\
+        -manifest=${id}_updated_manifest.manifest \\
+        -userName="\$WEBIN_ACCOUNT" \\
+        -password="\$WEBIN_PASSWORD" \\
+        ${submit_or_validate} \\
+        ${mode}
+
+    mv webin-cli.report "${id}_webin-cli.report"
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        ena-webin-cli: \$(ena-webin-cli -version 2>&1 )
+    END_VERSIONS
+
+    # status check
+    if grep -q "submission has been completed successfully" "${id}_webin-cli.report"; then
+        # first time submission completed successfully
+        export STATUS="success"
+        true
+    elif grep -q "object being added already exists in the submission account with accession" "${id}_webin-cli.report"; then
+        # there was attempt to re-submit already submitted genome
+        export STATUS="success"
+        true
+    else
+        export STATUS="failed"
+        false
+    fi
+    """
+}
diff --git a/modules/local/ena_webin_cli/meta.yml b/modules/local/ena_webin_cli/meta.yml
@@ -0,0 +1,48 @@
+name: ena_webin_cli
+description: ENA data submission tool using Webin account details
+keywords:
+  - ena
+  - submission
+  - upload
+  - webin
+tools:
+  - ena_webin_cli:
+      description: |
+        Data submissions to ENA can be made using the Webin command line submission interface (Webin-CLI). Webin submission account credentials are required to use the program.
+      documentation: https://github.com/enasequence/webin-cli
+      licence: ["Apache License"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information.
+  - submission_item:
+      type: file
+      description: |
+        Target FASTA file for submission (mag/bin/assembly)
+  - manifest:
+      type: file
+      description: |
+        Submission manifest
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - webin_report:
+      type: file
+      description: Submission report
+      pattern: "*webin-cli.report"
+  - upload_status:
+      type: string
+      description: Submission status taken from the STATUS environment variable, either "success" or "failed"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@KateSakharova"
+  - "@ochkalova"
diff --git a/modules/local/genome_upload/main.nf b/modules/local/genome_upload/main.nf
@@ -0,0 +1,63 @@
+/*
+ * GENOME_UPLOAD: run genome_upload (genome_uploader) to prepare ENA genome
+ * submission files for MAGs or bins.
+ *
+ * Inputs : table_for_upload  - passed to --genome_info
+ *          (meta, mags)      - staged into the task; not referenced on the command line
+ *          mags_or_bins_flag - emitted verbatim as --<flag>
+ * Outputs: manifests, ENA_backup.json, genome_samples.xml, registered_*.tsv and
+ *          submission.xml from results/{MAG,bin}_upload, plus versions.yml.
+ * Credentials come from the Nextflow secrets WEBIN_ACCOUNT / WEBIN_PASSWORD.
+ */
+process GENOME_UPLOAD {
+
+    label 'process_low'
+
+    container "community.wave.seqera.io/library/pip_genome-uploader:e2815984bcdc3e83"
+
+    secret 'WEBIN_ACCOUNT'
+    secret 'WEBIN_PASSWORD'
+
+    input:
+    path(table_for_upload)
+    tuple val(meta), path(mags)
+    val(mags_or_bins_flag)
+
+    output:
+    path "results/{MAG,bin}_upload/manifests*/*.manifest"       , emit: manifests
+    path "results/{MAG,bin}_upload/ENA_backup.json"             , emit: ena_upload_backup_json
+    path "results/{MAG,bin}_upload/genome_samples.xml"          , emit: upload_genome_samples
+    path "results/{MAG,bin}_upload/registered_{MAGs,bins}*.tsv" , emit: upload_registered_mags
+    path "results/{MAG,bin}_upload/submission.xml"              , emit: upload_submission_xml
+    path "versions.yml"                                         , emit: versions
+
+    script:
+    // optional flags: --tpa / --force when enabled; --live unless params.test_upload is set
+    def tpa      = params.upload_tpa  ? "--tpa"  : ""
+    def force    = params.upload_force  ? "--force"  : ""
+    def mode     = (!params.test_upload) ? "--live" : ""
+    def args     = task.ext.args ?: ''
+
+    """
+    # re-export the Webin secrets as ENA_WEBIN / ENA_WEBIN_PASSWORD (presumably the names genome_upload reads - confirm)
+    export ENA_WEBIN=\$WEBIN_ACCOUNT
+    export ENA_WEBIN_PASSWORD=\$WEBIN_PASSWORD
+
+    # param values are quoted so an accession or centre name containing spaces stays one argument
+    genome_upload \\
+        -u "${params.ena_genome_study_accession}" \\
+        --genome_info ${table_for_upload} \\
+        --centre_name "${params.centre_name}" \\
+        --${mags_or_bins_flag} \\
+        ${tpa} \\
+        ${force} \\
+        ${mode} \\
+        --out results \\
+        ${args}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        genome_uploader: \$(genome_upload --version 2>&1 | sed 's/genome_uploader //g')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/genome_upload/meta.yml b/modules/local/genome_upload/meta.yml
@@ -0,0 +1,63 @@
+name: genome_upload
+description: Python package to upload bins and MAGs to ENA (European Nucleotide Archive)
+keywords:
+  - ena upload
+  - ena submission
+  - mags submission
+  - bins submission
+tools:
+  - genome_upload:
+      description: |
+        Generate xmls and manifests necessary for genome submission
+        Link the genomes you want to submit with the samples/runs used to generate them
+      documentation: https://github.com/EBI-Metagenomics/genome_uploader
+      licence: ["Apache License"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information.
+        e.g. [ id:'test', single_end:false ]
+  - table_for_upload:
+      type: file
+      description: |
+        Tab-separated table with required information for submission
+  - mags:
+      type: file
+      description: File in FASTA format containing targeted mag/bin/assembly for submission
+      pattern: "*.{fasta,fna,fas,fa}*"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - manifests:
+      type: file
+      description: Manifest file required for submission with webin-cli
+      pattern: "*.manifest"
+  - ena_upload_backup_json:
+      type: file
+      description: JSON file with ENA metadata
+      pattern: "*ENA_backup.json"
+  - upload_genome_samples:
+      type: file
+      description: Genome samples
+      pattern: "*genome_samples.xml"
+  - upload_registered_mags:
+      type: file
+      description: Table with uploaded data
+      pattern: "registered_{MAGs,bins}*.tsv"
+  - upload_submission_xml:
+      type: file
+      description: XML file required for ENA submission
+      pattern: "*submission.xml"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@KateSakharova"
+  - "@ochkalova"
diff --git a/nextflow.config b/nextflow.config
diff --git a/workflows/genomesubmit.nf b/workflows/genomesubmit.nf