Skip to content
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
8a37c4f
add CREATE_ASSEMBLY_METADATA_CSV module, add versions tracking for al…
ochkalova Apr 24, 2026
4bb213f
add CREATE_GENOME_METADATA_TSV module
ochkalova Apr 24, 2026
6340b4b
refactor metadata files creating modules
ochkalova Apr 27, 2026
7cb2880
add find_concatenate module
ochkalova Apr 28, 2026
c6b7fbc
patch find/concatenate
ochkalova Apr 28, 2026
8304fbf
fix version statement for GENERATE_ASSEMBLY_MANIFEST module
ochkalova Apr 28, 2026
f217ac8
add FIND_CONCATENATE to assemblysubmit and update tests
ochkalova Apr 28, 2026
48abbfd
add FIND_CONCATENATE to genomesubmit and update tests
ochkalova Apr 28, 2026
badfb2b
add additional inputs instead of params usage subwf
ochkalova Apr 24, 2026
47b8698
add additional inputs instead of params usage in ENA_WEBIN_CLI_WRAPPER
ochkalova Apr 24, 2026
c410d5e
add additional inputs in GENOME_UPLOAD
ochkalova Apr 24, 2026
708cb1e
add additional inputs in GENERATE_ASSEMBLY_MANIFEST
ochkalova Apr 24, 2026
fd698fd
update tests
ochkalova Apr 24, 2026
689259a
fix checkm2_db definition
ochkalova Apr 24, 2026
74fa995
remove --upload_force param because it's unnecessary for a user
ochkalova Apr 24, 2026
1371dec
do not reference params in ASSEMBLYSUBMIT and GENOMESUBMIT
ochkalova Apr 27, 2026
93439b4
add test_upload parameter to REGISTERSTUDY and GENERATE_ASSEMBLY_MANI…
ochkalova Apr 27, 2026
ff2d46d
rename checkm2_db_zenodo_id to checkm2_db_download_id for consistency
ochkalova Apr 27, 2026
7a6030c
add missing REGISTERSTUDY input
ochkalova Apr 27, 2026
4cf9a0e
rename webincli_submit to webincli_mode
ochkalova Apr 28, 2026
4530c73
add CREATE_ASSEMBLY_METADATA_CSV module, add versions tracking for al…
ochkalova Apr 24, 2026
0b2d785
add CREATE_GENOME_METADATA_TSV module
ochkalova Apr 24, 2026
c7b2603
add FIND_CONCATENATE to assemblysubmit and update tests
ochkalova Apr 28, 2026
5e435bf
update snapshot
ochkalova Apr 28, 2026
555e4a1
Merge branch 'dev' into feat/metadata_modules
ochkalova Apr 28, 2026
aafeff6
update snapshot
ochkalova Apr 28, 2026
c2c7ba6
add citations
KateSakharova Apr 24, 2026
84b94b7
test custom data in multiqc. So far it works with files _mqc prefix
KateSakharova Apr 24, 2026
e48194f
add custom tables to multiqc
KateSakharova Apr 27, 2026
f828cd5
wip
KateSakharova Apr 28, 2026
e3b8072
wip
KateSakharova Apr 28, 2026
006d953
include topics
KateSakharova Apr 29, 2026
121a75b
Merge branch 'dev' into feature/multiqc-citations
KateSakharova Apr 29, 2026
7a560d8
fix links
KateSakharova Apr 29, 2026
d97a072
different fixes
KateSakharova Apr 29, 2026
689b785
fix bloody multiqc version
KateSakharova Apr 30, 2026
37e6886
lint
KateSakharova Apr 30, 2026
cf71ebf
fix wording and versions
KateSakharova May 1, 2026
4d6dfbe
update snaphots
KateSakharova May 6, 2026
bb7e9ac
revert multiqc container, die in hell!
KateSakharova May 6, 2026
80c4b08
snapshots
KateSakharova May 6, 2026
bbfcaf0
lint
KateSakharova May 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 152 additions & 1 deletion assets/multiqc_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,156 @@ report_section_order:
order: -1002

export_plots: true

disable_general_stats: true
disable_version_detection: true

custom_content:
order:
- assembly_metadata
- genome_metadata
- sample_registration
- submission_results_assemblies
- submission_results_genomes

sp:
assembly_metadata:
fn_re: ".*assemblies_metadata.csv$"
genome_metadata:
fn_re: ".*genomes_metadata.tsv$"
sample_registration:
fn_re: '.*registered.*\.tsv$'
submission_results_genomes:
fn_re: 'genomes_accessions\.tsv$'
submission_results_assemblies:
fn_re: 'assemblies_accessions\.tsv$'

# Custom data configuration
custom_data:
assembly_metadata:
id: "assembly_metadata"
section_name: "Assembly metadata"
description: "Metadata for submitted assemblies"
file_format: "csv"
plot_type: "table"
pconfig:
id: "assembly_metadata"
title: "Assembly metadata table"
headers:
runs:
title: "Runs"
assembly_coverage:
title: "Coverage"
assembler:
title: "Assembler"
assembler_version:
title: "Version"
file_path:
title: "Filepath"
# Header entries are mappings of column options; give `sample` a `title` like its sibling columns.
sample:
  title: "Sample"

genome_metadata:
id: "genome_metadata"
section_name: "Genome metadata"
description: "Metadata for submitted genomes"
file_format: "tsv"
plot_type: "table"
pconfig:
id: "genome_metadata"
title: "Genome metadata table"
headers:
genome_name:
title: "Genome name"
genome_path:
title: "Genome path"
accessions:
title: "Accession"
assembly_software:
title: "Assembly software"
binning_software:
title: "Binning software"
binning_parameters:
title: "Binning parameters"
stats_generation_software:
title: "Stats software"
completeness:
title: "Completeness (%)"
contamination:
title: "Contamination (%)"
genome_coverage:
title: "Coverage"
metagenome:
title: "Metagenome"
co-assembly:
title: "Co-assembly"
broad_environment:
title: "Broad environment"
local_environment:
title: "Local environment"
environmental_medium:
title: "Environmental medium"
rRNA_presence:
title: "rRNA presence"
Comment thread
KateSakharova marked this conversation as resolved.
NCBI_lineage:
title: "NCBI lineage"

sample_registration:
file_format: "tsv"
plot_type: "table"
pconfig:
id: "sample_registration"
title: "Sample registration assigned accessions"
headers:
alias:
title: "ID"
description: "FASTA file for upload"
accession:
title: "ENA sample accession"
description: "Assigned accession after submission"

submission_results_assemblies:
id: "submission_results_assemblies"
section_name: "Submission results"
description: >
Accessions assigned during upload process.
As all assemblies in ENA are submitted as ‘analyses’, for each assembly submission, Webin will report a unique accession number that starts with ERZ.
For most assemblies, this accession number is for internal processing only and will not be visible in the browser.
As a result, for most assemblies you will receive additional post-processing accession numbers starting with GCA_.
Always make a note of any accessions you receive as these are the unique identifiers for each of your submissions to ENA.
The ERZ accession can be used to access information on the progress of the internal processing of each assembly through the [Webin Portal](https://ena-docs.readthedocs.io/en/latest/submit/general-guide/submissions-portal.html).
You can also use this service to see the assigned chromosome, contig, and scaffold accessions.
Please follow the Webin Portal link to learn more about this.
See individual submission guidelines for information on what accessions you will receive for each assembly type.
More information about accessions can be found in ENA docs: https://ena-docs.readthedocs.io/en/latest/submit/assembly.html#accessions.
file_format: "tsv"
plot_type: "table"
pconfig:
id: "Assemblies assigned accessions"
title: "Submission results"
col1_header: "ID"
col2_header: "ENA accession"
headers:
alias:
title: "ID"
description: "FASTA file for upload"
accession:
title: "ENA accession"
description: "Assigned accession after submission"

submission_results_genomes:
id: "submission_results_genomes"
section_name: "Submission results"
description: "Accessions assigned during upload process."
file_format: "tsv"
plot_type: "table"
pconfig:
id: "Genomes assigned accessions"
title: "Submission results"
col1_header: "ID"
col2_header: "ENA accession"
headers:
alias:
title: "ID"
description: "FASTA file for upload"
accession:
title: "ENA accession"
description: "Assigned accession after submission"
7 changes: 6 additions & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,12 @@ process {
//

withName: 'MULTIQC' {
ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' }
ext.args = {
[
params.multiqc_title ? "--title \"$params.multiqc_title\"" : '',
"-p"
Comment thread
KateSakharova marked this conversation as resolved.
].findAll().join(' ')
}
publishDir = [
path: { "${params.outdir}/multiqc" },
mode: params.publish_dir_mode,
Expand Down
6 changes: 6 additions & 0 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ When `--mode metagenomic_assemblies` is used, results are written under `metagen

Assembly study registration, manifest generation, and Webin-CLI submission are executed by the workflow, but their intermediate outputs are not currently published into `--outdir` by the pipeline.

> [!NOTE]
> As all assemblies in ENA are submitted as ‘analyses’, for each assembly submission, Webin will report a unique accession number that starts with ERZ.
> For most assemblies, this accession number is for **internal processing only** and will not be visible in the browser.
> As a result, for most assemblies you will receive additional post-processing accession numbers starting with GCA\_.
> More information about accessions can be found in ENA docs: https://ena-docs.readthedocs.io/en/latest/submit/assembly.html#accessions.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
> More information about accessions can be found in ENA docs: https://ena-docs.readthedocs.io/en/latest/submit/assembly.html#accessions.'
> More information about accessions can be found in ENA docs: https://ena-docs.readthedocs.io/en/latest/submit/assembly.html#accessions.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be in the MAGs/bins section
And specify that it's for MAGs only.
Because it's something that we know for sure.

I know that ERZs for bins and primary metagenomes ARE displayed on the portal and they are used to reference the data forever. Docs state the opposite, but I would ask ENA before assume that it's true for ALL assemblies :/


## Common outputs

### MultiQC
Expand Down
6 changes: 4 additions & 2 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ workflow NFCORE_SEQSUBMIT {
params.upload_tpa,
params.test_upload,
params.webin_cli_version,
params.webincli_mode
params.webincli_mode,
params.outdir
)
ch_multiqc_report = GENOMESUBMIT.out.multiqc_report
} else if (params.mode == "metagenomic_assemblies") {
Expand All @@ -66,7 +67,8 @@ workflow NFCORE_SEQSUBMIT {
params.upload_tpa,
params.test_upload,
params.webin_cli_version,
params.webincli_mode
params.webincli_mode,
params.outdir
)
ch_multiqc_report = ASSEMBLYSUBMIT.out.multiqc_report
}
Expand Down
2 changes: 1 addition & 1 deletion modules/local/ena_webin_cli_wrapper/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ process ENA_WEBIN_CLI_WRAPPER {

label 'process_low'
tag "${meta.id}"
container "quay.io/microbiome-informatics/java_mgnify-pipelines-toolkit:1.4.21"
container "community.wave.seqera.io/library/ena-webin-cli_mgnify-pipelines-toolkit:0fd318932c5ba88e"
Comment thread
KateSakharova marked this conversation as resolved.
stageInMode 'copy'

input:
Expand Down
3 changes: 2 additions & 1 deletion modules/nf-core/multiqc/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

58 changes: 49 additions & 9 deletions subworkflows/local/utils_nfcore_seqsubmit_pipeline/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -183,24 +183,65 @@ def validateInputSamplesheet(input) {
// Generate methods description for MultiQC
//
def toolCitationText() {
// TODO nf-core: Optionally add in-text citation tools to this list.
// Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "",
// Uncomment function in methodsDescriptionText to render in MultiQC report
Comment thread
KateSakharova marked this conversation as resolved.
Outdated

def mgnify_tools = [
params.mode == "metagenomic_assemblies" ? "Manifests for metagenomic assemblies upload were created by assembly_uploader (Richardson et al. 2023)." : "",
params.mode != "metagenomic_assemblies" ? "Samples registration and upload manifests generation for bins/MAGs were done by genome_uploader (Gurbich et al. 2023).": ""
Comment thread
KateSakharova marked this conversation as resolved.
Outdated
].join(' ').trim()

def ena_tools = [
"Submission to ENA was done by webin-cli (David, Yuan, et al. 2025)."
].join(' ').trim()

def preprocessing_tools = [
"Input FASTA validation was done by py_fasta_validator (Edwards et al. 2023)."
].join(' ').trim()

def stats_tools = [
params.mode != "metagenomic_assemblies" ? "Completeness and contamination metrics was performed by CheckM2 (Chklovski et al. 2023)." : "",
"Coverage calculation for metagenomic assemblies/bins/MAGs was done by CoverM (Aroney et el. 2025)."
].join(' ').trim()

def taxonomy_tools = [
params.mode != "metagenomic_assemblies" ? "Taxonomy was assigned by CAT_pack (Von Meijenfeldt et al. 2019)." : ""
].join(' ').trim()

def rna_tools = [
params.mode != "metagenomic_assemblies" ? "Bacterial ribosomal RNA prediction was performed by barrnap (Seemann T. 2013)." : "",
params.mode != "metagenomic_assemblies" ? "Transfer RNA genes were detected by tRNAscan-SE (Chan et al. 2021)." : ""
].join(' ').trim()

def postprocessing_text = "MultiQC (Ewels et al. 2016)."

def citation_text = [
"Tools used in the workflow included:",
"MultiQC (Ewels et al. 2016)",
"."
preprocessing_tools,
stats_tools,
taxonomy_tools,
rna_tools,
mgnify_tools,
ena_tools,
postprocessing_text
].join(' ').trim()

return citation_text
}

def toolBibliographyText() {
// TODO nf-core: Optionally add bibliographic entries to this list.
// Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "<li>Author (2023) Pub name, Journal, DOI</li>" : "",
// Uncomment function in methodsDescriptionText to render in MultiQC report
Comment thread
KateSakharova marked this conversation as resolved.
Outdated
def reference_text = [
"<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>"
'<li>Richardson, L., Allen, B., Baldi, G., Beracochea, M., Bileschi, M. L., Burdett, T., ... & Finn, R. D. (2023). MGnify: the microbiome sequence data analysis resource in 2023. Nucleic acids research, 51(D1), D753-D759. doi: <a href="https://doi.org/10.1093/nar/gkac1080"10.1093/nar/gkac1080</a></li>',
'<li>Gurbich, T. A., Almeida, A., Beracochea, M., Burdett, T., Burgin, J., Cochrane, G., ... & Finn, R. D. (2023). MGnify genomes: a resource for biome-specific microbial genome catalogues. Journal of molecular biology, 435(14), 168016. doi: <a href="https://doi.org/10.1016/j.jmb.2023.168016"10.1016/j.jmb.2023.168016</a></li>',
'<li>webin-cli: David, Y., Alisha, A., Awais, A., Rajkumar, D., Dipayan, G., Muhammad, H., ... & Ugis, S. (2026). The European Nucleotide Archive in 2025. Nucleic Acids Research, 54(D1), D120-D127. doi: <a href="https://doi.org/10.1093/nar/gkaf1295"10.1093/nar/gkaf1295</a></li>',
'<li>Edwards, R. (2023). linsalrob/py_fasta_validator: Compressy. doi: <a href="https://doi.org/10.5281/zenodo.5002710"10.5281/zenodo.5002710</a></li>',
'<li>Chklovski, A., Parks, D. H., Woodcroft, B. J., & Tyson, G. W. (2023). CheckM2: a rapid, scalable and accurate tool for assessing microbial genome quality using machine learning. Nature Methods, 20(8), 1203-1212. doi: <a href="https://doi.org/10.1038/s41592-023-01940-w"10.1038/s41592-023-01940-w</a></li>',
'<li>Aroney, S. T., Newell, R. J., Nissen, J. N., Camargo, A. P., Tyson, G. W., & Woodcroft, B. J. (2025). CoverM: read alignment statistics for metagenomics. Bioinformatics, 41(4), btaf147. doi: <a href="https://doi.org/10.1093/bioinformatics/btaf147"10.1093/bioinformatics/btaf147</a></li>',
'<li>Von Meijenfeldt, F. B., Arkhipova, K., Cambuy, D. D., Coutinho, F. H., & Dutilh, B. E. (2019). Robust taxonomic classification of uncharted microbial sequences and bins with CAT and BAT. Genome biology, 20(1), 217. doi: <a href="https://doi.org/10.1038/s41467-024-47155-1"10.1038/s41467-024-47155-1</a></li>',
'<li>barrnap: Seemann, T. (2014). barrnap: Bacterial ribosomal RNA predictor. barrnap: Bacterial ribosomal RNA predictor. <a href="http://vicbioinformatics.com/"vicbioinformatics.com</a></li>',
'<li>Chan, P. P., Lin, B. Y., Mak, A. J., & Lowe, T. M. (2021). tRNAscan-SE 2.0: improved detection and functional classification of transfer RNA genes. Nucleic acids research, 49(16), 9077-9096. doi: <a href="https://doi.org/10.1093/nar/gkab688"10.1093/nar/gkab688</a></li>',
'<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: <a href="https://doi.org/10.1093/bioinformatics/btw354"10.1093/bioinformatics/btw354</a></li>'
].join(' ').trim()

return reference_text
Expand Down Expand Up @@ -230,9 +271,8 @@ def methodsDescriptionText(mqc_methods_yaml) {
meta["tool_citations"] = ""
meta["tool_bibliography"] = ""

// TODO nf-core: Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled!
// meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".")
// meta["tool_bibliography"] = toolBibliographyText()
meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".")
meta["tool_bibliography"] = toolBibliographyText()


def methods_text = mqc_methods_yaml.text
Expand Down
27 changes: 24 additions & 3 deletions workflows/assemblysubmit.nf
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ workflow ASSEMBLYSUBMIT {
test_upload // val: true for test upload mode
webin_cli_version // val: WebinCLI tool version to download and use for submission
webincli_mode // val: either 'validate' or 'submit' to specify WebinCLI mode of operation
outdir

main:
ch_versions = channel.empty()
Expand Down Expand Up @@ -195,13 +196,31 @@ workflow ASSEMBLYSUBMIT {
//
// Collate and save software versions
//
softwareVersionsToYAML(ch_versions)
def topic_versions = channel.topic("versions")
.distinct()
.branch { entry ->
versions_file: entry instanceof Path
versions_tuple: true
}

def topic_versions_string = topic_versions.versions_tuple
.map { process, tool, version ->
[ process[process.lastIndexOf(':')+1..-1], " ${tool}: ${version}" ]
}
.groupTuple(by:0)
.map { process, tool_versions ->
tool_versions.unique().sort()
"${process}:\n${tool_versions.join('\n')}"
}

def ch_collated_versions = softwareVersionsToYAML(ch_versions.mix(topic_versions.versions_file))
.mix(topic_versions_string)
.collectFile(
storeDir: "${params.outdir}/pipeline_info",
storeDir: "${outdir}/pipeline_info",
name: 'nf_core_' + 'seqsubmit_software_' + 'mqc_' + 'versions.yml',
sort: true,
newLine: true
).set { ch_collated_versions }
)


//
Expand All @@ -227,6 +246,8 @@ workflow ASSEMBLYSUBMIT {
ch_methods_description = channel.value(
methodsDescriptionText(ch_multiqc_custom_methods_description))

ch_multiqc_files = ch_multiqc_files.mix(CONCAT_ACCESSIONS.out.file_out)
ch_multiqc_files = ch_multiqc_files.mix(CREATE_ASSEMBLY_METADATA_CSV.out.csv)
Comment thread
KateSakharova marked this conversation as resolved.
Outdated
ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions)
ch_multiqc_files = ch_multiqc_files.mix(
ch_methods_description.collectFile(
Expand Down
Loading
Loading