nf-core · julianu · May 12, 2026 · May 12, 2026
diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,3 +1,3 @@
-sample,fastq_1,fastq_2
-SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz
-SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,
+ID,spectrum_file,fasta
+1,/path/to/run1.raw,/path/to/proteins.fasta
+2,/path/to/run2.mzML,/path/to/proteins.fasta
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -18,6 +18,13 @@
                 "exists": true,
                 "pattern": "^\\S+\\.(raw|RAW|mzML|mzml|d|)(.tar.(gz|bz2)|.zip|.gz)?$",
                 "errorMessage": "MS file name, cannot contain spaces and must have one of the extensions: raw | RAW | mzML | d"
+            },
+            "fasta": {
+                "type": "string",
+                "format": "path",
+                "exists": true,
+                "pattern": "^\\S+\\.(fasta|fa|fas|faa)(\\.gz)?$",
+                "errorMessage": "FASTA file cannot contain spaces and must have one of the extensions: fasta | fa | fas | faa (optionally gzipped: .gz)"
             }
         },
         "required": ["ID", "spectrum_file"]

diff --git a/main.nf b/main.nf
@@ -30,7 +30,7 @@ include { PIPELINE_COMPLETION     } from './subworkflows/local/utils_nfcore_mspe
 workflow NFCORE_MSPEPID {
 
     take:
-    samplesheet // channel: samplesheet read in from --input
+    samplesheet // channel: [meta, spectrum_file, fasta_file] read in from --input
 
     main:
 
@@ -40,7 +40,6 @@ workflow NFCORE_MSPEPID {
     MSPEPID(
         samplesheet,
         params.outdir,
-        params.fasta,
         params.entrapment_fold,
         params.skip_decoy_generation,
         params.precursor_tol_ppm,

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -47,9 +47,9 @@
                 "fasta": {
                     "type": "string",
                     "fa_icon": "fas fa-file",
-                    "pattern": ".(fasta|faa)$",
-                    "description": "Input FASTA protein database",
-                    "help_text": "Path to the protein database file. Whether it already contains decoys is set by the --skip_decoy_generation parameter."
+                    "pattern": "\\.(fasta|fa|fas|faa)(\\.gz)?$",
+                    "description": "Global input FASTA protein database (mutually exclusive with per-run 'fasta' column in the samplesheet)",
+                    "help_text": "Path to a single protein database file applied to all MS runs. Cannot be used together with the 'fasta' column in the samplesheet. Whether the file already contains decoys is controlled by --skip_decoy_generation."
                 },
                 "entrapment_fold": {
                     "type": "integer",

diff --git a/subworkflows/local/spectra_identification/main.nf b/subworkflows/local/spectra_identification/main.nf
@@ -7,8 +7,8 @@ include { PSMUTILSCONVERSIONS } from '../../../modules/local/psmutilsconversions
 
 workflow SPECTRA_IDENTIFICATION {
     take:
-    ch_fasta
-    ch_spectra_files // val(meta), path(mzml), path(raw_spectra)
+    ch_spectra_files    // val(meta), path(mzml), path(raw_spectra)
+    ch_fasta_db         // channel: [sample_id, db_fasta] one item per sample
     precursor_tol_ppm
     fragment_tol_da
     run_comet
@@ -21,13 +21,14 @@ workflow SPECTRA_IDENTIFICATION {
     main:
     ch_versions = channel.empty()
 
-    // TODO: this will become the identifications, probably with some meta mapping?
+    // this will contain the identifications, with some meta data
     ch_identifications = channel.empty()
 
-    // prepare the input channel for identifications
-    // TODO: this right now only adds the fasta - must be adapted for per sample DB
-    // TODO: also adapt for per-sample parameters
-    ch_ident_in = ch_spectra_files.combine(ch_fasta.map { _meta, fasta -> [fasta] })
+    // join each spectrum file with its per-run database FASTA
+    ch_ident_in = ch_spectra_files
+        .map { meta, mzml, raw -> [meta.id, meta, mzml, raw] }
+        .join(ch_fasta_db, by: 0)
+        .map { _id, meta, mzml, raw, fasta -> [meta, mzml, raw, fasta] }
 
     // run Comet, if enabled
     if (run_comet) {
@@ -71,15 +72,19 @@ workflow SPECTRA_IDENTIFICATION {
         )
         ch_versions = ch_versions.mix(SAGECONFIG.out.versions_python)
 
-        ch_sage_spectra = ch_spectra_files.map { meta, mzml, _raw_spectra -> [meta, mzml] }
-        // add empty meta information for compatibility and convert to value channel
-        ch_sage_config = SAGECONFIG.out.config.map { config -> [["ID": "SAGE_CONFIG"], config] }.first()
-        // convert to value channel
-        ch_sage_fasta = ch_fasta.first()
+        ch_sage_config = SAGECONFIG.out.config.map { config -> [['ID': 'SAGE_CONFIG'], config] }.first()
+
+        // Re-use ch_ident_in (already joined with per-run fasta) and split into
+        // the two separate channels SAGEBETA requires.
+        ch_sage_joined = ch_ident_in
+            .multiMap { meta, mzml, _raw, fasta ->
+                spectra: [meta, mzml]
+                fasta:   [[id: fasta.getBaseName()], fasta]
+            }
 
         SAGEBETA(
-            ch_sage_spectra,
-            ch_sage_fasta,
+            ch_sage_joined.spectra,
+            ch_sage_joined.fasta,
             ch_sage_config,
         )
         ch_versions = ch_versions.mix(SAGEBETA.out.versions_sagebeta)

diff --git a/subworkflows/local/utils_nfcore_mspepid_pipeline/main.nf b/subworkflows/local/utils_nfcore_mspepid_pipeline/main.nf
@@ -102,21 +102,36 @@ workflow PIPELINE_INITIALISATION {
     // Create channel from input file provided through params.input
     //
 
+    // Validate FASTA source mutual exclusivity before building channels
+    def raw_rows = samplesheetToList(input, "${projectDir}/assets/schema_input.json")
+    def sheet_has_fasta = raw_rows.any  { row -> row.size() > 2 && row[2] }
+    def sheet_all_fasta = raw_rows.every { row -> row.size() > 2 && row[2] }
+
+    if (params.fasta && sheet_has_fasta) {
+        error("Conflicting FASTA sources: '--fasta' parameter and 'fasta' column in the samplesheet are mutually exclusive. Please use only one.")
+    }
+    if (!params.fasta && !sheet_has_fasta) {
+        error("No FASTA provided. Please either use '--fasta' or add a 'fasta' column to every row in the samplesheet.")
+    }
+    if (!params.fasta && sheet_has_fasta && !sheet_all_fasta) {
+        error("Incomplete FASTA column: either provide one fasta for each run in the samplesheet, or use the global '--fasta' parameter.")
+    }
+
     channel
-        .fromList(samplesheetToList(input, "${projectDir}/assets/schema_input.json"))
+        .fromList(raw_rows)
         .map { row ->
-            def meta = row[0]
-            def spectrum_file = row.size() > 1 ? row[1] : null
-            if (!spectrum_file) {
-                error("Please check input samplesheet -> Missing required column 'spectrum_file' for sample ID: ${meta.id}")
-            }
-            def sample_meta = meta + [sampleId: meta.id, id: "${meta.id}-${spectrum_file.simpleName}"]
-            return [sample_meta.id, sample_meta, spectrum_file]
+            def meta          = row[0]
+            def spectrum_file = row[1]
+            // Resolve fasta: either from the global --fasta param or from the samplesheet column.
+            // Mutual exclusivity and completeness were already validated above.
+            def fasta_file    = params.fasta
+                ? file(params.fasta, checkIfExists: true)
+                : row[2]
+            def sample_meta   = meta + [sampleId: meta.id, id: "${meta.id}-${spectrum_file.simpleName}"]
+            return [sample_meta.id, sample_meta, spectrum_file, fasta_file] // leading id is the groupTuple key
         }
         .groupTuple()
-        .map { samplesheet ->
-            validateInputSamplesheet(samplesheet)
-        }
+        .map { samplesheet -> validateInputSamplesheet(samplesheet) }
         .set { ch_samplesheet }
 
     emit:
@@ -177,13 +192,13 @@ workflow PIPELINE_COMPLETION {
 // Validate channels from input samplesheet
 //
 def validateInputSamplesheet(input) {
-    def (metas, spectra_files) = input[1..2]
+    def (metas, spectra_files, fasta_files) = input[1..3] // input[0] is the grouping key; [1..3] are the grouped lists
 
     if (spectra_files.size() != 1) {
         error("Please check input samplesheet -> Expected exactly one 'spectrum_file' per sample row after grouping, got ${spectra_files.size()} for: ${metas[0].id}")
     }
 
-    return [metas[0], spectra_files[0]]
+    return [metas[0], spectra_files[0], fasta_files[0]] // un-grouped tuple: [meta, spectrum_file, fasta_file]
 }
 //
 // Generate methods description for MultiQC

diff --git a/workflows/mspepid.nf b/workflows/mspepid.nf
@@ -20,9 +20,8 @@ include { SPECTRA_RESCORING      } from '../subworkflows/local/spectra_rescoring
 
 workflow MSPEPID {
     take:
-    ch_samplesheet // channel: samplesheet read in from --input
+    ch_samplesheet    // channel: [meta, spectrum_file, fasta_file] read in from --input
     outdir
-    fasta // string: path to fasta file
     entrapment_fold // integer: fold for entrapment generation, 0 for none
     skip_decoy_generation // boolean: whether to skip decoy generation
     precursor_tol_ppm // integer: Precursor mass tolerance in ppm for spectra identification
@@ -42,28 +41,40 @@ workflow MSPEPID {
 
     def ch_versions = channel.empty()
 
-    // create channel for fasta input
-    ch_fasta = channel.fromPath(fasta, checkIfExists: true)
-        .map { fa -> [[id: fa.getBaseName()], fa] }
+    // Extract fasta from the samplesheet channel
+    ch_fasta = ch_samplesheet.map { meta, _spectrum_file, fasta_file -> [meta, fasta_file] }
+
+    // Deduplicate FASTAs by path: run PREPARE_DATABASES once per unique FASTA file.
+    // To later join the FASTAs back to the runs, meta gets a 'sample_ids' list of all run IDs
+    ch_fasta_dedup = ch_fasta
+        .map { meta, fasta -> [fasta.toString(), meta.id, fasta] }
+        .groupTuple(by: 0)
+        .map { _path_key, sample_ids, fastas ->
+            [[id: fastas[0].getBaseName(), sample_ids: sample_ids], fastas[0]]
+        }
 
     // prepare the databases: decoy generation and entrapment database creation
     PREPARE_DATABASES(
-        ch_fasta,
+        ch_fasta_dedup,
         entrapment_fold,
         skip_decoy_generation,
     )
-    ch_fasta_db = PREPARE_DATABASES.out.fasta
+    // create one [sample_id, db_fasta] for each sample_id in the original samplesheet
+    ch_fasta_db_per_run = PREPARE_DATABASES.out.fasta
+        .flatMap { db_meta, db_fasta ->
+            db_meta.sample_ids.collect { run_id -> [run_id, db_fasta] }
+        }
 
-    // prepare the spectra files
+    // prepare the spectra files (strip the fasta_file before passing to PREPARE_SPECTRA)
     PREPARE_SPECTRA(
-        ch_samplesheet
+        ch_samplesheet.map { meta, spectrum_file, _fasta_file -> [meta, spectrum_file] }
     )
     ch_prepared_spectra = PREPARE_SPECTRA.out.mzmls.join(PREPARE_SPECTRA.out.uncompressed, by: 0)
 
     // spectra identification
     SPECTRA_IDENTIFICATION(
-        ch_fasta_db,
         ch_prepared_spectra,
+        ch_fasta_db_per_run,
         precursor_tol_ppm,
         fragment_tol_da,
         run_comet,