diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab..f3cfa5a 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,3 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +ID,spectrum_file,fasta +1,/path/to/run1.raw,/path/to/proteins.fasta +2,/path/to/run2.mzML,/path/to/proteins.fasta diff --git a/assets/schema_input.json b/assets/schema_input.json index 9e5dbbc..1add03b 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -18,6 +18,13 @@ "exists": true, "pattern": "^\\S+\\.(raw|RAW|mzML|mzml|d|)(.tar.(gz|bz2)|.zip|.gz)?$", "errorMessage": "MS file name, cannot contain spaces and must have one of the extensions: raw | RAW | mzML | d" + }, + "fasta": { + "type": "string", + "format": "path", + "exists": true, + "pattern": "^\\S+\\.(fasta|fa|fas|faa)(\\.gz)?$", + "errorMessage": "FASTA file cannot contain spaces and must have one of the extensions: fasta | fa | fas | faa" } }, "required": ["ID", "spectrum_file"] diff --git a/main.nf b/main.nf index d684a1a..c20e2e8 100644 --- a/main.nf +++ b/main.nf @@ -30,7 +30,7 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_mspe workflow NFCORE_MSPEPID { take: - samplesheet // channel: samplesheet read in from --input + samplesheet // channel: [meta, spectrum_file, fasta_file] read in from --input main: @@ -40,7 +40,6 @@ workflow NFCORE_MSPEPID { MSPEPID( samplesheet, params.outdir, - params.fasta, params.entrapment_fold, params.skip_decoy_generation, params.precursor_tol_ppm, diff --git a/nextflow_schema.json b/nextflow_schema.json index 939caa2..0caba21 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -47,9 +47,9 @@ "fasta": { "type": "string", "fa_icon": "fas fa-file", - "pattern": ".(fasta|faa)$", - "description": "Input FASTA protein database", - 
"help_text": "Path to the protein database file. Whether it already contains decoys is set by the --skip_decoy_generation parameter." + "pattern": "^\\S+\\.(fasta|fa|fas|faa)(\\.gz)?$", + "description": "Global input FASTA protein database (mutually exclusive with per-run 'fasta' column in the samplesheet)", + "help_text": "Path to a single protein database file applied to all MS runs. Cannot be used together with the 'fasta' column in the samplesheet. Whether the file already contains decoys is controlled by --skip_decoy_generation." }, "entrapment_fold": { "type": "integer", diff --git a/subworkflows/local/spectra_identification/main.nf b/subworkflows/local/spectra_identification/main.nf index 0226947..22ea095 100644 --- a/subworkflows/local/spectra_identification/main.nf +++ b/subworkflows/local/spectra_identification/main.nf @@ -7,8 +7,8 @@ include { PSMUTILSCONVERSIONS } from '../../../modules/local/psmutilsconversions workflow SPECTRA_IDENTIFICATION { take: - ch_fasta - ch_spectra_files // val(meta), path(mzml), path(raw_spectra) + ch_spectra_files // val(meta), path(mzml), path(raw_spectra) + ch_fasta_db // channel: [sample_id, db_fasta] one item per sample precursor_tol_ppm fragment_tol_da run_comet @@ -21,13 +21,14 @@ workflow SPECTRA_IDENTIFICATION { main: ch_versions = channel.empty() - // TODO: this will become the identifications, probably with some meta mapping?
+ // this will contain the identifications, with some meta data ch_identifications = channel.empty() - // prepare the input channel for identifications - // TODO: this right now only adds the fasta - must be adapted for per sample DB - // TODO: also adapt for per-sample parameters - ch_ident_in = ch_spectra_files.combine(ch_fasta.map { _meta, fasta -> [fasta] }) + // join each spectrum file with its per-run database FASTA + ch_ident_in = ch_spectra_files + .map { meta, mzml, raw -> [meta.id, meta, mzml, raw] } + .join(ch_fasta_db, by: 0) + .map { _id, meta, mzml, raw, fasta -> [meta, mzml, raw, fasta] } // run Comet, if enabled if (run_comet) { @@ -71,15 +72,19 @@ workflow SPECTRA_IDENTIFICATION { ) ch_versions = ch_versions.mix(SAGECONFIG.out.versions_python) - ch_sage_spectra = ch_spectra_files.map { meta, mzml, _raw_spectra -> [meta, mzml] } - // add empty meta information for compatibility and convert to value channel - ch_sage_config = SAGECONFIG.out.config.map { config -> [["ID": "SAGE_CONFIG"], config] }.first() - // convert to value channel - ch_sage_fasta = ch_fasta.first() + ch_sage_config = SAGECONFIG.out.config.map { config -> [['ID': 'SAGE_CONFIG'], config] }.first() + + // Re-use ch_ident_in (already joined with per-run fasta) and split into + // the two separate channels SAGEBETA requires. 
+ ch_sage_joined = ch_ident_in + .multiMap { meta, mzml, _raw, fasta -> + spectra: [meta, mzml] + fasta: [[id: fasta.getBaseName()], fasta] + } SAGEBETA( - ch_sage_spectra, - ch_sage_fasta, + ch_sage_joined.spectra, + ch_sage_joined.fasta, ch_sage_config, ) ch_versions = ch_versions.mix(SAGEBETA.out.versions_sagebeta) diff --git a/subworkflows/local/utils_nfcore_mspepid_pipeline/main.nf b/subworkflows/local/utils_nfcore_mspepid_pipeline/main.nf index e3b0f5b..4ec8771 100644 --- a/subworkflows/local/utils_nfcore_mspepid_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_mspepid_pipeline/main.nf @@ -102,21 +102,36 @@ workflow PIPELINE_INITIALISATION { // Create channel from input file provided through params.input // + // Validate FASTA source mutual exclusivity before building channels + def raw_rows = samplesheetToList(input, "${projectDir}/assets/schema_input.json") + def sheet_has_fasta = raw_rows.any { row -> row.size() > 2 && row[2] } + def sheet_all_fasta = raw_rows.every { row -> row.size() > 2 && row[2] } + + if (params.fasta && sheet_has_fasta) { + error("Conflicting FASTA sources: '--fasta' parameter and 'fasta' column in the samplesheet are mutually exclusive. Please use only one.") + } + if (!params.fasta && !sheet_has_fasta) { + error("No FASTA provided. Please either use '--fasta' or add a 'fasta' column to every row in the samplesheet.") + } + if (!params.fasta && sheet_has_fasta && !sheet_all_fasta) { + error("Incomplete FASTA column: either provide one fasta for each run in the samplesheet, or use the global '--fasta' parameter.") + } + channel - .fromList(samplesheetToList(input, "${projectDir}/assets/schema_input.json")) + .fromList(raw_rows) .map { row -> - def meta = row[0] - def spectrum_file = row.size() > 1 ? 
row[1] : null - if (!spectrum_file) { - error("Please check input samplesheet -> Missing required column 'spectrum_file' for sample ID: ${meta.id}") - } - def sample_meta = meta + [sampleId: meta.id, id: "${meta.id}-${spectrum_file.simpleName}"] - return [sample_meta.id, sample_meta, spectrum_file] + def meta = row[0] + def spectrum_file = row[1] + // Resolve fasta: either from the global --fasta param or from the samplesheet column. + // Mutual exclusivity and completeness were already validated above. + def fasta_file = params.fasta + ? file(params.fasta, checkIfExists: true) + : row[2] + def sample_meta = meta + [sampleId: meta.id, id: "${meta.id}-${spectrum_file.simpleName}"] + return [sample_meta.id, sample_meta, spectrum_file, fasta_file] } .groupTuple() - .map { samplesheet -> - validateInputSamplesheet(samplesheet) - } + .map { validateInputSamplesheet(it) } .set { ch_samplesheet } emit: @@ -177,13 +192,13 @@ workflow PIPELINE_COMPLETION { // Validate channels from input samplesheet // def validateInputSamplesheet(input) { - def (metas, spectra_files) = input[1..2] + def (metas, spectra_files, fasta_files) = input[1..3] if (spectra_files.size() != 1) { error("Please check input samplesheet -> Expected exactly one 'spectrum_file' per sample row after grouping, got ${spectra_files.size()} for: ${metas[0].id}") } - return [metas[0], spectra_files[0]] + return [metas[0], spectra_files[0], fasta_files[0]] } // // Generate methods description for MultiQC diff --git a/workflows/mspepid.nf b/workflows/mspepid.nf index 5c82165..5e31a10 100644 --- a/workflows/mspepid.nf +++ b/workflows/mspepid.nf @@ -20,9 +20,8 @@ include { SPECTRA_RESCORING } from '../subworkflows/local/spectra_rescoring workflow MSPEPID { take: - ch_samplesheet // channel: samplesheet read in from --input + ch_samplesheet // channel: [meta, spectrum_file, fasta_file] read in from --input outdir - fasta // string: path to fasta file entrapment_fold // integer: fold for entrapment generation, 0 for 
none skip_decoy_generation // boolean: whether to skip decoy generation precursor_tol_ppm // integer: Precursor mass tolerance in ppm for spectra identification @@ -42,28 +41,40 @@ workflow MSPEPID { def ch_versions = channel.empty() - // create channel for fasta input - ch_fasta = channel.fromPath(fasta, checkIfExists: true) - .map { fa -> [[id: fa.getBaseName()], fa] } + // Extract fasta from the samplesheet channel + ch_fasta = ch_samplesheet.map { meta, _spectrum_file, fasta_file -> [meta, fasta_file] } + + // Deduplicate FASTAs by path: run PREPARE_DATABASES once per unique FASTA file. + // To later join the FASTAs back to the runs, meta gets a 'sample_ids' list of all run IDs + ch_fasta_dedup = ch_fasta + .map { meta, fasta -> [fasta.toString(), meta.id, fasta] } + .groupTuple(by: 0) + .map { _path_key, sample_ids, fastas -> + [[id: fastas[0].getBaseName(), sample_ids: sample_ids], fastas[0]] + } // prepare the databases: decoy generation and entrapment database creation PREPARE_DATABASES( - ch_fasta, + ch_fasta_dedup, entrapment_fold, skip_decoy_generation, ) - ch_fasta_db = PREPARE_DATABASES.out.fasta + // create one [sample_id, db_fasta] for each sample_id in the original samplesheet + ch_fasta_db_per_run = PREPARE_DATABASES.out.fasta + .flatMap { db_meta, db_fasta -> + db_meta.sample_ids.collect { run_id -> [run_id, db_fasta] } + } - // prepare the spectra files + // prepare the spectra files (strip the fasta_file before passing to PREPARE_SPECTRA) PREPARE_SPECTRA( - ch_samplesheet + ch_samplesheet.map { meta, spectrum_file, _fasta_file -> [meta, spectrum_file] } ) ch_prepared_spectra = PREPARE_SPECTRA.out.mzmls.join(PREPARE_SPECTRA.out.uncompressed, by: 0) // spectra identification SPECTRA_IDENTIFICATION( - ch_fasta_db, ch_prepared_spectra, + ch_fasta_db_per_run, precursor_tol_ppm, fragment_tol_da, run_comet,