Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions assets/samplesheet.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
sample,fastq_1,fastq_2
SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz
SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,
ID,spectrum_file,fasta
1,/path/to/run1.raw,/path/to/proteins.fasta
2,/path/to/run2.mzML,/path/to/proteins.fasta
7 changes: 7 additions & 0 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@
"exists": true,
"pattern": "^\\S+\\.(raw|RAW|mzML|mzml|d|)(.tar.(gz|bz2)|.zip|.gz)?$",
"errorMessage": "MS file name, cannot contain spaces and must have one of the extensions: raw | RAW | mzML | d"
},
"fasta": {
"type": "string",
"format": "path",
"exists": true,
"pattern": "^\\S+\\.(fasta|fa|fas|faa)(\\.gz)?$",
"errorMessage": "FASTA file cannot contain spaces and must have one of the extensions: fasta | fa | fas | faa"
}
},
"required": ["ID", "spectrum_file"]
Expand Down
3 changes: 1 addition & 2 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_mspe
workflow NFCORE_MSPEPID {

take:
samplesheet // channel: samplesheet read in from --input
samplesheet // channel: [meta, spectrum_file, fasta_file] read in from --input

main:

Expand All @@ -40,7 +40,6 @@ workflow NFCORE_MSPEPID {
MSPEPID(
samplesheet,
params.outdir,
params.fasta,
params.entrapment_fold,
params.skip_decoy_generation,
params.precursor_tol_ppm,
Expand Down
6 changes: 3 additions & 3 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@
"fasta": {
"type": "string",
"fa_icon": "fas fa-file",
"pattern": ".(fasta|faa)$",
"description": "Input FASTA protein database",
"help_text": "Path to the protein database file. Whether it already contains decoys is set by the --skip_decoy_generation parameter."
"pattern": ".(fasta|fa|fas|faa)(.gz)?$",
Comment thread
julianu marked this conversation as resolved.
Outdated
"description": "Global input FASTA protein database (mutually exclusive with per-run 'fasta' column in the samplesheet)",
"help_text": "Path to a single protein database file applied to all MS runs. Cannot be used together with the 'fasta' column in the samplesheet. Whether the file already contains decoys is controlled by --skip_decoy_generation."
},
"entrapment_fold": {
"type": "integer",
Expand Down
33 changes: 19 additions & 14 deletions subworkflows/local/spectra_identification/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ include { PSMUTILSCONVERSIONS } from '../../../modules/local/psmutilsconversions

workflow SPECTRA_IDENTIFICATION {
take:
ch_fasta
ch_spectra_files // val(meta), path(mzml), path(raw_spectra)
ch_spectra_files // val(meta), path(mzml), path(raw_spectra)
ch_fasta_db // channel: [sample_id, db_fasta] one item per sample
precursor_tol_ppm
fragment_tol_da
run_comet
Expand All @@ -21,13 +21,14 @@ workflow SPECTRA_IDENTIFICATION {
main:
ch_versions = channel.empty()

// TODO: this will become the identifications, probably with some meta mapping?
// this will contain the identifications, with some meta data
ch_identifications = channel.empty()

// prepare the input channel for identifications
// TODO: this right now only adds the fasta - must be adapted for per sample DB
// TODO: also adapt for per-sample parameters
ch_ident_in = ch_spectra_files.combine(ch_fasta.map { _meta, fasta -> [fasta] })
// join each spectrum file with its per-run database FASTA
ch_ident_in = ch_spectra_files
.map { meta, mzml, raw -> [meta.id, meta, mzml, raw] }
.join(ch_fasta_db, by: 0)
.map { _id, meta, mzml, raw, fasta -> [meta, mzml, raw, fasta] }

// run Comet, if enabled
if (run_comet) {
Expand Down Expand Up @@ -71,15 +72,19 @@ workflow SPECTRA_IDENTIFICATION {
)
ch_versions = ch_versions.mix(SAGECONFIG.out.versions_python)

ch_sage_spectra = ch_spectra_files.map { meta, mzml, _raw_spectra -> [meta, mzml] }
// add empty meta information for compatibility and convert to value channel
ch_sage_config = SAGECONFIG.out.config.map { config -> [["ID": "SAGE_CONFIG"], config] }.first()
// convert to value channel
ch_sage_fasta = ch_fasta.first()
ch_sage_config = SAGECONFIG.out.config.map { config -> [['ID': 'SAGE_CONFIG'], config] }.first()

// Re-use ch_ident_in (already joined with per-run fasta) and split into
// the two separate channels SAGEBETA requires.
ch_sage_joined = ch_ident_in
.multiMap { meta, mzml, _raw, fasta ->
spectra: [meta, mzml]
fasta: [[id: fasta.getBaseName()], fasta]
}

SAGEBETA(
ch_sage_spectra,
ch_sage_fasta,
ch_sage_joined.spectra,
ch_sage_joined.fasta,
ch_sage_config,
)
ch_versions = ch_versions.mix(SAGEBETA.out.versions_sagebeta)
Expand Down
41 changes: 28 additions & 13 deletions subworkflows/local/utils_nfcore_mspepid_pipeline/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -102,21 +102,36 @@ workflow PIPELINE_INITIALISATION {
// Create channel from input file provided through params.input
//

// Validate FASTA source mutual exclusivity before building channels
def raw_rows = samplesheetToList(input, "${projectDir}/assets/schema_input.json")
def sheet_has_fasta = raw_rows.any { row -> row.size() > 2 && row[2] }
def sheet_all_fasta = raw_rows.every { row -> row.size() > 2 && row[2] }

if (params.fasta && sheet_has_fasta) {
error("Conflicting FASTA sources: '--fasta' parameter and 'fasta' column in the samplesheet are mutually exclusive. Please use only one.")
}
Comment thread
julianu marked this conversation as resolved.
if (!params.fasta && !sheet_has_fasta) {
error("No FASTA provided. Please either use '--fasta' or add a 'fasta' column to every row in the samplesheet.")
}
if (!params.fasta && sheet_has_fasta && !sheet_all_fasta) {
error("Incomplete FASTA column: either provide one fasta for each run in the samplesheet, or use the global '--fasta' parameter.")
}

channel
.fromList(samplesheetToList(input, "${projectDir}/assets/schema_input.json"))
.fromList(raw_rows)
.map { row ->
def meta = row[0]
def spectrum_file = row.size() > 1 ? row[1] : null
if (!spectrum_file) {
error("Please check input samplesheet -> Missing required column 'spectrum_file' for sample ID: ${meta.id}")
}
def sample_meta = meta + [sampleId: meta.id, id: "${meta.id}-${spectrum_file.simpleName}"]
return [sample_meta.id, sample_meta, spectrum_file]
def meta = row[0]
def spectrum_file = row[1]
Comment thread
julianu marked this conversation as resolved.
// Resolve fasta: either from the global --fasta param or from the samplesheet column.
// Mutual exclusivity and completeness were already validated above.
def fasta_file = params.fasta
? file(params.fasta, checkIfExists: true)
: row[2]
def sample_meta = meta + [sampleId: meta.id, id: "${meta.id}-${spectrum_file.simpleName}"]
return [sample_meta.id, sample_meta, spectrum_file, fasta_file]
}
.groupTuple()
.map { samplesheet ->
validateInputSamplesheet(samplesheet)
}
.map { validateInputSamplesheet(it) }
.set { ch_samplesheet }

emit:
Expand Down Expand Up @@ -177,13 +192,13 @@ workflow PIPELINE_COMPLETION {
// Validate channels from input samplesheet
//
def validateInputSamplesheet(input) {
def (metas, spectra_files) = input[1..2]
def (metas, spectra_files, fasta_files) = input[1..3]

if (spectra_files.size() != 1) {
error("Please check input samplesheet -> Expected exactly one 'spectrum_file' per sample row after grouping, got ${spectra_files.size()} for: ${metas[0].id}")
}

return [metas[0], spectra_files[0]]
return [metas[0], spectra_files[0], fasta_files[0]]
}
//
// Generate methods description for MultiQC
Expand Down
31 changes: 21 additions & 10 deletions workflows/mspepid.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@ include { SPECTRA_RESCORING } from '../subworkflows/local/spectra_rescoring

workflow MSPEPID {
take:
ch_samplesheet // channel: samplesheet read in from --input
ch_samplesheet // channel: [meta, spectrum_file, fasta_file] read in from --input
outdir
fasta // string: path to fasta file
entrapment_fold // integer: fold for entrapment generation, 0 for none
skip_decoy_generation // boolean: whether to skip decoy generation
precursor_tol_ppm // integer: Precursor mass tolerance in ppm for spectra identification
Expand All @@ -42,28 +41,40 @@ workflow MSPEPID {

def ch_versions = channel.empty()

// create channel for fasta input
ch_fasta = channel.fromPath(fasta, checkIfExists: true)
.map { fa -> [[id: fa.getBaseName()], fa] }
// Extract fasta from the samplesheet channel
ch_fasta = ch_samplesheet.map { meta, _spectrum_file, fasta_file -> [meta, fasta_file] }

// Deduplicate FASTAs by path: run PREPARE_DATABASES once per unique FASTA file.
// To later join the FASTAs back to the runs, meta gets a 'sample_ids' list of all run IDs
Comment thread
julianu marked this conversation as resolved.
Outdated
ch_fasta_dedup = ch_fasta
.map { meta, fasta -> [fasta.toString(), meta.id, fasta] }
.groupTuple(by: 0)
.map { _path_key, sample_ids, fastas ->
[[id: fastas[0].getBaseName(), sample_ids: sample_ids], fastas[0]]
}

// prepare the databases: decoy generation and entrapment database creation
PREPARE_DATABASES(
ch_fasta,
ch_fasta_dedup,
entrapment_fold,
skip_decoy_generation,
)
ch_fasta_db = PREPARE_DATABASES.out.fasta
// create one [sample_id, db_fasta] for each sample_id in the original samplesheet
ch_fasta_db_per_run = PREPARE_DATABASES.out.fasta
.flatMap { db_meta, db_fasta ->
db_meta.sample_ids.collect { run_id -> [run_id, db_fasta] }
}

// prepare the spectra files
// prepare the spectra files (strip the fasta_file before passing to PREPARE_SPECTRA)
PREPARE_SPECTRA(
ch_samplesheet
ch_samplesheet.map { meta, spectrum_file, _fasta_file -> [meta, spectrum_file] }
)
ch_prepared_spectra = PREPARE_SPECTRA.out.mzmls.join(PREPARE_SPECTRA.out.uncompressed, by: 0)

// spectra identification
SPECTRA_IDENTIFICATION(
ch_fasta_db,
ch_prepared_spectra,
ch_fasta_db_per_run,
precursor_tol_ppm,
fragment_tol_da,
run_comet,
Expand Down
Loading