<!-- PARANOiD_galaxy.xml -->

<tool id="PARANOiD" name="PARANOiD" version="1.0">
	<!-- Short summary of the tool. -->
	<description>Processes and analyses reads generated by iCLIP experiments</description>
	<!-- Command that reports the version of the workflow script shipped in the tool directory. -->
	<version_command>nextflow ${__tool_directory__}/main.nf --version</version_command>
	<!-- Pinned packages the tool declares as requirements. -->
	<requirements>
		<requirement type="package" version="23.04.3">nextflow</requirement>
		<requirement type="package" version="3.7.1">singularity</requirement>
	</requirements>

	<!-- Environment variables set for the job; both values are paths below $__root_dir__. -->
	<!-- NOTE(review): the cache path uses "docker-container" while the command passes an image
	     directory named "docker-containers"; confirm which spelling is intended. -->
	<environment_variables>
		<environment_variable name="NXF_HOME">$__root_dir__/.nextflow</environment_variable>
		<environment_variable name="NXF_SINGULARITY_CACHEDIR">$__root_dir__/docker-container/PARANOiD</environment_variable>
	</environment_variables>

	<command><![CDATA[
		## Runs the PARANOiD Nextflow workflow. All results are written to ./output in the
		## job working directory; the <outputs> section collects them from there.
		## Dataset paths and free-text values are single-quoted for the shell.
		nextflow ${__tool_directory__}/main.nf
		--reads #for $file in $file_input.reads# '$file' #end for#
		--reference '$file_input.reference'
		--barcodes '$file_input.barcodes'
			#if $file_input.annotation
				--annotation '$file_input.annotation'
			#end if
		--output output

			## Transcript mode: the boolean itself renders as --map_to_transcripts.
			#if $mapping_parameter.condition_transcript_analysis.transcript_reference
				--domain pro
				$mapping_parameter.condition_transcript_analysis.transcript_reference
				--number_top_transcripts $mapping_parameter.condition_transcript_analysis.number_of_top_transcripts
			#else
				--domain $mapping_parameter.condition_transcript_analysis.domain
			#end if

		--mapq $mapping_parameter.mapq_score

			## The section is named "mapping_parameter" and max_alignments lives inside
			## the condition_max_alignments conditional.
			#if $mapping_parameter.condition_max_alignments.report_all_alignments
				--report_all_alignments
			#else
				--max_alignments $mapping_parameter.condition_max_alignments.max_alignments
			#end if

		--percentile $general_analysis_parameter.percentile
		--color_barplot '$general_analysis_parameter.color_barplot'

			#if $merge_replicates.condition_merge_replicates.merge
				--merge_replicates
				#if $merge_replicates.condition_merge_replicates.correlation_analysis
					--correlation_analysis
				#end if
			#end if

			#if $peak_calling_analysis.condition_peak_calling.peak_calling
				#if $peak_calling_analysis.condition_peak_calling.high_peak_coverage
					$peak_calling_analysis.condition_peak_calling.high_peak_coverage
				#end if
				## Emit the region flag itself (renders as --peak_calling_regions) together
				## with its width; the width alone does not switch region reporting on.
				#if $peak_calling_analysis.condition_peak_calling.condition_peak_regions.peak_region
					$peak_calling_analysis.condition_peak_calling.condition_peak_regions.peak_region
					--peak_calling_regions_width $peak_calling_analysis.condition_peak_calling.condition_peak_regions.peak_calling_region_width
				#end if
			#else
				--omit_peak_calling
			#end if

			#if $peak_distance_analysis.condition_peak_distance.peak_distance
				--distance $peak_distance_analysis.condition_peak_distance.max_peak_distance
			#else
				--omit_peak_distance
			#end if

			#if $motif_analysis.condition_sequence_extraction.sequence_extraction
				--seq_len $motif_analysis.condition_sequence_extraction.extraction_length
				## Only pass the flag when the user ticked the corresponding checkbox.
				#if $motif_analysis.condition_sequence_extraction.omit_cl_nucleotide
					--omit_cl_nucleotide
				#end if
				--max_motif_num $motif_analysis.condition_sequence_extraction.max_motifs
				--min_motif_width $motif_analysis.condition_sequence_extraction.min_motif_width
				--max_motif_width $motif_analysis.condition_sequence_extraction.max_motif_width
			#else
				--omit_sequence_extraction
			#end if

			## RNA subtype analysis only runs when an annotation was supplied.
			#if $file_input.annotation
				--rna_subtypes '$rna_subtype_analysis.rna_subtypes'
				--gene_id '$rna_subtype_analysis.gene_id'
			#end if

			--min_length $read_parameter.min_read_length
			--min_qual $read_parameter.min_base_quality
			--min_percent_qual_filter $read_parameter.min_quality_percent
			--barcode_pattern '$read_parameter.barcode_pattern'
			--barcode_mismatches $read_parameter.max_barcode_mismatches

		--imgDir $__root_dir__/docker-containers

		-profile singularity,sge
		-c ${__tool_directory__}/nextflow.config &&
		## Build the archive in the working directory and move it onto the output dataset,
		## so no extra file or symlink is left beside the dataset.
		zip -r collected_output.zip output &&
		mv collected_output.zip '$output_collected'
	]]></command>

	<inputs>
		<!-- Input datasets: reads, reference and barcodes are mandatory, the annotation is optional. -->
		<section name="file_input" title="Files" expanded="true">
			<param
				name="reads"
				argument="--reads"
				type="data"
				format="fastqsanger"
				multiple="true"
				label="Input reads"
				help="Reads generated by an iCLIP experiment. To be provided as FASTQ file." />
			<param
				name="reference"
				argument="--reference"
				type="data"
				format="fasta"
				label="Reference file"
				help="Reference sequence(s) on which cross-link sites will be detected." />
			<param
				name="annotation"
				argument="--annotation"
				type="data"
				format="gff,gff3,gtf"
				optional="true"
				label="Annotation file"
				help="Annotation file. Contains region information about the reference. To be provided as GFF, GFF3 or GTF file." />
			<param
				name="barcodes"
				argument="--barcodes"
				type="data"
				format="tabular"
				label="Barcode file"
				help="File containing experimental barcode sequences and the appendant experiment names. To be provided as TSV file." />
		</section>
		<!-- Settings used by several analyses: the peak percentile cut-off and the barplot colour. -->
		<section name="general_analysis_parameter" title="General analyses parameter">
			<param name="percentile" 	type="integer" 	argument="--percentile" 	min="0" max="100" 	value="90" 		label="Percentiles" 		help="Percentage of peaks that will be filtered out before peak distance analysis and motif detection. 90 means that only the top 10% of peaks will be used."/>
			<param name="color_barplot" type="color" 	argument="--color_barplot" 						value="#69b3a2" label="Color of barplots" 	help="Color for all barplots generated. To be provided as color hex code. If not sure which one to use one can use various websites to determine the correct hex code (for example: https://www.color-hex.com/)." >
				<!-- Allow '#' in addition to ASCII letters and digits so hex colour codes survive sanitising. -->
				<sanitizer>
					<valid initial="string.ascii_letters,string.digits">
						<add value="#" />
					</valid>
				</sanitizer>
			</param>
		</section>
		<!-- Replicate merging; the correlation option is only offered once merging is switched on. -->
		<section name="merge_replicates" title="Replicate merging">
			<conditional name="condition_merge_replicates">
				<param
					name="merge"
					argument="--merge_replicates"
					type="boolean"
					checked="false"
					label="Merge replicates"
					help="Merges replicates into a representative form." />
				<when value="true">
					<param
						name="correlation_analysis"
						argument="--correlation_analysis"
						type="boolean"
						checked="false"
						label="Correlation analysis"
						help="Performs correlation analysis to see the similarity of replicates. Can cause problems with large reference genomes due to high memory usage" />
				</when>
			</conditional>
		</section>
		<!-- Peak calling; the high-coverage and binding-region options appear only when it is enabled. -->
		<section name="peak_calling_analysis" title="Peak calling">
			<conditional name="condition_peak_calling">
				<param
					name="peak_calling"
					argument="--omit_peak_calling"
					type="boolean"
					checked="false"
					truevalue="true"
					label="Peak calling"
					help="Activates peak calling. This involves the filtering of background noise and optionally the summarization of peaks in close proximity into binding regions. Uses PureCLIP." />
				<when value="true">
					<!-- Renders as the command-line flag itself when checked. -->
					<param
						name="high_peak_coverage"
						argument="--peak_calling_for_high_coverage"
						type="boolean"
						checked="false"
						truevalue="--peak_calling_for_high_coverage"
						label="High peak coverage"
						help="Should be used when a protein with a high general coverage was used. These datasets can cause errors when running PureCLIP and which can be avoided by choosing this option." />
					<conditional name="condition_peak_regions">
						<param
							name="peak_region"
							argument="--peak_calling_regions"
							type="boolean"
							checked="false"
							truevalue="--peak_calling_regions"
							label="Report binding regions"
							help="Summarizes peaks in close proximity into binding regions." />
						<!-- The width is only asked for when binding regions are reported. -->
						<when value="--peak_calling_regions">
							<param
								name="peak_calling_region_width"
								argument="--peak_calling_regions_width"
								type="integer"
								value="8"
								label="Binding region width"
								help="Maximum distance until which peaks can still be summarized into a single binding region." />
						</when>
					</conditional>
				</when>
			</conditional>
		</section>
		<!-- Peak distance analysis. The command template runs the analysis when this boolean is
		     true and passes the omit flag when it is false, so the label describes activation. -->
		<section name="peak_distance_analysis" title="Peak distance">
			<conditional name="condition_peak_distance">
				<param name="peak_distance" type="boolean" argument="--omit_peak_distance" checked="false" label="Peak distance analysis" help="Activates peak distance analysis. If deactivated, the analysis is omitted." />
				<when value="true">
					<param name="max_peak_distance" type="integer" argument="--distance" min="1" value="30" label="Maximum peak distance" />
				</when>
			</conditional>
		</section>
		<!-- Sequence extraction and motif search; the detailed settings show up only when enabled. -->
		<section name="motif_analysis" title="Sequence extraction and motif analysis">
			<conditional name="condition_sequence_extraction">
				<param
					name="sequence_extraction"
					argument="--omit_sequence_extraction"
					type="boolean"
					checked="false"
					label="Sequence extraction and motif analysis" />
				<when value="true">
					<param
						name="extraction_length"
						argument="--seq_len"
						type="integer"
						min="1"
						value="20"
						label="Sequence extraction length (down AND upstream)" />
					<param
						name="omit_cl_nucleotide"
						argument="--omit_cl_nucleotide"
						type="boolean"
						checked="false"
						label="Omit cl site nucleotide" />
					<param
						name="max_motifs"
						argument="--max_motif_num"
						type="integer"
						min="1"
						value="50"
						label="Max number of motifs" />
					<!-- Both motif width bounds are limited to the range 3 to 30. -->
					<param
						name="min_motif_width"
						argument="--min_motif_width"
						type="integer"
						min="3"
						max="30"
						value="8"
						label="Min motif length" />
					<param
						name="max_motif_width"
						argument="--max_motif_width"
						type="integer"
						min="3"
						max="30"
						value="15"
						label="Max motif length" />
				</when>
			</conditional>
		</section>
		<!-- Free-text settings for the RNA subtype analysis; the command template only passes
		     them when an annotation file was provided. -->
		<section name="rna_subtype_analysis" title="RNA subtype analysis">
			<param
				name="rna_subtypes"
				argument="--rna_subtypes"
				type="text"
				value="3_prime_UTR,transcript,5_prime_UTR"
				label="RNA subtypes" />
			<param
				name="gene_id"
				argument="--gene_id"
				type="text"
				value="ID"
				label="Gene ID" />
		</section>
		<!-- Alignment settings: transcript mode versus domain choice, minimum mapping quality,
		     and how many alignments are reported per read. -->
		<section name="mapping_parameter" title="Alignment parameters">
			<conditional name="condition_transcript_analysis">
				<!-- Renders as the flag itself when checked and as the string "false" otherwise. -->
				<param name="transcript_reference" type="boolean" argument="--map_to_transcripts" checked="false" truevalue="--map_to_transcripts" falsevalue="false" label="Choose if provided reference consists of transcripts" />
				<when value="--map_to_transcripts">
					<param name="number_of_top_transcripts" type="integer" argument="--number_top_transcripts" min="1" value="10" label="Number of transcripts with most hits per experiment" />
				</when>
				<when value="false" >
					<param name="domain" type="select" argument="--domain" label="Domain">
						<option value="pro" selected="true"	>Prokaryote</option>
						<option value="eu"					>Eukaryote</option>
					</param>
				</when>
			</conditional>
			
			<param name="mapq_score" type="integer" argument="--mapq" value="2" label="Alignment quality" />

			<conditional name="condition_max_alignments">
				<param name="report_all_alignments" type="boolean" argument="--report_all_alignments" checked="false"  truevalue="--report_all_alignments" falsevalue="false" label="Report all alignments" />
				<!-- No further inputs are needed when all alignments are reported. -->
				<when value="--report_all_alignments" />
				<when value="false">
					<param name="max_alignments" type="integer" argument="--max_alignments" value="1" label="Maximum alignments provided" />
				</when>
			</conditional>
			
		</section>
		<!-- Read preprocessing thresholds and barcode handling. -->
		<section name="read_parameter" title="Read processing">
			<param name="min_read_length"			type="integer" argument="--min_length" 				value="30" 					label="Minimum length allowed after adapter removal" />
			<param name="min_base_quality"			type="integer" argument="--min_qual" 				value="20" 					label="Minimum nucleotide quality" />
			<param name="min_quality_percent"		type="integer" argument="--min_percent_qual_filter" value="90" 					label="Percent of nucleotides above quality threshold to keep read" />
			<!-- Free text uses the Galaxy parameter type "text"; "string" is not a parameter type. -->
			<param name="barcode_pattern"			type="text"    argument="--barcode_pattern" 		value="NNNNNXXXXXXNNNN" 	label="Pattern used for barcodes. N represents random and X experimental barcodes" />
			<param name="max_barcode_mismatches"	type="integer" argument="--barcode_mismatches" 		value="1" 					label="Number of mismatches in the experimental barcode allowed to still assign a read to its experiment" />
		</section>
	</inputs>
	<!-- Outputs are collected from the "output" directory the command writes into the job
	     working directory. Collections whose analysis is optional carry a filter on the
	     corresponding input so they only appear when that analysis was requested. -->
	<outputs>
		<!-- NOTE(review): no ext is set or captured here, so element datatypes depend on the
		     Galaxy default; confirm BAM and index files get the intended datatype. -->
		<collection name="output_alignment_files" type="list:list" label="Alignment files">
			<discover_datasets pattern="(?P&lt;identifier_0&gt;[^\s]+)\.sorted\.(?P&lt;identifier_1&gt;[^\s]+)" directory="output/alignments" visible="false" />
		</collection>
		<!-- Cross-link site tracks: identifier_0 is the sample, identifier_1 the part after the last underscore. -->
		<collection name="output_cross_link_sites_bedgraph" type="list:paired" label="Cross link sites (bedgraph)">
			<discover_datasets pattern="(?P&lt;identifier_0&gt;[^\s]+)_(?P&lt;identifier_1&gt;[^_]+)\.bedgraph" directory="output/cross-link-sites/bedgraph" ext="bedgraph" visible="false" />
		</collection>
		<!-- Galaxy registers the bigWig datatype under the lowercase extension "bigwig". -->
		<collection name="output_cross_link_sites_bigWig" type="list:paired" label="Cross link sites (bigWig)">
			<discover_datasets pattern="(?P&lt;identifier_0&gt;[^\s]+)_(?P&lt;identifier_1&gt;[^_]+)\.bw" directory="output/cross-link-sites/bigWig" ext="bigwig" visible="false" />
		</collection>
		<collection name="output_cross_link_sites_wig" type="list:paired" label="Cross link sites (wig)">
			<discover_datasets pattern="(?P&lt;identifier_0&gt;[^\s]+)_(?P&lt;identifier_1&gt;[^_]+)\.wig" directory="output/cross-link-sites/wig" ext="wig" visible="false" />
		</collection>
		<!-- Merged tracks and the correlation overview exist only when replicate merging was enabled. -->
		<collection name="output_cross_link_sites_bedgraph_merged" type="list:paired" label="Merged cross link sites (bedgraph)">
			<discover_datasets pattern="(?P&lt;identifier_0&gt;[^\s]+)_(?P&lt;identifier_1&gt;[^_]+)\.bedgraph" directory="output/cross-link-sites-merged/bedgraph" ext="bedgraph" visible="false" />
			<filter>merge_replicates['condition_merge_replicates']['merge']</filter>
		</collection>
		<collection name="output_cross_link_sites_bigWig_merged" type="list:paired" label="Merged cross link sites (bigWig)">
			<discover_datasets pattern="(?P&lt;identifier_0&gt;[^\s]+)_(?P&lt;identifier_1&gt;[^_]+)\.bw" directory="output/cross-link-sites-merged/bigWig" ext="bigwig" visible="false" />
			<filter>merge_replicates['condition_merge_replicates']['merge']</filter>
		</collection>
		<collection name="output_cross_link_sites_wig_merged" type="list:paired" label="Merged cross link sites (wig)">
			<discover_datasets pattern="(?P&lt;identifier_0&gt;[^\s]+)_(?P&lt;identifier_1&gt;[^_]+)\.wig" directory="output/cross-link-sites-merged/wig" ext="wig" visible="false" />
			<filter>merge_replicates['condition_merge_replicates']['merge']</filter>
		</collection>
		<!-- The dot after "_correlation" is escaped so it matches a literal dot only. -->
		<collection name="output_correlation" type="list:list" label="Correlation overview">
			<discover_datasets pattern="(?P&lt;identifier_0&gt;[^\s]+)_correlation\.(?P&lt;identifier_1&gt;[^\s]+)" directory="output/correlation_of_replicates" visible="false" />
			<filter>merge_replicates['condition_merge_replicates']['merge'] and merge_replicates['condition_merge_replicates']['correlation_analysis']</filter>
		</collection>
		<collection name="output_strand_distribution" type="list" label="Strand distribution" >
			<discover_datasets pattern="__name_and_ext__" directory="output/strand-distribution" ext="txt" visible="false" />
		</collection>
		<collection name="output_strand_distribution_vis" type="list" label="Visualization of strand distribution" >
			<discover_datasets pattern="__name_and_ext__" directory="output/strand-distribution/visualization" ext="png" visible="false" />
		</collection>
		<collection name="output_statistics_multiqc_all" type="list" label="all multiqc statistics" >
			<discover_datasets pattern="__name_and_ext__" directory="output/statistics/multiqc_data" visible="false" />
		</collection>
		<!-- A fixed file in the working directory is attached to a data output via from_work_dir. -->
		<data name="output_multiqc_overview" format="html" from_work_dir="output/statistics/multiqc_report.html" label="Multiqc overview" />
		<!-- Zip archive of the whole output directory, written by the command itself. -->
		<data name="output_collected" format="zip" label="Collected output of PARANOiD" />
		<!-- Transcript outputs exist only in transcript mode. The patterns carry a "designation"
		     group so each discovered file gets a name, and are made visible because the primary
		     dataset itself is not written by the command. "fasta" replaces "fna", which is
		     presumably not a registered Galaxy extension; confirm against the target instance. -->
		<data name="top_transcript_sequences" format="fasta" label="Top transcript sequences">
			<discover_datasets pattern="(?P&lt;designation&gt;[^\s]+_transcripts)\.fna" directory="output" ext="fasta" visible="true" />
			<filter>mapping_parameter['condition_transcript_analysis']['transcript_reference']</filter>
		</data>
		<data name="top_transcript_overview" format="txt" label="Top transcript overview">
			<discover_datasets pattern="(?P&lt;designation&gt;transcript-targets-top[\d]+)\.txt" directory="output/transcripts/overview-hits" ext="txt" visible="true" />
			<filter>mapping_parameter['condition_transcript_analysis']['transcript_reference']</filter>
		</data>
		<collection name="top_transcript_for_samples" type="list" label="Top transcripts for each sample" >
			<discover_datasets pattern="__name_and_ext__" directory="output/transcripts/overview-hits" ext="tsv" visible="false" />
			<filter>mapping_parameter['condition_transcript_analysis']['transcript_reference']</filter>
		</collection>
		<collection name="output_extracted_sequences" type="list" label="Sequences extracted around cross-link sites" >
			<discover_datasets pattern="__name_and_ext__" directory="output/extracted_sequences/" visible="false" />
			<filter>motif_analysis['condition_sequence_extraction']['sequence_extraction']</filter>
		</collection>
		<!-- NOTE(review): "__name_and_ext__" is presumably only expanded when it is the whole
		     pattern, and matching files inside sub-directories normally needs recursion plus
		     relative-path matching. This pattern likely discovers nothing; verify against the
		     real layout of output/motif_search before changing it. -->
		<collection name="output_motif_analysis" type="list" label="Motifs generated via Streme" >
			<discover_datasets pattern="(?P&lt;identifier_0&gt;[^\s]+)\.pureCLIP_crosslink_sites\.extracted-sequences_motif/__name_and_ext__" directory="output/motif_search/" visible="false" />
			<filter>motif_analysis['condition_sequence_extraction']['sequence_extraction']</filter>
		</collection>
		<collection name="output_peak_calling" type="list" label="Peak calling via PureCLIP" >
			<discover_datasets pattern="__name_and_ext__" directory="output/peak_calling" ext="bed" visible="false" />
			<filter>peak_calling_analysis['condition_peak_calling']['peak_calling']</filter>
		</collection>
		<collection name="output_peak_distance" type="list:list" label="Peak distance" >
			<discover_datasets pattern="(?P&lt;identifier_0&gt;[^\.]+)\.(?P&lt;identifier_1&gt;[^\s]+)" directory="output/peak_distance" visible="false" />
			<filter>peak_distance_analysis['condition_peak_distance']['peak_distance']</filter>
		</collection>
		<collection name="output_peak_height_dist" type="list" label="Peak height distribution" >
			<discover_datasets pattern="__name_and_ext__" directory="output/peak_height_distribution" visible="false" />
		</collection>
		<!-- Only produced when an annotation file was supplied. -->
		<collection name="output_RNA_distribution" type="list:list" label="RNA subtype distribution" >
			<discover_datasets pattern="(?P&lt;identifier_0&gt;[^\.]+)\.subtypes_distribution\.(?P&lt;identifier_1&gt;[^\.]+)" directory="output/RNA_subtypes" visible="false" />
			<filter>file_input['annotation']</filter>
		</collection>
	</outputs>
	<!-- Tool tests. Parameters are nested exactly like the sections and conditionals of the
	     inputs, and nested collections are asserted with an outer element per sample and inner
	     elements per discovered identifier. -->
	<tests>
		<test> <!-- Test minimal behaviour -->
			<section name="file_input">
				<param name="reads" value="PARANOiD-test-reads.fastq" />
				<param name="reference" value="PARANOiD-test-reference.fasta" />
				<param name="barcodes" value="PARANOiD-test-barcodes.tsv" />
			</section>
			<section name="merge_replicates">
				<conditional name="condition_merge_replicates">
					<param name="merge" value="false" />
				</conditional>
			</section>
			<section name="peak_calling_analysis">
				<conditional name="condition_peak_calling">
					<param name="peak_calling" value="false" />
				</conditional>
			</section>
			<section name="peak_distance_analysis">
				<conditional name="condition_peak_distance">
					<param name="peak_distance" value="false" />
				</conditional>
			</section>
			<section name="motif_analysis">
				<conditional name="condition_sequence_extraction">
					<param name="sequence_extraction" value="false" />
				</conditional>
			</section>
			<!-- Binary files are compared by size; a textual "contains" comparison does not suit them. -->
			<output_collection name="output_alignment_files" type="list:list">
				<element name="exp1_rep_3">
					<element name="bam" file="exp1_rep_3.sorted.bam" compare="sim_size" />
					<element name="bam.bai" file="exp1_rep_3.sorted.bam.bai" compare="sim_size" />
				</element>
			</output_collection>
			<output_collection name="output_cross_link_sites_wig" type="list:paired">
				<element name="exp1_rep_3">
					<element name="forward" file="exp1_rep_3_forward.wig" ftype="wig" compare="contains" />
					<element name="reverse" file="exp1_rep_3_reverse.wig" ftype="wig" compare="contains" />
				</element>
			</output_collection>
			<!-- bigWig and bedgraph outputs cannot be compared against the wig reference files,
			     so only the presence of both strands is asserted here.
			     NOTE(review): add dedicated reference files to test-data for a content check. -->
			<output_collection name="output_cross_link_sites_bigWig" type="list:paired">
				<element name="exp1_rep_3">
					<element name="forward" />
					<element name="reverse" />
				</element>
			</output_collection>
			<output_collection name="output_cross_link_sites_bedgraph" type="list:paired">
				<element name="exp1_rep_3">
					<element name="forward" ftype="bedgraph" />
					<element name="reverse" ftype="bedgraph" />
				</element>
			</output_collection>
			<output_collection name="output_strand_distribution" type="list">
				<element name="exp1_rep_3.strand_proportion" file="exp1_rep_3.strand_proportion.txt" ftype="txt" compare="contains" />
			</output_collection>
			<output_collection name="output_peak_height_dist" type="list">
				<element name="exp1_rep_3" file="exp1_rep_3.png" ftype="png" compare="sim_size" />
			</output_collection>
		</test>
		<!-- NOTE(review): the reference file below ("gibts_nicht" is German for "does not exist")
		     is presumably absent from test-data, so this test fails by construction. Confirm the
		     intent; a test for a failing tool run would use expect_failure on the test element. -->
		<test> <!-- negative test -->
			<section name="file_input">
				<param name="reads" value="PARANOiD-test-reads.fastq" />
				<param name="reference" value="PARANOiD-test-reference.fasta" />
				<param name="barcodes" value="PARANOiD-test-barcodes.tsv" />
			</section>
			<output_collection name="output_cross_link_sites_wig" type="list:paired">
				<element name="exp1_rep_3">
					<element name="forward" file="gibts_nicht.wig" ftype="wig" compare="contains" />
				</element>
			</output_collection>
		</test>
	</tests>
	<help><![CDATA[**What it does**
	iCLIP is a special variant of CLIP-experiments that increases the accuracy of cross link site detection to the exact nucleotide.
	PARANOiD is a workflow that offers an automated analysis of read data acquired by iCLIP experiments. It offers results in different file formats that can easily
	be visualized using the IGV. Furthermore, it offers several additional analyses that can be activated or deactivated as needed. These analyses include:

	*Replicate merging*
	Merges several replicates into a single representative version which can be used for publications, posters or presentations. 
	This version shows the mean hit count for every position. Additionally, a correlation analysis is performed to give the user 
	an evaluation of the sample similarity and therefore a rationale for this analysis.

	*RNA subtype analysis*
	Gives an overview of RNA subtypes the protein of interest was cross-linked to. Specific subtypes can be chosen by the user. Must be provided via a comma separated list
	[e.g. 3_prime_UTR,transcript,5_prime_UTR]. Is activated when providing an annotation file

	*Transcript analysis*
	An analysis that shows if specific transcripts are more prone to bind to the protein of interest. The top N (default: N=10) hits of each replicate are provided as output
	together with a fasta file containing all transcripts present in these top hits. Should only be activated if the reference file consists of transcripts.

	*Peak calling*
	Peak calling is used to filter out background noise generated in iCLIP experiments and point out actual cross-link events. 
	PARANOiD employs PureCLIP for its peak calling process. PureCLIP uses a hidden Markov model to divide the reference into 4 different states based on the peak distribution.
	Additionally, identified peaks in close proximity can be merged into binding regions. 

	*Motif detection*
	Protein binding sites are often determined by protein-specific RNA motifs. These motifs are typically found at or in close proximity to cross-linking sites. To identify 
	these motifs the motif detection was implemented. Background noise is being filtered out by using only the top percentiles of peaks (by default only the top 10% are used). 
	Sequences around all peaks above the threshold are extracted and provided as output. All extracted sequences are then used for motif detection via streme, which offers 
	several enriched sequences.

	*Peak distance analysis*
	Some proteins bind to long stretches of RNA instead of certain motif-dependent RNA subregions. This is, for example, the case with the Nucleocapsid (N) protein of 
	several virus species which bind to a distinct number of nucleotides per N protein while packaging the viral RNA. The peak distance analysis was implemented to detect
	such periodical RNA-protein interactions by determining the occurrences of distances between peaks. 
	Background noise is being filtered out by using only the top percentiles of peaks (by default only the top 10% are used). Then, going through every peak above the 
	threshold, the distances to all other peaks above this threshold, which are within a certain distance (by default 30 nt) are measured, summarized and provided as a 
	TSV file and visualized as a plot.

	A more detailed description can be accessed via https://paranoid.readthedocs.io/en/readthedocs/index.html
	]]></help>
</tool>