Commit f9e69c0

Added initial nextflow scripts for gene-oracle and KINC
1 parent b93e1a5

File tree: 6 files changed, +429 -2 lines


.gitignore (+1)

@@ -1 +1,2 @@
+.nextflow*
 *.yaml

KINC/Dockerfile (+2 -2)

@@ -30,7 +30,7 @@ ENV PATH "$QTDIR/bin:$PATH"
 RUN cd /opt \
   && git clone https://github.com/SystemsGenetics/ACE.git \
   && cd ACE/build \
-  && git checkout develop \
+  && git checkout v3.0.2 \
   && qmake ../src/ACE.pro PREFIX=/opt/ace \
   && make -j 20 \
   && make qmake_all \

@@ -46,7 +46,7 @@ ENV LD_LIBRARY_PATH "$ACEDIR/lib:$LD_LIBRARY_PATH"
 RUN cd /opt \
   && git clone https://github.com/SystemsGenetics/KINC.git \
   && cd KINC/build \
-  && git checkout develop \
+  && git checkout v3.2.2 \
   && qmake ../src/KINC.pro PREFIX=/opt/kinc \
   && make -j 20 \
   && make qmake_all \
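
Pinning the ACE and KINC checkouts to release tags rather than the moving develop branch makes the image build reproducible. A minimal build sketch, assuming it is run from the repository root; the image name kinc:v3.2.2 is illustrative, not something the repo prescribes:

    # build the KINC image from the directory containing this Dockerfile
    docker build -t kinc:v3.2.2 KINC/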

KINC/main.nf (new file, +151)

#!/usr/bin/env nextflow



/**
 * The import_emx process converts a plain-text expression matrix into
 * a KINC data object.
 */
process import_emx {
    publishDir params.output_dir

    output:
        file("*.emx") into EMX_FILE

    script:
        """
        EMX_FILE="\$(basename ${params.dataset} .txt).emx"

        kinc settings set opencl 0:0 || echo
        kinc settings set threads 4 || echo
        kinc settings set logging off || echo

        kinc run import-emx \
            --input ${params.dataset} \
            --output \$EMX_FILE
        """
}



/**
 * Send the EMX file to each process that consumes it; a queue channel
 * may only be used as an input once.
 */
EMX_FILE.into { EMX_FILE_SIMILARITY; EMX_FILE_MERGE; EMX_FILE_EXTRACT }



/**
 * The similarity process performs a single chunk of KINC similarity.
 */
process similarity {
    tag { index }

    input:
        file(emx_file) from EMX_FILE_SIMILARITY
        each index from Channel.from( 0 .. params.chunks-1 )

    output:
        set val(emx_file.name), file("*.abd") into SIMILARITY_CHUNKS

    script:
        """
        kinc chunkrun ${index} ${params.chunks} similarity \
            --input ${emx_file} \
            --clusmethod ${params.clus_method} \
            --corrmethod ${params.corr_method}
        """
}



/**
 * Group the output chunks from similarity by EMX file name so that
 * they can be merged together.
 */
GROUPED_CHUNKS = SIMILARITY_CHUNKS.groupTuple()



/**
 * The merge process takes the output chunks from similarity
 * and merges them into the final output files.
 */
process merge {
    publishDir params.output_dir

    input:
        file(emx_file) from EMX_FILE_MERGE
        set val(emx_name), file(chunks) from GROUPED_CHUNKS

    output:
        file("*.ccm") into CCM_FILE
        file("*.cmx") into CMX_FILE

    script:
        """
        CCM_FILE="\$(basename ${params.dataset} .txt).ccm"
        CMX_FILE="\$(basename ${params.dataset} .txt).cmx"

        kinc merge ${params.chunks} similarity \
            --input ${emx_file} \
            --ccm \$CCM_FILE \
            --cmx \$CMX_FILE
        """
}



/**
 * Copy the CMX file into all processes that use it.
 */
CMX_FILE.into { CMX_FILE_THRESHOLD; CMX_FILE_EXTRACT }



/**
 * The threshold process takes the correlation matrix from similarity
 * and attempts to find a suitable correlation threshold.
 */
process threshold {
    publishDir params.output_dir

    input:
        file(cmx_file) from CMX_FILE_THRESHOLD

    output:
        file("*-threshold.log") into THRESHOLD_LOG

    script:
        """
        LOG_FILE="\$(basename ${params.dataset} .txt)-threshold.log"

        kinc run rmt \
            --input ${cmx_file} \
            --log \$LOG_FILE
        """
}



/**
 * The extract process takes the EMX, CCM, and CMX files, along with
 * the threshold found by RMT, and extracts the co-expression network.
 */
process extract {
    publishDir params.output_dir

    input:
        file(emx_file) from EMX_FILE_EXTRACT
        file(ccm_file) from CCM_FILE
        file(cmx_file) from CMX_FILE_EXTRACT
        file(log_file) from THRESHOLD_LOG

    output:
        file("*-net.txt")

    script:
        """
        NET_FILE="\$(basename ${params.dataset} .txt)-net.txt"
        THRESHOLD=\$(tail -n 1 ${log_file})

        kinc run extract \
            --emx ${emx_file} \
            --ccm ${ccm_file} \
            --cmx ${cmx_file} \
            --output \$NET_FILE \
            --mincorr \$THRESHOLD
        """
}
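
End to end, the pipeline turns one plain-text expression matrix into a thresholded co-expression network: import_emx, chunked similarity, merge, RMT thresholding, extract. A launch sketch, assuming the nextflow.config below sits next to main.nf; the values shown are just its defaults made explicit:

    # run the whole pipeline locally, splitting similarity into 10 chunks
    nextflow run main.nf --dataset $PWD/data/Yeast.txt --chunks 10 \
        --clus_method none --corr_method pearson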

KINC/nextflow.config (new file, +70)

manifest {
    mainScript = "main.nf"
    defaultBranch = "master"
    nextflowVersion = ">=0.32.0"
}

params {
    dataset = "${PWD}/data/Yeast.txt"
    chunks = 10
    clus_method = "none"
    corr_method = "pearson"
    output_dir = "${PWD}/output"

    execution {
        queue_size = 100
        threads = 1
        max_retries = 0
        error_strategy = "terminate"
    }
}



report {
    file = "${params.output_dir}/report.html"
}



timeline {
    file = "${params.output_dir}/timeline.html"
}



trace {
    fields = "task_id,hash,native_id,process,tag,name,status,exit,module,container,cpus,time,disk,memory,attempt,submit,start,complete,duration,realtime,queue,%cpu,%mem,rss,vmem,peak_rss,peak_vmem,rchar,wchar,syscr,syscw,read_bytes,write_bytes"
    file = "${params.output_dir}/trace.txt"
    raw = true
}



process {
    // compare attempts numerically; string comparison breaks past 9 retries
    errorStrategy = { task.attempt <= params.execution.max_retries ? "retry" : params.execution.error_strategy }
    maxRetries = params.execution.max_retries
    // maxErrors = 1000
}



profiles {

    standard {
        process.executor = "local"
        executor.cpus = 1
        executor.memory = "8 GB"
    }

    pbs {
        process {
            executor = "pbs"
            time = "8h"
            clusterOptions = "-l select=1:mem=2gb:ncpus=2:ngpus=2:gpu_model=p100"
        }
        executor {
            queueSize = params.execution.queue_size
        }
    }
}
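
Every value under params can be overridden at launch with a double-dash flag, and -profile selects between the local and PBS executors. A usage sketch; the chunk count is an example, not a recommendation:

    # local run with the defaults
    nextflow run main.nf -profile standard

    # PBS run with more similarity chunks
    nextflow run main.nf -profile pbs --chunks 100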

gene-oracle/main.nf (new file, +124)

#!/usr/bin/env nextflow



/**
 * The split process splits the input subset list into chunks.
 */
process split {
    input:
        val(infile) from params.subset_list

    output:
        file("*") into SUBSET_CHUNKS mode flatten

    when:
        params.subset == true

    script:
        """
        split -d -n r/${params.chunks} $infile ""
        """
}



/**
 * The subset process performs the experiments from a single chunk
 * of the subset list.
 */
process subset {
    tag { chunk.name }

    input:
        file(chunk) from SUBSET_CHUNKS

    output:
        set val("subset"), file("*.log") into SUBSET_LOGS

    script:
        """
        source activate gene-oracle

        # classify.py must run from the gene-oracle repo; the log is
        # written back to the task work directory via \$OLDPWD
        cd ${HOME}/workspace/gene-oracle

        python scripts/classify.py \
            --dataset ${params.dataset} \
            --gene_list ${params.gene_list} \
            --sample_json ${params.sample_json} \
            --config ${params.config} \
            --out_file \$OLDPWD/subset.${chunk.name}.log \
            --subset_list \$OLDPWD/${chunk.name} \
            --verbose
        """
}



/**
 * The random process performs a single chunk of the random experiments.
 */
process random {
    tag { index }

    input:
        val(index) from Channel.from( 0 .. params.chunks-1 )

    output:
        set val("random"), file("*.log") into RANDOM_LOGS

    when:
        params.random == true

    script:
        """
        IDX=\$(printf %02d $index)
        let "MIN = $params.random_min + ($params.random_max - $params.random_min + 1) * $index / $params.chunks"
        let "MAX = $params.random_min + ($params.random_max - $params.random_min + 1) * ($index + 1) / $params.chunks - 1"

        source activate gene-oracle

        cd ${HOME}/workspace/gene-oracle

        python scripts/classify.py \
            --dataset ${params.dataset} \
            --gene_list ${params.gene_list} \
            --sample_json ${params.sample_json} \
            --config ${params.config} \
            --out_file \$OLDPWD/random.\$IDX.log \
            --random_test \
            --range_random_genes \$MIN \$MAX \
            --rand_iters ${params.random_iters} \
            --verbose
        """
}



/**
 * Group the output chunks by prefix so that they can be merged.
 */
MERGE_CHUNKS = Channel.empty()
    .concat(SUBSET_LOGS, RANDOM_LOGS)
    .groupTuple()



/**
 * The merge process takes the output chunks from the previous processes
 * and merges them into a single log file per prefix.
 */
process merge {
    publishDir params.output_dir
    tag { prefix }

    input:
        set val(prefix), file(chunks) from MERGE_CHUNKS

    output:
        file("${prefix}.log")

    script:
        """
        cat ${chunks} > ${prefix}.log
        """
}
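
Unlike the KINC pipeline, this script defines no defaults for its params, so every value must come from the command line or a config file. A launch sketch with purely illustrative paths and ranges; none of these file names are fixed by the script:

    nextflow run main.nf \
        --dataset data/GEM.txt --gene_list data/genes.txt \
        --sample_json data/samples.json --config config.json \
        --subset true --subset_list data/subsets.txt \
        --random true --random_min 1 --random_max 20 --random_iters 10 \
        --chunks 10 --output_dir $PWD/output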
