Commit f9e69c0

Added initial nextflow scripts for gene-oracle and KINC
1 parent b93e1a5

File tree: 6 files changed, +429 -2 lines


.gitignore (+1)

@@ -1 +1,2 @@
+.nextflow*
 *.yaml

KINC/Dockerfile (+2 -2)

@@ -30,7 +30,7 @@ ENV PATH "$QTDIR/bin:$PATH"
 RUN cd /opt \
   && git clone https://github.com/SystemsGenetics/ACE.git \
   && cd ACE/build \
-  && git checkout develop \
+  && git checkout v3.0.2 \
   && qmake ../src/ACE.pro PREFIX=/opt/ace \
   && make -j 20 \
   && make qmake_all \

@@ -46,7 +46,7 @@ ENV LD_LIBRARY_PATH "$ACEDIR/lib:$LD_LIBRARY_PATH"
 RUN cd /opt \
   && git clone https://github.com/SystemsGenetics/KINC.git \
   && cd KINC/build \
-  && git checkout develop \
+  && git checkout v3.2.2 \
   && qmake ../src/KINC.pro PREFIX=/opt/kinc \
   && make -j 20 \
   && make qmake_all \
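
Pinning the ACE and KINC checkouts to release tags rather than the moving develop branch makes the image build reproducible. A minimal build sketch, assuming it is run from the repository root; the image name kinc:v3.2.2 is illustrative, not something the repo prescribes:

    # build the KINC image from the directory containing this Dockerfile
    docker build -t kinc:v3.2.2 KINC/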

KINC/main.nf (new file, +151)

#!/usr/bin/env nextflow



/**
 * The import_emx process converts a plain-text expression matrix into
 * a KINC data object.
 */
process import_emx {
    publishDir params.output_dir

    output:
        file("*.emx") into EMX_FILE

    script:
        """
        EMX_FILE="\$(basename ${params.dataset} .txt).emx"

        kinc settings set opencl 0:0 || echo
        kinc settings set threads 4 || echo
        kinc settings set logging off || echo

        kinc run import-emx \
            --input ${params.dataset} \
            --output \$EMX_FILE
        """
}



/**
 * Send the EMX file to each process that consumes it; a queue channel
 * may only be used as an input once.
 */
EMX_FILE.into { EMX_FILE_SIMILARITY; EMX_FILE_MERGE; EMX_FILE_EXTRACT }



/**
 * The similarity process performs a single chunk of KINC similarity.
 */
process similarity {
    tag { index }

    input:
        file(emx_file) from EMX_FILE_SIMILARITY
        each index from Channel.from( 0 .. params.chunks-1 )

    output:
        set val(emx_file.name), file("*.abd") into SIMILARITY_CHUNKS

    script:
        """
        kinc chunkrun ${index} ${params.chunks} similarity \
            --input ${emx_file} \
            --clusmethod ${params.clus_method} \
            --corrmethod ${params.corr_method}
        """
}



/**
 * Group the output chunks from similarity by EMX file name so that
 * they can be merged together.
 */
GROUPED_CHUNKS = SIMILARITY_CHUNKS.groupTuple()



/**
 * The merge process takes the output chunks from similarity
 * and merges them into the final output files.
 */
process merge {
    publishDir params.output_dir

    input:
        file(emx_file) from EMX_FILE_MERGE
        set val(emx_name), file(chunks) from GROUPED_CHUNKS

    output:
        file("*.ccm") into CCM_FILE
        file("*.cmx") into CMX_FILE

    script:
        """
        CCM_FILE="\$(basename ${params.dataset} .txt).ccm"
        CMX_FILE="\$(basename ${params.dataset} .txt).cmx"

        kinc merge ${params.chunks} similarity \
            --input ${emx_file} \
            --ccm \$CCM_FILE \
            --cmx \$CMX_FILE
        """
}



/**
 * Copy the CMX file into all processes that use it.
 */
CMX_FILE.into { CMX_FILE_THRESHOLD; CMX_FILE_EXTRACT }



/**
 * The threshold process takes the correlation matrix from similarity
 * and attempts to find a suitable correlation threshold.
 */
process threshold {
    publishDir params.output_dir

    input:
        file(cmx_file) from CMX_FILE_THRESHOLD

    output:
        file("*-threshold.log") into THRESHOLD_LOG

    script:
        """
        LOG_FILE="\$(basename ${params.dataset} .txt)-threshold.log"

        kinc run rmt \
            --input ${cmx_file} \
            --log \$LOG_FILE
        """
}



/**
 * The extract process takes the EMX, CCM, and CMX files, along with
 * the threshold found by RMT, and extracts the co-expression network.
 */
process extract {
    publishDir params.output_dir

    input:
        file(emx_file) from EMX_FILE_EXTRACT
        file(ccm_file) from CCM_FILE
        file(cmx_file) from CMX_FILE_EXTRACT
        file(log_file) from THRESHOLD_LOG

    output:
        file("*-net.txt")

    script:
        """
        NET_FILE="\$(basename ${params.dataset} .txt)-net.txt"
        THRESHOLD=\$(tail -n 1 ${log_file})

        kinc run extract \
            --emx ${emx_file} \
            --ccm ${ccm_file} \
            --cmx ${cmx_file} \
            --output \$NET_FILE \
            --mincorr \$THRESHOLD
        """
}
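
End to end, the pipeline turns one plain-text expression matrix into a thresholded co-expression network: import_emx, chunked similarity, merge, RMT thresholding, extract. A launch sketch, assuming the nextflow.config below sits next to main.nf; the values shown are just its defaults made explicit:

    # run the whole pipeline locally, splitting similarity into 10 chunks
    nextflow run main.nf --dataset $PWD/data/Yeast.txt --chunks 10 \
        --clus_method none --corr_method pearson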

KINC/nextflow.config (new file, +70)

manifest {
    mainScript = "main.nf"
    defaultBranch = "master"
    nextflowVersion = ">=0.32.0"
}

params {
    dataset = "${PWD}/data/Yeast.txt"
    chunks = 10
    clus_method = "none"
    corr_method = "pearson"
    output_dir = "${PWD}/output"

    execution {
        queue_size = 100
        threads = 1
        max_retries = 0
        error_strategy = "terminate"
    }
}



report {
    file = "${params.output_dir}/report.html"
}



timeline {
    file = "${params.output_dir}/timeline.html"
}



trace {
    fields = "task_id,hash,native_id,process,tag,name,status,exit,module,container,cpus,time,disk,memory,attempt,submit,start,complete,duration,realtime,queue,%cpu,%mem,rss,vmem,peak_rss,peak_vmem,rchar,wchar,syscr,syscw,read_bytes,write_bytes"
    file = "${params.output_dir}/trace.txt"
    raw = true
}



process {
    // compare attempts numerically; string comparison breaks past 9 retries
    errorStrategy = { task.attempt <= params.execution.max_retries ? "retry" : params.execution.error_strategy }
    maxRetries = params.execution.max_retries
    // maxErrors = 1000
}



profiles {

    standard {
        process.executor = "local"
        executor.cpus = 1
        executor.memory = "8 GB"
    }

    pbs {
        process {
            executor = "pbs"
            time = "8h"
            clusterOptions = "-l select=1:mem=2gb:ncpus=2:ngpus=2:gpu_model=p100"
        }
        executor {
            queueSize = params.execution.queue_size
        }
    }
}
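
Every value under params can be overridden at launch with a double-dash flag, and -profile selects between the local and PBS executors. A usage sketch; the chunk count is an example, not a recommendation:

    # local run with the defaults
    nextflow run main.nf -profile standard

    # PBS run with more similarity chunks
    nextflow run main.nf -profile pbs --chunks 100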

gene-oracle/main.nf (new file, +124)

#!/usr/bin/env nextflow



/**
 * The split process splits the input subset list into chunks.
 */
process split {
    input:
        val(infile) from params.subset_list

    output:
        file("*") into SUBSET_CHUNKS mode flatten

    when:
        params.subset == true

    script:
        """
        split -d -n r/${params.chunks} $infile ""
        """
}



/**
 * The subset process performs the experiments from a single chunk
 * of the subset list.
 */
process subset {
    tag { chunk.name }

    input:
        file(chunk) from SUBSET_CHUNKS

    output:
        set val("subset"), file("*.log") into SUBSET_LOGS

    script:
        """
        source activate gene-oracle

        # classify.py must run from the gene-oracle repo; the log is
        # written back to the task work directory via \$OLDPWD
        cd ${HOME}/workspace/gene-oracle

        python scripts/classify.py \
            --dataset ${params.dataset} \
            --gene_list ${params.gene_list} \
            --sample_json ${params.sample_json} \
            --config ${params.config} \
            --out_file \$OLDPWD/subset.${chunk.name}.log \
            --subset_list \$OLDPWD/${chunk.name} \
            --verbose
        """
}



/**
 * The random process performs a single chunk of the random experiments.
 */
process random {
    tag { index }

    input:
        val(index) from Channel.from( 0 .. params.chunks-1 )

    output:
        set val("random"), file("*.log") into RANDOM_LOGS

    when:
        params.random == true

    script:
        """
        IDX=\$(printf %02d $index)
        let "MIN = $params.random_min + ($params.random_max - $params.random_min + 1) * $index / $params.chunks"
        let "MAX = $params.random_min + ($params.random_max - $params.random_min + 1) * ($index + 1) / $params.chunks - 1"

        source activate gene-oracle

        cd ${HOME}/workspace/gene-oracle

        python scripts/classify.py \
            --dataset ${params.dataset} \
            --gene_list ${params.gene_list} \
            --sample_json ${params.sample_json} \
            --config ${params.config} \
            --out_file \$OLDPWD/random.\$IDX.log \
            --random_test \
            --range_random_genes \$MIN \$MAX \
            --rand_iters ${params.random_iters} \
            --verbose
        """
}



/**
 * Group the output chunks by prefix so that they can be merged.
 */
MERGE_CHUNKS = Channel.empty()
    .concat(SUBSET_LOGS, RANDOM_LOGS)
    .groupTuple()



/**
 * The merge process takes the output chunks from the previous processes
 * and merges them into a single log file per prefix.
 */
process merge {
    publishDir params.output_dir
    tag { prefix }

    input:
        set val(prefix), file(chunks) from MERGE_CHUNKS

    output:
        file("${prefix}.log")

    script:
        """
        cat ${chunks} > ${prefix}.log
        """
}
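
Unlike the KINC pipeline, this script defines no defaults for its params, so every value must come from the command line or a config file. A launch sketch with purely illustrative paths and ranges; none of these file names are fixed by the script:

    nextflow run main.nf \
        --dataset data/GEM.txt --gene_list data/genes.txt \
        --sample_json data/samples.json --config config.json \
        --subset true --subset_list data/subsets.txt \
        --random true --random_min 1 --random_max 20 --random_iters 10 \
        --chunks 10 --output_dir $PWD/output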
