-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSnakefile
474 lines (447 loc) · 19.3 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
# Workflow-wide settings (marker, nfamilies, file paths, filters, model, etc.)
# are read from this config file; every rule below pulls values from `config`.
configfile: "config/config.yaml"

# TODO refactor the snakefile so that all dependencies are managed by a containerized mamba.
# TODO make it so that parallelization is dynamic, probably using checkpoints.
# container: "docker://condaforge/mambaforge:23.1.0-1"
# Pseudo-rule declaring the final workflow target: the grafted total tree.
rule all:
    input: "results/grafted.tre"
# Creates and populates a SQLite database with filtered sequence records.
# Uses BOLD dump TSV as defined in config file
rule create_database:
    input: config["file_names"]["bold_tsv"]
    # Sentinel file only; the real product is the database under params.db.
    output: 'results/databases/create_database.ok'
    params:
        # Database path embeds the configured marker name (e.g. COI-5P).
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        # NOTE(review): marker and minlength are declared here but not passed to
        # the script below — confirm whether create_database.py should receive
        # them (e.g. for length filtering) or whether they are vestigial.
        marker=config["marker"],
        minlength=config["minlength"],
        log_level=config['log_level']
    conda: "envs/create_database.yml"
    log: "logs/create_database/create_database.log"
    benchmark:
        "benchmarks/create_database.benchmark.txt"
    shell:
        """
        python workflow/scripts/create_database.py \
            -v {params.log_level} \
            -o {params.db} \
            -i {input} 2> {log}
        touch {output}
        """
# Enriches the database with mappings to OpenToL. Because this operates on
# the same database file, the output is a 0-byte file `map_opentol.ok` to
# indicate that the task was run.
rule map_opentol:
    input: 'results/databases/create_database.ok'
    output: 'results/databases/map_opentol.ok'
    params:
        # Same database file produced by create_database; mutated in place.
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        marker=config['marker'],
        log_level=config['log_level']
    conda: "envs/map_opentol.yml"
    log: "logs/map_opentol/map_opentol.log"
    benchmark:
        "benchmarks/map_opentol.benchmark.txt"
    shell:
        """
        python workflow/scripts/map_opentol.py \
            -d {params.db} \
            -m {params.marker} \
            -v {params.log_level} \
            -o {output} 2> {log}
        """
# Creates and populates the OpenToL SQLite database. Uses the location of
# the OpenToL tree as per the config file. Merges the resulting database
# with the BOLD database (which gains a table `node`). SQLite can, at time
# of writing, not apply foreign key constraints retroactively. This is a
# shame because taxon.opentol_id implicitly references node.id.
rule megatree_loader:
    input:
        tree = config["file_names"]["open_tre"],
        mapping_ok = rules.map_opentol.output
    output: 'results/databases/megatree_loader.ok'
    conda: "envs/megatree_loader.yml"
    log: "logs/megatree_loader/megatree_loader.log"
    benchmark:
        "benchmarks/megatree_loader.benchmark.txt"
    params:
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        # Intermediate scratch files, removed by the `rm` at the end of the
        # shell block. Note: these must NOT be wrapped in temp() — that flag
        # only has meaning for entries under `output:`; in `params:` it is a
        # silent no-op and misleads readers into expecting automatic cleanup.
        tempdb="results/databases/opentree_nodes.db",
        tempsql="results/databases/node.sql"
    shell:
        """
        megatree-loader -i {input.tree} -d {params.tempdb} -v 2> {log}
        sqlite3 {params.tempdb} ".dump node" > {params.tempsql} 2>> {log}
        sqlite3 {params.db} < {params.tempsql} 2>> {log}
        touch {output} 2>> {log}
        rm {params.tempsql} {params.tempdb}
        """
# TODO either this is updated dynamically or scattergather is abandoned for dynamic()
# Fixes the number of scatter/gather chunks used for family-level
# parallelization; every scatter.split(...) / gather.split(...) below expands
# to this many {scatteritem} instances.
scattergather:
    split = config["nfamilies"]
# Exports unaligned BIN sequences for each family within the total set. This task is parallelized as many times as
# specified by config["nfamilies"]. During this task, the longest raw sequence within each BIN is selected.
rule family_fasta:
    input: rules.map_opentol.output
    output:
        # One unaligned FASTA per scatter chunk, plus a sentinel marking completion.
        fastas=scatter.split("results/fasta/family/{scatteritem}/unaligned.fa"),
        status=f'{config["file_names"]["fasta_dir"]}/family_fasta.ok'
    params:
        log_level=config['log_level'],
        fasta_dir=config["file_names"]["fasta_dir"],
        # Taxonomic level and name used to subset the database (e.g. a class or order).
        filter_level=config["fasta_filter"]["filter_level"],
        filter_name=config["fasta_filter"]["filter_name"],
        marker=config["marker"],
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        chunks=config["nfamilies"],
    conda: "envs/family_fasta.yml"
    log: "logs/family_fasta/family_fasta.log"
    benchmark:
        "benchmarks/family_fasta.benchmark.txt"
    shell:
        """
        python workflow/scripts/family_fasta.py \
            -d {params.db} \
            -f {params.fasta_dir} \
            -l {params.filter_level} \
            -n {params.filter_name} \
            -c {params.chunks} \
            -m {params.marker} \
            -v {params.log_level} 2> {log}
        touch {output.status}
        """
# Creates a local BLAST database from the exported sequences that have OTT IDs. Later on, this database will be used
# for locating outgroups that are i) close to the ingroup but not inside it, ii) occur in the OpenToL. The latter will
# hopefully mean that even monotypic families will have a constraint tree with >= 3 tips so all chunks can be treated
# identically.
rule makeblastdb:
    input: rules.family_fasta.output.status
    # .nsq is one of the volume files makeblastdb writes; used here as a proxy
    # for the whole database.
    output: f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}.nsq'
    params:
        chunks=config["nfamilies"],
        fasta_dir=config["file_names"]["fasta_dir"],
        # The per-family FASTAs are concatenated into this file before indexing.
        concatenated=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}.fa',
        blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}'
    conda: "envs/blast.yml"
    log: "logs/makeblastdb/makeblastdb.log"
    benchmark:
        "benchmarks/makeblastdb.benchmark.txt"
    shell:
        """
        sh workflow/scripts/makeblastdb.sh \
            {params.blastdb} {params.fasta_dir} {params.chunks} {params.concatenated} 2> {log}
        """
# Gets the nearest outgroups by blasting. The number of outgroups is defined in config["outgroups"]. The outgroups are
# selected by querying every sequence in the ingroup for the top 10 hits, then taking the most frequently occurring
# hits across all queries.
rule get_outgroups:
    input:
        unaligned = "results/fasta/family/{scatteritem}/unaligned.fa",
        makeblastdb = rules.makeblastdb.output
    output: "results/fasta/family/{scatteritem}/outgroups.fa",
    params:
        # How many outgroup sequences to select per family chunk.
        outgroups = config["outgroups"],
        blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}'
    conda: "envs/blast.yml"
    log: "logs/get_outgroups/get_outgroups-{scatteritem}.log"
    benchmark:
        "benchmarks/get_outgroups/get_outgroups-{scatteritem}.benchmark.txt"
    shell:
        """
        sh workflow/scripts/get_outgroups.sh {params.blastdb} {params.outgroups} {input.unaligned} {output} 2> {log}
        """
# For each family chunk, fetches an OpenToL constraint tree covering the
# ingroup plus its selected outgroups. Uses the induced-subtree web service
# endpoint from OpenToL, which appears to achieve better coverage than the
# previous implementation.
rule family_constraint:
    input:
        unaligned="results/fasta/family/{scatteritem}/unaligned.fa",
        outgroups="results/fasta/family/{scatteritem}/outgroups.fa"
    output:
        "results/fasta/family/{scatteritem}/constraint.tre"
    params:
        db=f"results/databases/BOLD_{config['marker']}_barcodes.db",
        log_level=config["log_level"]
    conda: "envs/family_constraint.yml"
    log: "logs/family_constraint/family_constraint-{scatteritem}.log"
    benchmark:
        "benchmarks/family_constraint/family_constraint-{scatteritem}.benchmark.txt"
    shell:
        """
        python workflow/scripts/family_constraint.py \
            -i {input.unaligned} \
            -g {input.outgroups} \
            -d {params.db} \
            -o {output} \
            -v {params.log_level} 2> {log}
        """
# Aligns each family chunk against the profile HMM, also correcting sequences
# for possible reverse-complement issues. So far zero revcom sequences have
# been observed in BOLD, so that correction may turn out to be unnecessary.
rule msa_hmm:
    input:
        ingroup="results/fasta/family/{scatteritem}/unaligned.fa",
        outgroup="results/fasta/family/{scatteritem}/outgroups.fa"
    output:
        "results/fasta/family/{scatteritem}/aligned.fa"
    params:
        db=f"results/databases/BOLD_{config['marker']}_barcodes.db",
        hmm_file=config["file_names"]["hmm"],
        log_level=config["log_level"]
    conda: "envs/msa_hmm.yml"
    log: "logs/msa_hmm/msa_hmm-{scatteritem}.log"
    benchmark:
        "benchmarks/msa_hmm/msa_hmm-{scatteritem}.benchmark.txt"
    shell:
        """
        python workflow/scripts/msa_hmm.py \
            -i {input.ingroup} \
            -g {input.outgroup} \
            -o {output} \
            -m {params.hmm_file} \
            -v {params.log_level} \
            -d {params.db} 2> {log}
        """
# Prepares the aligned ingroup sequences and constraint tree for analysis by raxml-ng. This step also includes outgroup
# selection. A number of changes are needed here. Firstly, there are cases where the selected outgroup sequences yield
# results where the ingroup is not monophyletic with respect to the outgroups. This means that outgroup selection needs
# to be refined by considering all pairwise distances within the ingroup, and any outgroup must have a greater distance
# to the nearest ingroup taxon than the latter has to all other members of the ingroup. Secondly, the constraint tree
# should be generated here so that it also includes the outgroups, which means that an additional outgroup selection
# criterion should be that the candidate outgroups are in the OpenToL.
rule prep_raxml:
    input:
        alignment="results/fasta/family/{scatteritem}/aligned.fa",
        tree="results/fasta/family/{scatteritem}/constraint.tre"
    output:
        # Constraint tree with tip labels remapped to alignment sequence IDs.
        tree="results/fasta/family/{scatteritem}/remapped.tre"
    params:
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        log_level=config['log_level']
    conda: "envs/prep_raxml.yml"
    log: "logs/prep_raxml/prep_raxml-{scatteritem}.log"
    benchmark:
        "benchmarks/prep_raxml/prep_raxml-{scatteritem}.benchmark.txt"
    shell:
        """
        python workflow/scripts/prep_raxml.py \
            -t {input.tree} \
            -o {output.tree} \
            -a {input.alignment} \
            -v {params.log_level} \
            -d {params.db} 2> {log}
        """
# Runs raxml as a constrained tree search. Possibly this should instead take a two-step approach, where unplaced
# sequences are first placed, and then the branch lengths are estimated. This requires that there actually is a
# constraint tree. Currently, this rule deals with the fact that some of the constraint trees are 0-byte files due
# to incomplete OpenToL coverage by trapping raxml-ng errors and then re-running without the constraint.
rule run_raxml:
    input:
        alignment = "results/fasta/family/{scatteritem}/aligned.fa",
        tree = "results/fasta/family/{scatteritem}/remapped.tre"
    output:
        # raxml-ng derives this name from the input alignment; an empty file is
        # produced instead when the alignment itself is empty (see shell below).
        tree = "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree"
    params:
        model = config['model'],
        num_outgroups= config['outgroups']
    log: "logs/run_raxml/run_raxml-{scatteritem}.log"
    benchmark:
        "benchmarks/run_raxml/run_raxml-{scatteritem}.benchmark.txt"
    conda: "envs/raxml.yml"
    # The outgroup list $OG is built from the last N FASTA headers, relying on
    # get_outgroups.sh / msa_hmm.py appending outgroups after the ingroup.
    # NOTE(review): `tr '\n' ','` leaves a trailing comma on $OG — confirm
    # raxml-ng's --outgroup parser tolerates it.
    # NOTE(review): `set -e` is enabled only in the constrained branch; Snakemake
    # normally runs shell blocks under strict mode already — verify intent.
    shell:
        """
        if [ -s {input.alignment} ]; then
            OG=$(grep '>' {input.alignment} | tail -{params.num_outgroups} | sed -e 's/>//' | tr '\n' ',')
            if [ -s {input.tree} ]; then
                set -e
                raxml-ng \
                    --redo \
                    --msa {input.alignment} \
                    --outgroup $OG \
                    --model {params.model} \
                    --tree-constraint {input.tree} \
                    --search1 > {log} 2>&1 \
                    || \
                    raxml-ng \
                    --redo \
                    --msa {input.alignment} \
                    --outgroup $OG \
                    --model {params.model} \
                    --search1 >> {log} 2>&1
            else
                raxml-ng --msa {input.alignment} --outgroup $OG --model {params.model} --search --redo > {log} 2>&1
            fi
        else
            touch {output.tree}
        fi
        """
# Reroots the raxml output in and then prunes the outgroups. Rooting is done by finding the smallest clade subtended
# by the root in the constraint tree, looking for the smallest bipartition in the result tree that includes those same
# clade members and then rooting on that bipartition branch. Subsequently, the outgroup taxa are removed.
rule reroot_raxml_output:
    input:
        tree = "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree",
        constraint = "results/fasta/family/{scatteritem}/remapped.tre",
        alignment = "results/fasta/family/{scatteritem}/aligned.fa"
    output:
        outtree = "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree.rooted"
    params:
        log_level = config['log_level'],
        num_outgroups = config['outgroups']
    log: "logs/reroot_raxml_output/reroot_raxml_output-{scatteritem}.log"
    benchmark:
        "benchmarks/reroot_raxml_output/reroot_raxml_output-{scatteritem}.benchmark.txt"
    conda: "envs/reroot_backbone.yml"
    # $OG is rebuilt here exactly as in run_raxml (last N FASTA headers);
    # empty alignments propagate as empty output files, mirroring run_raxml.
    shell:
        """
        if [ -s {input.alignment} ]; then
            OG=$(grep '>' {input.alignment} | tail -{params.num_outgroups} | sed -e 's/>//' | tr '\n' ',')
            python workflow/scripts/reroot_backbone.py \
                -i {input.tree} \
                -c {input.constraint} \
                -o {output.outtree} \
                -g $OG \
                -v {params.log_level} 2> {log}
        else
            touch {output.outtree}
        fi
        """
# In principle, exemplar choosing can be performed using one of three ways:
# 1. the tallest tips on either side of the root are chosen
# 2. the shallowest tips on either side
# 3. the ones closest to the median root-to-tip path length
# Empirically, the first option yields rescaled branch lengths that are closest
# to those optimized freely on a total tree.
rule choose_exemplars:
    input:
        alignment = "results/fasta/family/{scatteritem}/aligned.fa",
        tree= "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree.rooted"
    output: "results/fasta/family/{scatteritem}/exemplars.fa"
    params:
        log_level = config['log_level'],
        # NOTE(review): 'median' corresponds to option 3, yet the comment above
        # reports option 1 ('tallest') as empirically best — confirm which
        # strategy is actually intended here.
        strategy = 'median'
    log: "logs/choose_exemplars/choose_exemplars-{scatteritem}.log"
    benchmark:
        "benchmarks/choose_exemplars/choose_exemplars-{scatteritem}.benchmark.txt"
    conda: "envs/choose_exemplars.yaml"
    shell:
        """
        if [ -s {input.alignment} ]; then
            python workflow/scripts/choose_exemplars.py \
                -v {params.log_level} \
                -t {input.tree} \
                -i {input.alignment} \
                -s {params.strategy} \
                -o {output} 2> {log}
        else
            touch {output}
        fi
        """
# When aligning the family-level subsets, different families may have different indel
# patterns. So, although they are all orientated against the model in the same way,
# there can be some frame shifting. The least bad option is to unalign and realign
# the exemplar sequences on the fly. Sed does the unaligning, hmmalign emits Stockholm
# data on the command line, seqmagick turns it into FASTA.
rule prep_raxml_backbone:
    input:
        # All per-family exemplar FASTAs (gathered across scatter chunks) and
        # the sentinel indicating the OpenToL database merge completed.
        fastas=gather.split("results/fasta/family/{scatteritem}/exemplars.fa"),
        opentol='results/databases/megatree_loader.ok'
    output:
        fasta="results/fasta/raxml-ready.fa",
        tree="results/fasta/raxml-ready.tre",
        extinct="results/fasta/extinct_pids.txt"
    params:
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        log_level = config['log_level'],
        hmm_file=config['file_names']['hmm']
    log: "logs/prep_raxml_backbone/prep_raxml_backbone.log"
    benchmark:
        "benchmarks/prep_raxml_backbone.benchmark.txt"
    conda: "envs/prep_raxml_backbone.yml"
    shell:
        """
        # Generate constraint tree and list of extinct PIDs
        python workflow/scripts/backbone_constraint.py \
            -d {params.db} \
            -v {params.log_level} \
            -i '{input.fastas}' \
            -e {output.extinct} \
            -o {output.tree} 2> {log}
        # Clean the concatenated FASTA by removing gaps (dashes)
        sed '/^>/! s/-//g' {input.fastas} > results/fasta/unaligned.fa
        # Align with hmmalign and output in Stockholm format
        hmmalign --trim --dna --informat FASTA --outformat Stockholm {params.hmm_file} results/fasta/unaligned.fa > results/fasta/aligned.sto
        # Convert the Stockholm alignment to a non-interleaved FASTA format for RAxML
        seqmagick convert results/fasta/aligned.sto {output.fasta}
        # Remove any extinct PIDs. Guarded with if/fi rather than `[ -e ... ] && cmd`
        # because a failed test as the last statement would yield a non-zero exit
        # status and fail the rule under bash strict mode (set -euo pipefail).
        if [ -e {output.extinct} ]; then
            seqmagick mogrify --exclude-from-file {output.extinct} {output.fasta}
        fi
        """
# Infers the backbone topology under ML with raxml-ng, constrained so that each
# pair of exemplars stays monophyletic. No outgroups are specified: this step
# may in principle span the entire taxonomic width of BOLD, so the tree is
# rooted later using the constraint instead.
rule run_raxml_backbone:
    input:
        alignment="results/fasta/raxml-ready.fa",
        tree="results/fasta/raxml-ready.tre"
    output:
        tree="results/fasta/raxml-ready.fa.raxml.bestTree"
    params:
        model=config["model"]
    conda: "envs/raxml.yml"
    log: "logs/run_raxml_backbone/run_raxml_backbone.log"
    benchmark:
        "benchmarks/run_raxml_backbone.benchmark.txt"
    shell:
        """
        raxml-ng \
            --redo \
            --msa {input.alignment} \
            --model {params.model} \
            --tree-constraint {input.tree} \
            --search > {log} 2>&1
        """
# Roots the backbone via the constraint tree: find the basal split in the
# constraint, locate the equivalent bipartition in the inferred tree, and root
# there. TODO: the root currently lands on the midpoint of that bipartition
# branch, which is not ideal, but with no outgroups available it is unclear
# what better option exists.
rule reroot_backbone:
    input:
        backbone="results/fasta/raxml-ready.fa.raxml.bestTree",
        constraint="results/fasta/raxml-ready.tre"
    output:
        tree="results/fasta/raxml-ready.fa.raxml.bestTree.rooted"
    params:
        log_level=config["log_level"]
    conda: "envs/reroot_backbone.yml"
    log: "logs/reroot_backbone/reroot_backbone.log"
    benchmark:
        "benchmarks/reroot_backbone.benchmark.txt"
    shell:
        """
        python workflow/scripts/reroot_backbone.py \
            -i {input.backbone} \
            -c {input.constraint} \
            -o {output.tree} \
            -v {params.log_level} 2> {log}
        """
# Grafts the individual family clades onto the backbone. This is done by taking the
# backbone tree and replacing the tips that are in the clade with the clade tree.
rule graft_clades:
    input:
        backbone = "results/fasta/raxml-ready.fa.raxml.bestTree.rooted",
        # Directory holding the per-family subtrees produced by the scatter rules.
        clades = config['file_names']['fasta_dir'],
        extinct = "results/fasta/extinct_pids.txt"
    output:
        # The final workflow product, consumed by `rule all`.
        tree = "results/grafted.tre"
    params:
        log_level = config['log_level'],
        nfamilies = config["nfamilies"]
    log: "logs/graft_clades/graft_clades.log"
    benchmark:
        "benchmarks/graft_clades.benchmark.txt"
    conda: "envs/graft_clades.yml"
    shell:
        """
        python workflow/scripts/graft_clades.py \
            -t {input.backbone} \
            -f {input.clades} \
            -e {input.extinct} \
            -o {output.tree} \
            -n {params.nfamilies} \
            -v {params.log_level} 2> {log}
        """