forked from bioinform/varsim
-
Notifications
You must be signed in to change notification settings - Fork 0
/
varsim.py
executable file
·713 lines (626 loc) · 37.2 KB
/
varsim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
#!/usr/bin/env python
import argparse
import os
import sys
import subprocess
import logging
import shutil
import time
import signal
import itertools
import glob
import tempfile
import re
import pysam
from liftover_restricted_vcf_map import lift_vcfs, lift_maps
from generate_small_test_ref import gen_restricted_ref_and_vcfs
from utils import makedirs, run_shell_command, versatile_open, get_loglevel, check_java, MY_DIR, VARSIMJAR, get_version
import utils
REQUIRE_VARSIMJAR = not os.path.isfile(VARSIMJAR)
if REQUIRE_VARSIMJAR: VARSIMJAR = None
def convertCN(filenames, operation):
"""
convert '2/1'-like copy number to a single number(e.g. 2)
0 will be considered same as 1
by default the max number will be kept
the change is in place
"""
logger = logging.getLogger(convertCN.__name__)
logger.info("convertCN started")
if operation != "two2one" and operation != "one2two":
raise ValueError("Only two2one or one2two allowed")
two2one = operation == "two2one"
delimiter = re.compile('[/|]')
for name in filenames:
logger.info("processing {}".format(name))
with versatile_open(name, 'r') as file_fd:
output = tempfile.NamedTemporaryFile(mode = 'r+w', delete = False)
for l in file_fd:
l = l.rstrip()
fields = l.split("\t")
if l.startswith("#") or 'CN' not in fields[8]:
if l.startswith('##FORMAT=<ID=CN'):
if two2one:
l = l.replace("Type=String","Type=Integer")
else:
l = l.replace("Type=Integer", "Type=String")
output.write(l + "\n")
else:
info = fields[8].split(':')
cnIndex = info.index('CN')
gtIndex = info.index('GT')
#change CN field in all samples
for sampleIndex in range(9,len(fields)):
sampleInfo = fields[sampleIndex].split(':')
if two2one:
cn = delimiter.split(sampleInfo[cnIndex])
#here cn is list of strings
sampleInfo[cnIndex] = str(max(map(int, cn)))
elif len(delimiter.split(sampleInfo[cnIndex])) == 1:
#only split when there is only one number
gt = delimiter.split(sampleInfo[gtIndex])
cn = sampleInfo[cnIndex]
for i in range(len(gt)):
gt[i] = '1' if gt[i] == '0' else cn
if sampleInfo[gtIndex].find('/') >= 0:
sampleInfo[cnIndex] = '/'.join(gt)
else:
sampleInfo[cnIndex] = '|'.join(gt)
fields[sampleIndex] = ":".join(sampleInfo)
output.write("\t".join(fields) + "\n")
output.close()
shutil.copyfile(output.name, name)
os.remove(output.name)
logger.info("convertCN done")
return
def get_contigs_list(reference):
with open("%s.fai" % (reference)) as fai_file:
contigs = [line.strip().split()[0] for line in fai_file.readlines()]
return contigs
def monitor_processes(processes):
logger = logging.getLogger(monitor_processes.__name__)
while processes:
time.sleep(1)
kill_all = False
processes_running = []
for p in processes:
status = p.poll()
if status is not None:
logger.info("Process %s exited with code %d" % (p.pid, status))
if status != 0:
kill_all = True
logger.error("Process %s failed. Will kill the remaining processes." % p.pid)
else:
processes_running.append(p)
if kill_all:
for p in processes:
status = p.poll()
if status is None:
try:
os.killpg(p.pid, signal.SIGTERM)
except OSError, e:
try:
os.killpg(p.pid, signal.SIGKILL)
except OSError, ex:
logger.error("Could not kill the process " + str(p.pid))
raise Exception('Aborting... Please check log for details.')
processes = processes_running
return []
def concatenate_files(files, merged, header_str="", simple_cat=True, remove_original=False):
logger = logging.getLogger(concatenate_files.__name__)
logger.info("Concatenating " + " ".join(files) + " as " + merged)
with open(merged, "w") as merged_fd:
for index, f in enumerate(files):
with open(f) as fd:
if simple_cat:
shutil.copyfileobj(fd, merged_fd)
else:
for line in fd:
if line.strip() and (not index or not header_str or not line.startswith(header_str)):
merged_fd.write(line)
if remove_original:
logger.info("Removing " + f)
os.remove(f)
def check_executable(fpath):
logger = logging.getLogger(check_executable.__name__)
if not os.path.isfile(fpath):
logger.error("ERROR: File %s does not exist\n" % (fpath))
sys.exit(os.EX_NOINPUT)
if not os.access(fpath, os.X_OK):
logger.error("ERROR: File %s is not executable\n" % (fpath))
sys.exit(os.EX_NOINPUT)
def fill_missing_sequences(vcf, id, seq_file, reference, work_dir, log_dir):
logger = logging.getLogger(fill_missing_sequences.__name__)
out_vcf = os.path.join(work_dir, os.path.basename(vcf))
if out_vcf.endswith(".gz"):
out_vcf = out_vcf[:-3]
out_log = os.path.join(log_dir, "%s_fill_missing.log" % (os.path.basename(vcf)))
command = ["java", utils.JAVA_XMX, "-jar", VARSIMJAR, "randsequencevcf", "-id", id, "-in_vcf", vcf, "-seq", seq_file, "-out_vcf", out_vcf, "-ref", reference]
with open(out_log, "w") as log_fd:
logger.info("Running command " + " ".join(command))
subprocess.check_call(" ".join(command), shell=True, stderr=log_fd)
return out_vcf
def run_vcfstats(vcfs, out_dir, log_dir):
logger = logging.getLogger(run_vcfstats.__name__)
processes = []
for in_vcf in vcfs:
out_prefix = os.path.basename(in_vcf)
vcfstats_stdout = open(os.path.join(out_dir, "%s.stats" % (out_prefix)), "w")
vcfstats_stderr = open(os.path.join(log_dir, "%s.vcfstats.err" % (out_prefix)), "w")
vcfstats_command = ["java", utils.JAVA_XMX, "-jar", VARSIMJAR, "vcfstats", "-vcf",
in_vcf]
logger.info("Executing command " + " ".join(vcfstats_command))
subprocess.check_call(vcfstats_command, stdout=vcfstats_stdout, stderr=vcfstats_stderr)
return processes
class RandVCFOptions:
def __init__(self, num_snp, num_ins, num_del, num_mnp, num_complex, percent_novel, min_length, max_length, prop_het, num_dup = 0, num_inv = 0):
self.num_snp = num_snp
self.num_ins = num_ins
self.num_del = num_del
self.num_mnp = num_mnp
self.num_dup = num_dup
self.num_inv = num_inv
self.num_complex = num_complex
self.percent_novel = percent_novel
self.min_length = min_length
self.max_length = max_length
self.prop_het = prop_het
class RandDGVOptions:
def __init__(self, num_ins, num_del, num_dup, num_inv, percent_novel, min_length, max_length, prop_het, output_all = " "):
self.num_ins = num_ins
self.num_del = num_del
self.num_dup = num_dup
self.num_inv = num_inv
self.percent_novel = percent_novel
self.min_length = min_length
self.max_length = max_length
self.prop_het = prop_het
self.output_all = output_all
def randdgv_options2randvcf_options(randdgv_options):
'''
automatically set up shared fields between RandVCFOptions and RandDGVOptions
:param randdgv_options:
:return: RandVCFOptions instance
'''
return RandVCFOptions(
num_snp= 0,
num_ins = randdgv_options.num_ins,
num_del = randdgv_options.num_del,
num_mnp = 0,
num_complex = 0,
percent_novel= randdgv_options.percent_novel,
min_length = randdgv_options.min_length,
max_length = randdgv_options.max_length,
prop_het=randdgv_options.prop_het,
num_dup = randdgv_options.num_dup,
num_inv = randdgv_options.num_inv
)
def run_randvcf(sampling_vcf, out_vcf_fd, log_file_fd, seed, sex, randvcf_options, reference):
logger = logging.getLogger(run_randvcf.__name__)
rand_vcf_command = ["java", utils.JAVA_XMX, "-jar", VARSIMJAR, "randvcf2vcf",
"-seed", str(seed),
"-t", sex,
"-num_snp", str(randvcf_options.num_snp),
"-num_ins", str(randvcf_options.num_ins),
"-num_del", str(randvcf_options.num_del),
"-num_mnp", str(randvcf_options.num_mnp),
"-num_complex", str(randvcf_options.num_complex),
"-num_dup", str(randvcf_options.num_dup),
"-num_inv", str(randvcf_options.num_inv),
"-novel", str(randvcf_options.percent_novel),
"-min_len", str(randvcf_options.min_length),
"-max_len", str(randvcf_options.max_length),
"-prop_het", str(randvcf_options.prop_het),
"-ref", os.path.realpath(reference),
"-vcf", sampling_vcf]
logger.info("Executing command " + " ".join(rand_vcf_command))
subprocess.check_call(rand_vcf_command, stdout=out_vcf_fd, stderr=log_file_fd)
return
def run_randdgv(dgv_file, out_vcf_fd, log_file_fd, seed, sex, options, reference, insert_seq_file):
logger = logging.getLogger(run_randvcf.__name__)
rand_dgv_command = ["java", utils.JAVA_XMX, "-jar", VARSIMJAR, "randdgv2vcf",
"-t", sex,
"-seed", str(seed),
"-num_ins", str(options.num_ins),
"-num_del", str(options.num_del),
"-num_dup", str(options.num_dup),
"-num_inv", str(options.num_inv),
"-novel", str(options.percent_novel),
"-min_len", str(options.min_length),
"-max_len", str(options.max_length),
"-prop_het", str(options.prop_het),
"-ref", os.path.realpath(reference),
"-ins", os.path.realpath(insert_seq_file),
"-dgv", os.path.realpath(dgv_file)]
if len(options.output_all.strip()) > 0:
rand_dgv_command.append(options.output_all)
logger.info("Executing command " + " ".join(rand_dgv_command))
subprocess.check_call(rand_dgv_command, stdout=out_vcf_fd, stderr=log_file_fd)
return
def varsim_main(reference,
simulator, # use None to disable simulation
simulator_exe,
total_coverage,
variant_vcfs=[],
sampling_vcf=None,
dgv_file=None,
randvcf_options=None, # use None to disable RandVCF
randdgv_options=None, # use None to disable RandDGV
nlanes=1,
simulator_options="",
sample_id="VarSim_Sample",
log_dir="log",
out_dir="out",
sv_insert_seq=None,
seed=0,
sex="MALE",
remove_filtered=False,
keep_temp=False,
force_five_base_encoding=False,
lift_ref=False,
disable_vcf2diploid=False):
check_java()
# make the directories we need
makedirs([log_dir, out_dir])
logger = logging.getLogger(varsim_main.__name__)
# Make sure we can actually execute the executable
if simulator:
if simulator not in ["dwgsim", "art", "longislnd"]:
raise NotImplementedError("Simulation method {} not implemented".format(simulator))
check_executable(simulator_exe)
processes = []
t_s = time.time()
variant_vcfs = map(os.path.realpath, variant_vcfs)
if sv_insert_seq:
in_vcfs = []
for i, vcf in enumerate(variant_vcfs):
tool_work_dir = os.path.join(out_dir, "filled_in", str(i))
makedirs([tool_work_dir])
in_vcfs.append(fill_missing_sequences(vcf, sample_id, os.path.realpath(sv_insert_seq), reference, tool_work_dir, tool_work_dir))
variant_vcfs = map(os.path.realpath, in_vcfs)
else:
logger.warn("Not filling in SV sequences since no insert sequence file provided")
open_fds = []
if randvcf_options:
if not sampling_vcf:
logger.error("Need to provide the VCF for random sampling")
raise ValueError("Sampling VCF missing")
rand_vcf_out_fd = open(os.path.join(out_dir, "random.vc.vcf"), "w")
rand_vcf_log_fd = open(os.path.join(log_dir, "RandVCF2VCF.err"), "w")
variant_vcfs.append(os.path.realpath(rand_vcf_out_fd.name))
run_randvcf(os.path.realpath(sampling_vcf), rand_vcf_out_fd, rand_vcf_log_fd, seed, sex, randvcf_options, reference)
open_fds += [rand_vcf_out_fd, rand_vcf_log_fd]
if randdgv_options:
if not sv_insert_seq:
raise ValueError("Need SV sequence file to fill in SV sequences")
if not dgv_file:
logger.error("Need to provide the DGV file for random sampling")
raise ValueError("DGV file missing")
rand_dgv_stdout = open(os.path.join(out_dir, "random.sv.vcf"), "w")
rand_dgv_stderr = open(os.path.join(log_dir, "RandDGV2VCF.err"), "w")
variant_vcfs.append(os.path.realpath(rand_dgv_stdout.name))
run_randdgv(dgv_file, rand_dgv_stdout, rand_dgv_stderr, seed, sex, randdgv_options, reference, sv_insert_seq)
open_fds += [rand_dgv_stdout, rand_dgv_stderr]
processes = monitor_processes(processes)
for open_fd in open_fds:
open_fd.close()
merged_reference = os.path.join(out_dir, "%s.fa" % (sample_id))
merged_truth_vcf = os.path.join(out_dir, "%s.truth.vcf" % (sample_id))
merged_map = os.path.join(out_dir, "%s.map" % (sample_id))
processes = run_vcfstats(variant_vcfs, out_dir, log_dir)
if not disable_vcf2diploid:
logger.info("vcf2diploid started")
vcf2diploid_stdout = open(os.path.join(out_dir, "vcf2diploid.out"), "w")
vcf2diploid_stderr = open(os.path.join(log_dir, "vcf2diploid.err"), "w")
vcf_arg_list = sum([["-vcf", v] for v in variant_vcfs], [])
filter_arg_list = ["-pass"] if remove_filtered else []
vcf2diploid_command = ["java", utils.JAVA_XMX, "-jar", VARSIMJAR, "vcf2diploid",
"-t", sex,
"-id", sample_id,
"-chr", os.path.realpath(reference)] + filter_arg_list + vcf_arg_list + ["-no_contig_id"]
logger.info("Executing command " + " ".join(vcf2diploid_command))
subprocess.check_call(vcf2diploid_command, stdout=vcf2diploid_stdout, stderr=vcf2diploid_stderr,
cwd=out_dir)
processes = monitor_processes(processes)
# Now concatenate the .fa from vcf2diploid
contigs = get_contigs_list(reference)
contig_fastas = map(lambda (x, y): os.path.join(out_dir, "%s_%s_%s.fa" % (x, sample_id, y)), itertools.product(contigs, ["maternal", "paternal"]))
fastas_to_cat = filter(os.path.isfile, contig_fastas)
concatenate_files(fastas_to_cat, merged_reference, remove_original=True)
if os.path.getsize(merged_reference) == 0:
logger.error("Merged FASTA is empty. Something bad happened. Exiting")
raise RuntimeError("Empty FASTA generated by vcf2diploid")
# contatenate the vcfs
vcfs_to_cat = filter(os.path.isfile, map(lambda x: os.path.join(out_dir, "%s_%s.vcf" % (x, sample_id)), contigs))
concatenate_files(vcfs_to_cat, merged_truth_vcf, header_str="#", simple_cat=False, remove_original=True)
monitor_processes(run_vcfstats([merged_truth_vcf], out_dir, log_dir))
logger.info("vcf2diploid done")
if lift_ref:
lifted_dir = os.path.join(out_dir, "lifted")
makedirs([lifted_dir])
#quick fix for issue of CN
convertCN([merged_truth_vcf], "two2one")
merged_truth_vcf = lift_vcfs([merged_truth_vcf], os.path.join(lifted_dir, "truth.vcf"), None, tabix_index=False)
#quick fix for issue of CN
convertCN([merged_truth_vcf], "one2two")
pysam.tabix_index(merged_truth_vcf, force=True, preset='vcf')
merged_map = lift_maps([merged_map], os.path.join(lifted_dir, "truth.map"))
if processes:
processes = monitor_processes(processes)
# Now generate the reads using art/pbsim/dwgsim
tmp_files = []
if simulator:
fifos = []
fastqs = []
sim_ts = time.time()
coverage_per_lane = total_coverage * 0.5 / nlanes
processes = []
fifo_src_dst = []
if simulator == "dwgsim":
for i, end in itertools.product(xrange(nlanes), [1, 2]):
fifo_src_dst.append(
("simulated.lane%d.read%d.fastq" % (i, end),
"simulated.lane%d.read%d.fq.gz" % (i, end)))
elif simulator == "art":
for i, end, suffix in itertools.product(xrange(nlanes), [1, 2], ["fq", "aln"]):
fifo_src_dst.append(("simulated.lane%d.read%d.%s" % (i, end, suffix),
"simulated.lane%d.read%d.%s.gz" % (i, end, suffix)))
else: # simulator == "longislnd":
pass
for fifo_name, dst in fifo_src_dst:
fifos.append(os.path.join(out_dir, fifo_name))
if os.path.exists(fifos[-1]): os.remove(fifos[-1])
os.mkfifo(fifos[-1])
gzip_stderr = open(os.path.join(log_dir, "gzip.%s" % (fifo_name)), "w")
gzip_command = "cat %s | gzip -2 > %s" % (fifos[-1], os.path.join(out_dir, dst))
logger.info("Executing command %s" % (gzip_command) )
gzip_p = subprocess.Popen(gzip_command, stdout = None, stderr = gzip_stderr, shell = True)
logger.info( " with pid " + str(gzip_p.pid))
processes.append(gzip_p)
tmp_files.append(os.path.join(out_dir, dst))
simulator_commands_files = []
if simulator == "dwgsim":
for i in xrange(nlanes):
simulator_command = "{} {} -C {} -z {} {} {}".format(os.path.realpath(simulator_exe), simulator_options, coverage_per_lane, seed + i, merged_reference, os.path.join(out_dir, "simulated.lane%d" % (i)))
simulator_commands_files.append((simulator_command, os.path.join(log_dir, "dwgsim.lane%d.out" % (i)), os.path.join(log_dir, "dwgsim.lane%d.err" % (i))))
elif simulator == "art":
for i in xrange(nlanes):
simulator_command = "{} {} -i {} -f {} -rs {} -o {}".format(simulator_exe, simulator_options, merged_reference, coverage_per_lane, seed + i, os.path.join(out_dir, "simulated.lane%d.read" % (i)))
simulator_commands_files.append((simulator_command, os.path.join(log_dir, "art.lane%d.out" % (i)), os.path.join(log_dir, "art.lane%d.err" % (i))))
else: # simulator == "longislnd":
simulator_command = "{} {} --coverage {} --out {} --fasta {}".format(simulator_exe, simulator_options, total_coverage * 0.5, os.path.join(out_dir, "longislnd_sim"), merged_reference)
simulator_commands_files.append((simulator_command, os.path.join(log_dir, "longislnd.out"), os.path.join(log_dir, "longislnd.err")))
simulator_fds = []
for command, stdout, stderr in simulator_commands_files:
stdout_fd = open(stdout, "w")
stderr_fd = open(stderr, "w")
process = subprocess.Popen(command, stdout=stdout_fd, stderr=stderr_fd, shell=True, close_fds=True)
logger.info("Executing command {} with pid {}".format(command, process.pid))
processes.append(process)
simulator_fds += [stdout_fd, stderr_fd]
monitor_processes(processes)
for fd in simulator_fds:
fd.close()
processes = []
logger.info("Read generation took %g seconds" % (time.time() - sim_ts))
sim_t_liftover = time.time()
# Now start lifting over the gzipped files
if simulator != "longislnd":
for i in xrange(nlanes):
liftover_stdout = open(os.path.join(log_dir, "lane%d.out" % (i)), "w")
liftover_stderr = open(os.path.join(log_dir, "liftover%d.log" % (i)), "w")
fastq_liftover_command = "java -server %s -jar %s fastq_liftover -map %s -id %d " \
"-fastq <(gunzip -c %s/simulated.lane%d.read1.fq.gz) " \
"-fastq <(gunzip -c %s/simulated.lane%d.read2.fq.gz) " \
"-out >(gzip -1 > %s/lane%d.read1.fq.gz) " \
"-out >(gzip -1 > %s/lane%d.read2.fq.gz)" % (
utils.JAVA_XMX,
VARSIMJAR, merged_map, i, out_dir, i,
out_dir, i, out_dir, i,
out_dir, i)
if force_five_base_encoding:
fastq_liftover_command += " -force_five_base_encoding "
if simulator == "art":
fastq_liftover_command += " -type art " \
"-aln <(gunzip -c %s/simulated.lane%d.read1.aln.gz) " \
"-aln <(gunzip -c %s/simulated.lane%d.read2.aln.gz)" % (
out_dir, i, out_dir, i)
elif simulator == "pbsim":
fastq_liftover_command += " -type pbsim " \
"-maf <(gunzip -c %s/simulated.lane%d.read1.maf.gz) " \
"-ref %s/simulated.lane%d.ref " % (out_dir, i, out_dir, i)
fastq_liftover_command = "bash -c \"%s\"" % (fastq_liftover_command)
logger.info("Executing command " + fastq_liftover_command)
subprocess.check_call(fastq_liftover_command, stdout = liftover_stdout, stderr = liftover_stderr, shell = True)
fastqs.append(os.path.join(out_dir, "lane%d.read%d.fq.gz" % (i, end)))
else:
# liftover the read map files
read_map_files = list(glob.glob(os.path.join(out_dir, "longislnd_sim", "*.bed")))
merged_raw_readmap = os.path.join(out_dir, "longislnd_sim", "merged_readmap.bed")
concatenate_files(read_map_files, merged_raw_readmap)
read_maps = "-longislnd %s" % merged_raw_readmap
read_map_liftover_command = "java %s -server -jar %s longislnd_liftover " % (utils.JAVA_XMX, VARSIMJAR) + read_maps + " -map %s " % merged_map + " -out %s" % (os.path.join(out_dir, sample_id + ".truth.map"))
read_map_liftover_stderr = open(os.path.join(log_dir, "longislnd_liftover.err"), "w")
logger.info("Executing command " + read_map_liftover_command )
subprocess.check_call(read_map_liftover_command, stdout = None, stderr = read_map_liftover_stderr, shell = True)
monitor_processes(processes)
logger.info("Liftover took %g seconds" % (time.time() - sim_t_liftover))
sim_te = max(sim_ts + 1, time.time())
bytes_written = sum([os.path.getsize(fastq) for fastq in fastqs])
logger.info("Took %g seconds, %ld Mbytes written, %g MB/s" % (
sim_te - sim_ts, bytes_written / 1024.0 / 1024.0, bytes_written / 1024.0 / 1024.0 / (sim_te - sim_ts)))
for fifo in fifos:
os.remove(fifo)
if not keep_temp:
logger.info("Cleaning up intermediate files")
for f in tmp_files:
os.remove(f)
logger.info("Done! (%g hours)" % ((time.time() - t_s) / 3600.0))
if __name__ == "__main__":
check_java()
main_parser = argparse.ArgumentParser(description="VarSim: A high-fidelity simulation validation framework",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
main_parser.add_argument("--out_dir", metavar="DIR",
help="Output directory for the simulated genome, reads and variants", required=False,
default="out")
main_parser.add_argument("--work_dir", metavar="DIR", help="Work directory, currently not used", required=False,
default="work")
main_parser.add_argument("--log_dir", metavar="DIR", help="Log files of all steps are kept here", required=False,
default="log")
main_parser.add_argument("--reference", metavar="FASTA", help="Reference genome that variants will be inserted into",
required=True)
main_parser.add_argument("--seed", metavar="seed", help="Random number seed for reproducibility", type=int, default=0)
main_parser.add_argument("--sex", metavar="Sex", help="Sex of the person (MALE/FEMALE)", required=False, type=str,
choices=["MALE", "FEMALE"], default="MALE")
main_parser.add_argument("--id", metavar="ID", help="Sample ID to be put in output VCF file", required=True)
main_parser.add_argument("--simulator", metavar="SIMULATOR", help="Read simulator to use", required=False, type=str,
choices=["art", "dwgsim", "longislnd"], default="art")
main_parser.add_argument("--simulator_executable", metavar="PATH",
help="Path to the executable of the read simulator chosen"
, required=True)
main_parser.add_argument("--varsim_jar", metavar="PATH", help="Path to VarSim.jar (deprecated)",
default=VARSIMJAR,
required=False)
main_parser.add_argument("--read_length", metavar="LENGTH", help="Length of read to simulate", default=100, type=int)
main_parser.add_argument("--nlanes", metavar="INTEGER",
help="Number of lanes to generate, coverage will be divided evenly over the lanes. Simulation is parallized over lanes. Each lane will have its own pair of files",
default=1, type=int)
main_parser.add_argument("--total_coverage", metavar="FLOAT", help="Total coverage to simulate", default=1.0,
type=float)
main_parser.add_argument("--mean_fragment_size", metavar="INT", help="Mean fragment size to simulate", default=350,
type=int)
main_parser.add_argument("--sd_fragment_size", metavar="INT", help="Standard deviation of fragment size to simulate",
default=50, type=int)
main_parser.add_argument("--vcfs", metavar="VCF",
help="Addtional list of VCFs to insert into genome, priority is lowest ... highest", nargs="+",
default=[])
main_parser.add_argument("--force_five_base_encoding", action="store_true", help="Force output bases to be only ACTGN")
main_parser.add_argument("--filter", action="store_true", help="Only use PASS variants for simulation")
main_parser.add_argument("--keep_temp", action="store_true", help="Keep temporary files after simulation")
main_parser.add_argument("--lift_ref", action="store_true", help="Liftover chromosome names from restricted reference")
main_parser.add_argument("--java_max_mem", metavar="XMX", help="max java memory", default="10g", type = str)
main_parser.add_argument('--version', action='version', version=get_version())
main_parser.add_argument('--log_to_stderr', action='store_true', help='Output log to stderr instead of log_dir/varsim.log')
main_parser.add_argument("--loglevel", help="Set logging level", choices=["debug", "warn", "info"], default="info")
pipeline_control_group = main_parser.add_argument_group("Pipeline control options. Disable parts of the pipeline.")
pipeline_control_group.add_argument("--disable_rand_vcf", action="store_true",
help="Disable sampling from the provided small variant VCF")
pipeline_control_group.add_argument("--disable_rand_dgv", action="store_true",
help="Disable sampline from the provided DGV file")
pipeline_control_group.add_argument("--disable_vcf2diploid", action="store_true",
help="Disable diploid genome simulation")
pipeline_control_group.add_argument("--disable_sim", action="store_true", help="Disable read simulation")
# RandVCF2VCF seed num_SNP num_INS num_DEL num_MNP num_COMPLEX percent_novel min_length_lim max_length_lim reference_file file.vcf
rand_vcf_group = main_parser.add_argument_group("Small variant simulation options")
rand_vcf_group.add_argument("--vc_num_snp", metavar="INTEGER", help="Number of SNPs to sample from small variant VCF",
default=0, type=int)
rand_vcf_group.add_argument("--vc_num_ins", metavar="INTEGER",
help="Number of insertions to sample from small variant VCF", default=0, type=int)
rand_vcf_group.add_argument("--vc_num_del", metavar="INTEGER",
help="Number of deletions to sample from small variant VCF", default=0, type=int)
rand_vcf_group.add_argument("--vc_num_mnp", metavar="INTEGER", help="Number of MNPs to sample from small variant VCF",
default=0, type=int)
rand_vcf_group.add_argument("--vc_num_complex", metavar="INTEGER",
help="Number of complex variants to sample from small variant VCF", default=0,
type=int)
rand_vcf_group.add_argument("--vc_percent_novel", metavar="FLOAT",
help="Percent variants sampled from small variant VCF that will be moved to novel positions",
default=0, type=float)
rand_vcf_group.add_argument("--vc_min_length_lim", metavar="INTEGER",
help="Min length of small variant to accept [inclusive]", default=0, type=int)
rand_vcf_group.add_argument("--vc_max_length_lim", metavar="INTEGER",
help="Max length of small variant to accept [inclusive]", default=99,
type=int)
rand_vcf_group.add_argument("--vc_in_vcf", metavar="VCF", help="Input small variant VCF, usually dbSNP",
required=False)
rand_vcf_group.add_argument("--vc_prop_het", metavar="FLOAT", help="Proportion of heterozygous small variants",
default=0.6,
type=float)
# RandDGV2VCF seed num_INS num_DEL num_DUP num_INV percent_novel min_length_lim max_length_lim reference_file insert_seq.txt dgv_file.txt
rand_dgv_group = main_parser.add_argument_group("Structural variant simulation options")
rand_dgv_group.add_argument("--sv_num_ins", metavar="INTEGER", help="Number of insertions to sample from DGV",
default=20, type=int)
rand_dgv_group.add_argument("--sv_num_del", metavar="INTEGER", help="Number of deletions to sample from DGV",
default=20, type=int)
rand_dgv_group.add_argument("--sv_num_dup", metavar="INTEGER", help="Number of duplications to sample from DGV",
default=20, type=int)
rand_dgv_group.add_argument("--sv_num_inv", metavar="INTEGER", help="Number of inversions to sample from DGV",
default=20, type=int)
rand_dgv_group.add_argument("--sv_percent_novel", metavar="FLOAT",
help="Percent variants sampled from DGV that will be moved to novel positions", default=0,
type=float)
rand_dgv_group.add_argument("--sv_min_length_lim", metavar="min_length_lim",
help="Min length of structural variant to accept [inclusive]", default=100,
type=int)
rand_dgv_group.add_argument("--sv_max_length_lim", metavar="max_length_lim",
help="Max length of structural variant to accept [inclusive]", default=1000000,
type=int)
rand_dgv_group.add_argument("--sv_insert_seq", metavar="FILE",
help="Path to file containing concatenation of real insertion sequences",
required=False)
rand_dgv_group.add_argument("--sv_dgv", metavar="DGV_FILE", help="DGV file containing structural variants",
required=False)
rand_dgv_group.add_argument("--sv_prop_het", metavar="FLOAT", help="Proportion of heterozygous structural variants",
default=0.6,
type=float)
dwgsim_group = main_parser.add_argument_group("DWGSIM options")
dwgsim_group.add_argument("--dwgsim_start_e", metavar="first_base_error_rate", help="Error rate on the first base",
default=0.0001, type=float)
dwgsim_group.add_argument("--dwgsim_end_e", metavar="last_base_error_rate", help="Error rate on the last base",
default=0.0015, type=float)
dwgsim_group.add_argument("--dwgsim_options", help="DWGSIM command-line options", default="", required=False)
art_group = main_parser.add_argument_group("ART options")
art_group.add_argument("--profile_1", metavar="profile_file1", help="ART error profile for first end", default="")
art_group.add_argument("--profile_2", metavar="profile_file2", help="ART error profile for second end", default="")
art_group.add_argument("--art_options", help="ART command-line options", default="")
pbsim_group = main_parser.add_argument_group("PBSIM options")
pbsim_group.add_argument("--model_qc", metavar="model_qc", help="PBSIM QC model", default=None, type=str)
longislnd_group = main_parser.add_argument_group("LongISLND options")
longislnd_group.add_argument("--longislnd_options", help="LongISLND options", default="")
args = main_parser.parse_args()
utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
makedirs([args.log_dir, args.out_dir])
# Setup logging
FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
loglevel = get_loglevel(args.loglevel)
if not args.log_to_stderr:
logging.basicConfig(filename=os.path.join(args.log_dir, "varsim.log"), filemode="w", level=loglevel, format=FORMAT)
else:
logging.basicConfig(level=loglevel, format=FORMAT)
simulator = None if args.disable_sim else args.simulator
simulator_opts = ""
if args.simulator == "dwgsim":
simulator_opts = "-e {1},{2} -E {1},{2} -d {3} -s {4} -1 {5} -2 {5} {6}".format(args.dwgsim_start_e, args.dwgsim_end_e, args.mean_fragment_size, args.sd_fragment_size, args.read_length, args.dwgsim_options)
elif args.simulator == "art":
profile_opts = "-1 {} -2 {}".format(args.profile_1, args.profile_2) if (args.profile_1 and args.profile_2) else ""
simulator_opts = "-p -l {} -m {} -s {} {} {}".format(args.read_length, args.mean_fragment_size, args.sd_fragment_size, profile_opts, args.art_options)
elif args.simulator == "longislnd":
simulator_opts = args.longislnd_options
elif args.simulator == "pbsim":
raise NotImplementedError("pbsim is no longer supported")
randvcf_options = None if args.disable_rand_vcf else RandVCFOptions(args.vc_num_snp, args.vc_num_ins, args.vc_num_del, args.vc_num_mnp, args.vc_num_complex, args.vc_percent_novel, args.vc_min_length_lim, args.vc_max_length_lim, args.vc_prop_het)
randdgv_options = None if args.disable_rand_dgv else RandDGVOptions(args.sv_num_ins, args.sv_num_del, args.sv_num_dup, args.sv_num_inv, args.sv_percent_novel, args.sv_min_length_lim, args.sv_max_length_lim, args.sv_prop_het)
logger = logging.getLogger()
logger.info(str(args))
varsim_main(args.reference,
simulator,
args.simulator_executable,
args.total_coverage,
variant_vcfs=args.vcfs,
sampling_vcf=args.vc_in_vcf,
dgv_file=args.sv_dgv,
randvcf_options=randvcf_options,
randdgv_options=randdgv_options,
nlanes=args.nlanes,
simulator_options=simulator_opts,
sample_id=args.id,
log_dir=args.log_dir,
out_dir=args.out_dir,
sv_insert_seq=args.sv_insert_seq,
seed=args.seed,
sex=args.sex,
remove_filtered=args.filter,
keep_temp=args.keep_temp,
force_five_base_encoding=args.force_five_base_encoding,
lift_ref=args.lift_ref,
disable_vcf2diploid=args.disable_vcf2diploid)