-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathvcf_snp_to_fasta.py
180 lines (154 loc) · 6.01 KB
/
vcf_snp_to_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env python3
"""Produce FASTA files from SNP-only VCFs.

Note that this is a piece of legacy code from the Broad's fungal group, and is
provided as is, without optimizing performance, code style, documentation,
etc. Only minimal changes were introduced to ensure compatibility with the
funpipe package.
"""
import sys
import re
import argparse
from funpipe import vcftools
import gzip
# --- Command-line interface ----------------------------------------------
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument(
    'infile',
    help='a txt file where each line is a path to a vcf', type=str)
arg_parser.add_argument(
    '--max_amb_samples',
    help=('maximum number of samples with ambiguous calls for a site to be '
          'included'), type=int, default=10)
cli = arg_parser.parse_args()
infile = cli.infile
max_amb = cli.max_amb_samples  # the legacy code defaulted this to 100000

# --- Accumulators shared by the processing and output loops below --------
ref_bases = {}              # ref_bases[chrom][pos] -> reference base
alt_bases = {}              # alt_bases[sample][chrom][pos] -> alt base or 'N'
passed_snp_positions = {}   # passed_snp_positions[chrom][pos] -> True
sample_list = []            # all samples, in the order encountered
amb_pos_counts = {}         # amb_pos_counts[chrom][pos] -> ambiguous-call count
sample_translations = {}    # file name -> 'SAMPLE' placeholder mapping
line_number = 1             # 1-based index of the current VCF in `infile`
site_in_span_dels = 0       # sites skipped for overlapping spanning deletions
# Process each VCF listed in `infile`, accumulating:
#   alt_bases[sample][chrom][pos]    -> called alt base ('N' when ambiguous)
#   ref_bases[chrom][pos]            -> reference base at recorded SNP sites
#   passed_snp_positions[chrom][pos] -> True where some sample has a passing,
#                                       non-'N' SNP call
#   amb_pos_counts[chrom][pos]       -> number of ambiguous calls at pos
with open(infile) as fof:
    for vcf_name in fof:
        # Strip the trailing newline BEFORE inspecting the extension.
        # (Bug fix: the original tested vcf_name[-2:] on the raw line, so a
        # newline-terminated 'foo.vcf.gz' ended in 'z\n' and compressed
        # files were never detected.)
        vcf_name = vcf_name.rstrip()
        if vcf_name.endswith('gz'):
            raise ValueError((
                "This legacy code does not support compressed VCFs. Please "
                "decompress all your bgzipped VCFs and re-run this "
                "analysis."))
        sys.stderr.write("Searching " + vcf_name + "\n")
        header = vcftools.VcfHeader(vcf_name)
        caller = header.get_caller()
        samples = header.get_samples()
        contigs = header.get_contigs()
        if line_number == 1:
            # The first VCF defines the contig universe for the
            # bookkeeping dicts used by every subsequent VCF.
            for contig in contigs:
                passed_snp_positions[contig] = {}
                amb_pos_counts[contig] = {}
        if samples == ['SAMPLE']:
            # Placeholder sample name: use the file name instead, and
            # remember the mapping so genotype lookups still use 'SAMPLE'.
            samples = [vcf_name]
            sample_translations[vcf_name] = 'SAMPLE'
            sys.stderr.write("No sample name in " + vcf_name +
                             ", using file name.\n")
        per_file_sample_list = []
        for sample in samples:
            sample_list.append(sample)
            per_file_sample_list.append(sample)
        with open(vcf_name) as vcffh:
            for vcf_line in vcffh:
                if vcf_line[0] == '#':
                    continue
                record = vcftools.VcfRecord(vcf_line)
                pass_or_fail = record.is_passing(caller)
                # Skip sites inside spanning deletions ('*' alt allele).
                if '*' in record.get_alt_field():
                    site_in_span_dels += 1
                    continue
                for sample in per_file_sample_list:
                    # Narrowed from a bare `except: pass`: only a missing
                    # translation should fall back to the sample name.
                    translation = sample_translations.get(sample, sample)
                    genotype = record.get_genotype(
                        index=header.get_sample_index(translation),
                        min_gq=0)
                    variant_type = record.get_variant_type(caller, genotype)
                    if pass_or_fail and not variant_type:
                        # Passing reference call: nothing to record.
                        pass
                    elif pass_or_fail and variant_type == 'SNP':
                        chrom = record.get_chrom()
                        pos = int(record.get_pos())
                        sample_calls = alt_bases.setdefault(sample, {})
                        sample_calls.setdefault(chrom, {})[pos] = \
                            record.get_alt(genotype)
                        ref_bases.setdefault(chrom, {})[pos] = \
                            record.get_ref()
                        if sample_calls[chrom][pos] != 'N':
                            passed_snp_positions[chrom][pos] = True
                    else:
                        # Failing site, or passing non-SNP variant: record
                        # an ambiguous call and count it toward masking.
                        chrom = record.get_chrom()
                        pos = int(record.get_pos())
                        alt_bases.setdefault(sample, {}) \
                            .setdefault(chrom, {})[pos] = 'N'
                        amb_pos_counts[chrom][pos] = \
                            amb_pos_counts[chrom].get(pos, 0) + 1
        line_number += 1
sys.stderr.write(
    str(site_in_span_dels)
    + " sites filtered due to locating in spanning deletions.\n")
# Emit the reference sequence over every retained SNP column, wrapped at
# 60 bases per line. A column is dropped only when it was ambiguous in
# more than `max_amb` samples.
sorted_chrs = sorted(passed_snp_positions.keys())
print(">reference")
ref_chunks = []
for chr in sorted_chrs:
    for pos in sorted(passed_snp_positions[chr]):
        amb_count = amb_pos_counts[chr].get(pos)
        if amb_count is None or amb_count <= max_amb:
            ref_chunks.append(ref_bases[chr][pos])
sequence = ''.join(ref_chunks)
for offset in range(0, len(sequence), 60):
    print(sequence[offset:offset + 60])
# Emit one FASTA record per sample over the same retained columns as the
# reference. For each column the sample's own call is used when present,
# otherwise the reference base; columns ambiguous in more than `max_amb`
# samples are masked (skipped) and counted.
# (Fix: the original wrapped the lookups in duplicated bare `except:`
# blocks, which silently swallowed ANY exception; `.get()` fallbacks
# reproduce the intended missing-key behavior without hiding real errors.)
num_masked = 0
for sample in sample_list:
    print(">" + sample)
    sample_calls = alt_bases.get(sample, {})
    chunks = []
    for chr in sorted_chrs:
        calls_on_chr = sample_calls.get(chr, {})
        for pos in sorted(passed_snp_positions[chr]):
            if pos in amb_pos_counts[chr] \
                    and amb_pos_counts[chr][pos] > max_amb:
                # Too many ambiguous calls across samples: mask column.
                num_masked += 1
                continue
            # Sample call if present, else the reference base (always
            # recorded for every passed SNP position).
            chunks.append(calls_on_chr.get(pos, ref_bases[chr][pos]))
    sequence = ''.join(chunks)
    for i in range(0, len(sequence), 60):
        print(sequence[i:i + 60])
sys.stderr.write(str(num_masked)
                 + " sites filtered due to high missingness.\n")