-
Notifications
You must be signed in to change notification settings - Fork 29
/
_emirge.pyx
360 lines (302 loc) · 14.1 KB
/
_emirge.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
"""
_emirge.pyx contains helper cython functions for emirge.py
EMIRGE: Expectation-Maximization Iterative Reconstruction of Genes from the Environment
Copyright (C) 2010-2016 Christopher S. Miller ([email protected])
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
https://github.com/csmiller/EMIRGE
"""
from libc.stdlib cimport malloc, free, atoi
from libc.math cimport M_E, log, pow as c_pow
from cpython.tuple cimport PyTuple_GetItem
import numpy as np
cimport numpy as np
import sys
# from time import time # DEBUG
# for lookup table of qual values
cdef extern from *:
ctypedef double* static_const_double_ptr "static const double*"
cdef extern from "_emirge_C.h":
static_const_double_ptr qual2one_minus_p # lookup tables to avoid too many double calcs.
static_const_double_ptr qual2p_div_3
cpdef int base_alpha2int(char base_ascii):
"""
base2i = {"A":0,"T":1,"C":2,"G":3}
"""
if base_ascii == <int>('A'):
return 0
elif base_ascii == <int>('T'):
return 1
elif base_ascii == <int>('C'):
return 2
elif base_ascii == <int>('G'):
return 3
else:
return 4
cpdef double calc_likelihood_cell(unsigned int seq_i, unsigned int read_i, unsigned int pair_i,
unsigned int pos,
np.ndarray[np.uint8_t, ndim=1] numeric_bases,
np.ndarray[np.uint8_t, ndim=1] qualints,
np.ndarray[np.float_t, ndim=2] probN): # an individual probN numpy matrix
"""
numeric_bases is an array of unsigned char (numpy.ubyte) already converted to 0-4-based index
return single entry for lik_data, keeping main looping in python still
RETURNS: double to place in lik_data
"""
cdef double p = 0.0
cdef double s
cdef int i
cdef int j
for i in range(numeric_bases.shape[0]):
# for i from 0 <= i < numeric_bases.shape[0]: # old syntax, range syntax above fast if i is cdef-ed.
s = 0.0
for j in range(4):
if numeric_bases[i] == j: # this is called base, set 1-P
# s += (1. - error_p_i) * probN[pos + i, j]
s += ( qual2one_minus_p[qualints[i]] * probN[pos + i, j] ) # lookup table
else: # this is not the called base, so set to P/3
# s += (error_p_i / 3 ) * probN[pos + i, j]
s += ( qual2p_div_3[qualints[i]] * probN[pos + i, j] ) # lookup table
p += log(s)
# do at least this product in log space (0.94005726833168002 vs. 0.94005726833167991)
# likelihood[seq_i, read_i] = e**(np_log(prob_b_i).sum())
return c_pow(M_E, p)
def _calc_likelihood(np.ndarray[np.int_t, ndim=2] bamfile_data,
list reads,
list quals,
list probN,
np.ndarray[np.uint_t, ndim=1] lik_row_seqi,
np.ndarray[np.uint_t, ndim=1] lik_col_readi,
np.ndarray[np.float_t, ndim=1] lik_data):
"""
do looping here in Cython
return values are placed in the arrays passed as the last 3 args.
"""
cdef int alignedread_i
cdef int seq_i
cdef int read_i
cdef int pair_i
cdef int rlen
cdef int pos
cdef double p
cdef double s
cdef int i
cdef int j
cdef np.ndarray[np.uint8_t, ndim=1] numeric_bases
cdef np.ndarray[np.uint8_t, ndim=1] qualints
cdef np.ndarray[np.float_t, ndim=2] probN_single # an individual probN numpy matrix
for alignedread_i in range(bamfile_data.shape[0]):
seq_i, read_i, pair_i, rlen, pos = bamfile_data[alignedread_i]
lik_row_seqi[alignedread_i] = seq_i
lik_col_readi[alignedread_i] = read_i
numeric_bases = reads[read_i]
qualints = quals[read_i]
probN_single = probN[seq_i]
p = 0.0
for i in range(rlen): # numeric_bases.shape[0]):
s = 0.0
for j in range(4):
if numeric_bases[i] == j: # this is called base, set 1-P
# s += (1. - error_p_i) * probN[pos + i, j]
s += ( qual2one_minus_p[qualints[i]] * probN_single[pos + i, j] ) # lookup table
else: # this is not the called base, so set to P/3
# s += (error_p_i / 3 ) * probN[pos + i, j]
s += ( qual2p_div_3[qualints[i]] * probN_single[pos + i, j] ) # lookup table
p += log(s)
lik_data[alignedread_i] = c_pow(M_E, p)
return
def calc_probN_read(bint initial_iteration,
unsigned int seq_i, unsigned int read_i,
unsigned int pos,
np.ndarray[np.float_t, ndim=1] priors,
posteriors, # pass as python object for now
np.ndarray[np.uint8_t, ndim=1] numeric_bases,
np.ndarray[np.uint8_t, ndim=1] qualints,
np.ndarray[np.float_t, ndim=2] probN): # an individual probN numpy matrix. Passed by ref?
"""
calculates effect of a single read on probN
this is inside loop over bamfile_data
"""
cdef int i
cdef int j
cdef double weight
## posteriors is a sparse matrix, so this presents a little bit of a tricky situation until
## I can think about how to pass this. Have to choose correct data struct, manip internals?
## for now, just keep as python obj.
if initial_iteration:
weight = priors[seq_i]
else:
if seq_i < posteriors.shape[0] and read_i < posteriors.shape[1]:
weight = posteriors[seq_i, read_i]
else:
weight = priors[seq_i]
for i in range(numeric_bases.shape[0]):
# for i from 0 <= i < numeric_bases.shape[0]: # old syntax, range syntax above fast if i is cdef-ed.
s = 0.0
for j in range(4):
if numeric_bases[i] == j: # this is called base, add (1-P) * weight
probN[pos + i, j] += qual2one_minus_p[qualints[i]] * weight
else: # this is not the called base, so add P/3 * weight
probN[pos + i, j] += qual2p_div_3[qualints[i]] * weight
return
def _calc_probN(np.ndarray[np.int_t, ndim=2] bamfile_data,
bint initial_iteration,
np.ndarray[np.float_t, ndim=1] priors,
posteriors, # pass as python object for now
list numeric_bases,
list qualints,
list probN): # list of probN numpy matrices
"""
calculates effect of *all* reads on probN
entire loop over bamfile_data.
ON RETURN: probN is modified for this iteration.
## posteriors is a sparse matrix, so this presents a little bit of a tricky situation until
## I can think about how to pass this. Have to choose correct data struct, manip internals?
## for now, just keep as python obj.
"""
cdef int alignedread_i
cdef int seq_i
cdef int read_i
cdef int pair_i
cdef int rlen
cdef int pos
cdef int i
cdef int j
cdef double weight
cdef np.ndarray[np.uint8_t, ndim=1] numeric_bases_single # single numeric bases numpy array -- per read
cdef np.ndarray[np.uint8_t, ndim=1] qualints_single # # single quals numpy array -- per read
cdef np.ndarray[np.float_t, ndim=2] probN_single # an individual probN numpy matrix.
cdef int posteriors_shape_0 = 0
cdef int posteriors_shape_1 = 0
if posteriors is not None:
posteriors_shape_0 = posteriors.shape[0]
posteriors_shape_1 = posteriors.shape[1]
for alignedread_i in range(bamfile_data.shape[0]):
seq_i, read_i, pair_i, rlen, pos = bamfile_data[alignedread_i]
# about 1/3 of time is spent doing this weight step
if initial_iteration:
weight = priors[seq_i]
else:
if seq_i < posteriors_shape_0 and read_i < posteriors_shape_1:
weight = posteriors[seq_i, read_i]
else:
weight = priors[seq_i]
numeric_bases_single = numeric_bases[read_i]
qualints_single = qualints[read_i]
probN_single = probN[seq_i]
for i in range(numeric_bases_single.shape[0]): # for i in range(rlen) # ????
s = 0.0
for j in range(4):
if numeric_bases_single[i] == j: # this is called base, add (1-P) * weight
probN_single[pos + i, j] += qual2one_minus_p[qualints_single[i]] * weight
else: # this is not the called base, so add P/3 * weight
probN_single[pos + i, j] += qual2p_div_3[qualints_single[i]] * weight
return
def count_cigar_aln(char* query_seq, char* hit_seq,
np.ndarray[np.uint8_t, ndim=1] query_unmapped_bases,
np.ndarray[np.uint8_t, ndim=1] hit_unmapped_bases,
unsigned int query_i, # query_start -- 0-based
unsigned int hit_i, # hit_start -- 0-based
alncode_list):
"""
alncode list comes from a cigar string (see call in emirge_amplicon.py)
returns number of aligned columns and number of matches in those aligned columns as tuple
only includes columns where both sequences have mapped bases
"""
# cdef unsigned int query_i = query_start # 0 # for global alignment, this used to always be zero.
# cdef unsigned int hit_i = hit_start # 0
cdef int matches = 0
cdef int aln_columns = 0
cdef int count
# cdef char aln_code # doesn't work. Expects integer
cdef int i
cdef int j
for i in range(len(alncode_list)):
count_a, aln_code = alncode_list[i]
# print >> sys.stderr, "***", count_a, aln_code, len(query_seq), len(query_unmapped_bases), len(hit_seq), len(hit_unmapped_bases)
if not len(count_a):
count = 1 # "As a special case, if n=1 then n is omitted"
else:
count = int(count_a)
if aln_code == 'M': # match (letter-letter column; could be mismatch)
for j in range(count):
# print >> sys.stderr, "***", query_i+j, hit_i+j, query_seq[query_i+j], hit_seq[hit_i+j]
if query_unmapped_bases[query_i+j] == 0:
if hit_unmapped_bases[hit_i+j] == 0:
aln_columns += 1
if query_seq[query_i+j] == hit_seq[hit_i+j]:
matches += 1
query_i += count
hit_i += count
elif aln_code == 'I':
query_i += count
elif aln_code == 'D':
hit_i += count
else:
raise ValueError, "unknown alignment code: '%s'"%aln_code
return aln_columns, matches
def process_bamfile_predata(int seq_i, int read_i,
list predata, tuple samfile_references,
dict sequence_name2sequence_i, dict sequence_i2sequence_name,
dict read_name2read_i, dict read_i2read_name,
list reads, list quals, list readlengths,
list coverage, int ascii_offset,
np.ndarray[np.int_t, ndim=2] bamfile_data):
"""
returns updated:
(seq_i, read_i)
"""
cdef int alignedread_i
cdef int pos
cdef int tid
cdef int seq_i_to_cache
cdef int pair_i
cdef int read_i_to_cache
for alignedread_i in range(len(predata)):
pos, tid, pair_i, qname, qual, seq = predata[alignedread_i]
# refname = PyTuple_GetItem(samfile_references, tid) # Cannot convert 'PyObject *' to Python object
refname = samfile_references[tid]
seq_i_to_cache = sequence_name2sequence_i.get(refname, seq_i)
if refname not in sequence_name2sequence_i: # new sequence we haven't seen before
sequence_name2sequence_i[refname] = seq_i
sequence_i2sequence_name[seq_i] = refname
coverage.append(0)
seq_i += 1
readname = "%s/%d"%(qname, pair_i+1)
read_i_to_cache = read_name2read_i.get(readname, read_i)
if readname not in read_name2read_i: # new read we haven't seen before
read_name2read_i[readname] = read_i
read_i2read_name[read_i] = readname
read_i += 1
# add to self.reads and self.quals and self.readlengths
readlengths.append(len(seq))
reads.append(seq_alpha2int(seq, len(seq)))
quals.append(quals_alpha2int(qual, len(qual), ascii_offset))
# reads.append(array([base_alpha2int(x) for x in fromstring(seq, dtype=uint8)], dtype=uint8))
# quals.append(np.fromstring(qual, dtype=np.uint8) - ascii_offset)
coverage[seq_i_to_cache] += len(seq)
bamfile_data[alignedread_i] = [seq_i_to_cache, read_i_to_cache, pair_i, len(seq), pos]
return seq_i, read_i
cdef np.ndarray[np.uint8_t, ndim=1] seq_alpha2int(char* seq, int seqlen):
cdef int i
np_as_ints = np.empty(seqlen, dtype=np.uint8)
for i in range(seqlen):
np_as_ints[i] = base_alpha2int(seq[i])
return np_as_ints
cdef np.ndarray[np.uint8_t, ndim=1] quals_alpha2int(char* qual, int quallen, int ascii_offset):
cdef int i
np_as_ints = np.empty(quallen, dtype=np.uint8)
for i in range(quallen):
np_as_ints[i] = qual[i] - ascii_offset
return np_as_ints
# def adjust_posteriors_for_split(posteriors, int seq_i, double major_fraction_avg):
# raise NotImplementedError
# return minor_data, minor_row, minor_col