#!/usr/bin/env python
# GFP.py -- graphical foot-print (GFP) of DNA sequences.
import math
import pickle
import Bio
from Bio import SeqIO
# -----------------------------------------------
# Count the number of occurrences of each nucleotide
# -----------------------------------------------
def CountNucleotide(sequence):
    """Count the occurrences of the symbols A, T, G, C and N in *sequence*.

    Args:
        sequence: Object exposing ``count(substring)``, e.g. a ``str``.

    Returns:
        dict with exactly the keys 'A', 'T', 'G', 'C', 'N' (in that order),
        each mapped to the value of ``sequence.count(key)``. Matching is
        case-sensitive; any other symbol in *sequence* is ignored.
    """
    # count() never raises KeyError, so no exception handling is needed here.
    return {symbol: sequence.count(symbol) for symbol in 'ATGCN'}
# Count the number of occurrences of each dinucleotide
def CountDinucleotide(sequence):
    """Count the occurrences of each of the 16 A/T/G/C dinucleotides.

    Args:
        sequence: Object exposing ``count(substring)``, e.g. a ``str``.

    Returns:
        dict keyed by the 16 two-letter combinations of 'A', 'T', 'G', 'C'
        in the order 'AA', 'AT', 'AG', 'AC', 'TA', ..., 'CC', each mapped to
        ``sequence.count(key)``.

    NOTE(review): for ``str`` input, ``count`` reports NON-overlapping
    matches, so "AAA" yields 1 for 'AA', not 2. This is kept as-is because
    downstream results depend on it -- confirm that this is intended.
    """
    bases = 'ATGC'
    # Nested iteration reproduces the key order of the hand-written list.
    return {
        first + second: sequence.count(first + second)
        for first in bases
        for second in bases
    }
#-----------------------------------------------
# Count the number of occurrences of each codon
#-----------------------------------------------
def CountCodon(sequence):
    """Count the occurrences of each of the 64 A/T/G/C triplets (codons).

    Args:
        sequence: Object exposing ``count(substring)``, e.g. a ``str``.

    Returns:
        dict keyed by the 64 three-letter combinations of 'A', 'T', 'G', 'C'
        in the order 'AAA', 'AAT', 'AAG', 'AAC', 'ATA', ..., 'CCC', each
        mapped to ``sequence.count(key)``.

    NOTE(review): the triplets are searched anywhere in *sequence* (no
    reading frame is applied), and for ``str`` input ``count`` reports
    NON-overlapping matches ("AAAAA" yields 1 for 'AAA'). This is kept
    as-is because downstream results depend on it.
    """
    bases = 'ATGC'
    # Nested iteration reproduces the key order of the hand-written list.
    return {
        first + second + third: sequence.count(first + second + third)
        for first in bases
        for second in bases
        for third in bases
    }
# -----------------------------------------------
# Compute entropy
#------------------------------------------------
def ComputeEntropy(sequence):
    """Return one entropy-like weight per position of *sequence*.

    For every position ``i >= 2`` the weight is
    ``-log2(count(triplet) / count(pair))``, where *triplet* is
    ``sequence[i-2:i+1]`` and *pair* is ``sequence[i-1:i+1]``; the counts come
    from ``CountCodon`` and ``CountDinucleotide``. The first two positions
    have no complete triplet and get the fixed weight ``1``.

    Args:
        sequence: Sliceable sequence of nucleotide symbols, e.g. a ``str``.

    Returns:
        list with ``len(sequence)`` entries: the int ``1`` for the first two
        positions and a float for every later position.

    Raises:
        KeyError: if a triplet or pair that is looked up contains a symbol
            other than 'A', 'T', 'G', 'C' (only those combinations are
            counted by the helper functions).
    """
    seq_len = len(sequence)
    if seq_len < 3:
        # No position has two predecessors, so the k-mer counts would never
        # be read; skip the counting passes over the sequence entirely.
        return [1] * seq_len

    dinuc_count = CountDinucleotide(sequence)
    codon_count = CountCodon(sequence)

    entropy = [1, 1]
    for end in range(3, seq_len + 1):
        cdn = sequence[end - 3:end]  # triplet ending at position end - 1
        din = sequence[end - 2:end]  # pair ending at the same position
        entropy.append(-math.log2(codon_count[cdn] / dinuc_count[din]))
    return entropy
#---------------------------------------------------------------------------------------
'''
GRAPHICAL FOOT-PRINT OF DNA
This function (proposed by JM) reads the sequence and moves a pointer:
the pointer moves RIGHT -- if it reads 'G'
the pointer moves LEFT -- if it reads 'A'
the pointer moves UP -- if it reads 'C'
the pointer moves DOWN -- if it reads 'T'
'''
#---------------------------------------------------------------------------------------
def GraphicalFootPrint(sequence, case):
    """Build the graphical foot-print (a 2-D walk) of a DNA sequence.

    Ambiguity codes are stripped from the sequence first. Then, starting at
    the origin, each remaining A/C/G/T symbol moves the pointer by a step
    built from the entropy weight of its position (``ComputeEntropy``) and
    the overall frequency ("ratio") of that base in the stripped sequence.
    *case* selects which base plays which of the four roles:

        role 1:  x += entropy,  y += ratio
        role 2:  x -= entropy,  y -= ratio
        role 3:  x -= ratio,    y += entropy
        role 4:  x += ratio,    y -= entropy

        case '1' (modified case-1): G, A, C, T
        case '2' (modified case-2): C, G, T, A
        case '3' (modified case-3): A, C, T, G

    Args:
        sequence: Nucleotide sequence; converted with ``str()``.
        case: '1', '2' or '3' (the integers 1, 2, 3 are accepted too).

    Returns:
        list of ``[x, y]`` points. The first point is always ``[0, 0]``; one
        point is appended for every A/C/G/T symbol. Only the origin is
        returned when *case* is not recognised or when nothing is left of
        the sequence after stripping.

    Raises:
        KeyError: propagated from ``ComputeEntropy`` if the stripped
            sequence still contains symbols other than A/C/G/T (for example
            lower-case letters) inside a triplet.
    """
    # Remove every IUPAC ambiguity code in a single pass. 'B' and 'V' belong
    # to that alphabet as well and would otherwise reach ComputeEntropy.
    sequence = str(sequence).translate(str.maketrans('', '', 'NYKRWMSDHBV'))
    seq_len = len(sequence)

    # For each case: the bases taking role 1, 2, 3 and 4, in that order.
    role_order = {'1': 'GACT', '2': 'CGTA', '3': 'ACTG'}.get(str(case))

    gfp = [[0, 0]]
    if role_order is None or seq_len == 0:
        # Unknown case: nothing to draw. Empty sequence: the base ratios
        # would divide by zero, and there are no steps to take anyway.
        return gfp

    nu_count = CountNucleotide(sequence)
    ratio = {base: nu_count[base] * 1.0 / seq_len for base in role_order}
    entropy = ComputeEntropy(sequence)

    role_1, role_2, role_3, role_4 = role_order
    x, y = 0, 0
    for base, weight in zip(sequence, entropy):
        if base == role_1:
            x += weight
            y += ratio[base]
        elif base == role_2:
            x -= weight
            y -= ratio[base]
        elif base == role_3:
            x -= ratio[base]
            y += weight
        elif base == role_4:
            x += ratio[base]
            y -= weight
        else:
            # Any other symbol does not move the pointer and adds no point.
            continue
        gfp.append([x, y])
    return gfp