-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathbio_seq.py
More file actions
148 lines (126 loc) · 5.69 KB
/
bio_seq.py
File metadata and controls
148 lines (126 loc) · 5.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from bio_structs import DNA_Codons, RNA_Codons, NUCLEOTIDE_BASE
from collections import Counter
import random
class bio_seq:
"""DNA sequnece class. Defalt value: ATCG, DNA, No label"""
def __init__(self, seq="ATCG", seq_type="DNA", label='No Label'):
"""Sequence initialization, validation."""
self.seq = seq.upper()
self.label = label
self.seq_type = seq_type
self.is_valid = self.__validate()
assert self.is_valid, f"Provided data does not seem to be a correct {self.seq_type} sequence"
# DNA Toolkit functions:
def __validate(self):
"""Check the sequence to make sure it is a valid DNA string"""
return set(NUCLEOTIDE_BASE[self.seq_type]).issuperset(self.seq)
def get_seq_biotype(self):
"""Returns sequence type"""
return self.seq_type
def get_seq_info(self):
"""Returns 4 strings. Full sequence information"""
return f"[Label]: {self.label}\n[Sequence]: {self.seq}\n[Biotype]: {self.seq_type}\n[Length]: {len(self.seq)}"
def generate_rnd_seq(self, length=10, seq_type="DNA"):
"""Generate a random DNA sequence, provided the length"""
seq = ''.join([random.choice(NUCLEOTIDE_BASE[seq_type])
for x in range(length)])
self.__init__(seq, seq_type, "Randomly generated sequence")
def nucleotide_frequency(self):
"""Count nucleotides in a given sequence. Return a dictionary"""
return dict(Counter(self.seq))
def transcription(self):
"""DNA -> RNA Transcription. Replacing Thymine with Uracil"""
if self.seq_type == "DNA":
return self.seq.replace("T", "U")
return "Not a DNA sequence"
def reverse_complement(self):
"""
Swapping adenine with thymine and guanine with cytosine.
Reversing newly generated string
"""
if self.seq_type == "DNA":
mapping = str.maketrans('ATCG', 'TAGC')
else:
mapping = str.maketrans('AUCG', 'UAGC')
return self.seq.translate(mapping)[::-1]
def gc_content(self):
"""GC Content in a DNA/RNA sequence"""
return round((self.seq.count('C') + self.seq.count('G')) / len(self.seq) * 100)
def gc_content_subsec(self, k=20):
"""GC Content in a DNA/RNA sub-sequence length k. k=20 by default"""
res = []
for i in range(0, len(self.seq) - k + 1, k):
subseq = self.seq[i:i + k]
res.append(
round((subseq.count('C') + subseq.count('G')) / len(subseq) * 100))
return res
def translate_seq(self, init_pos=0):
"""Translates a DNA sequence into an aminoacid sequence"""
if self.seq_type == "DNA":
return [DNA_Codons[self.seq[pos:pos + 3]] for pos in range(init_pos, len(self.seq) - 2, 3)]
elif self.seq_type == "RNA":
return [RNA_Codons[self.seq[pos:pos + 3]] for pos in range(init_pos, len(self.seq) - 2, 3)]
def codon_usage(self, aminoacid):
"""Provides the frequency of each codon encoding a given aminoacid in a DNA sequence"""
tmpList = []
if self.seq_type == "DNA":
for i in range(0, len(self.seq) - 2, 3):
if DNA_Codons[self.seq[i:i + 3]] == aminoacid:
tmpList.append(self.seq[i:i + 3])
elif self.seq_type == "RNA":
for i in range(0, len(self.seq) - 2, 3):
if RNA_Codons[self.seq[i:i + 3]] == aminoacid:
tmpList.append(self.seq[i:i + 3])
freqDict = dict(Counter(tmpList))
totalWight = sum(freqDict.values())
for seq in freqDict:
freqDict[seq] = round(freqDict[seq] / totalWight, 2)
return freqDict
def gen_reading_frames(self):
"""Generate the six reading frames of a DNA sequence, including reverse complement"""
frames = []
frames.append(self.translate_seq(0))
frames.append(self.translate_seq(1))
frames.append(self.translate_seq(2))
tmp_seq = bio_seq(self.reverse_complement(), self.seq_type)
frames.append(tmp_seq.translate_seq(0))
frames.append(tmp_seq.translate_seq(1))
frames.append(tmp_seq.translate_seq(2))
del tmp_seq
return frames
def proteins_from_rf(self, aa_seq):
"""Compute all possible proteins in an aminoacid seq and return a list of possible proteins"""
current_prot = []
proteins = []
for aa in aa_seq:
if aa == "_":
# STOP accumulating amino acids if _ - STOP was found
if current_prot:
for p in current_prot:
proteins.append(p)
current_prot = []
else:
# START accumulating amino acids if M - START was found
if aa == "M":
current_prot.append("")
for i in range(len(current_prot)):
current_prot[i] += aa
return proteins
def all_proteins_from_orfs(self, startReadPos=0, endReadPos=0, ordered=False):
"""Compute all possible proteins for all open reading frames"""
"""Protine Search DB: https://www.ncbi.nlm.nih.gov/nuccore/NM_001185097.2"""
"""API can be used to pull protein info"""
if endReadPos > startReadPos:
tmp_seq = bio_seq(
self.seq[startReadPos: endReadPos], self.seq_type)
rfs = tmp_seq.gen_reading_frames()
else:
rfs = self.gen_reading_frames()
res = []
for rf in rfs:
prots = self.proteins_from_rf(rf)
for p in prots:
res.append(p)
if ordered:
return sorted(res, key=len, reverse=True)
return res