-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstring_search_bensprung4.py
197 lines (178 loc) · 7.95 KB
/
string_search_bensprung4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/python
# Fall 2014
# Intro to Computational Biology
# HW 2
# Ben Sprung
# Looks for a pattern p against a string s using (1) a naive search, and
# (2) the bad character rule, (3) the good suffix rule
# One string s is read from the first line of a file called "database_ben.txt"
# One or more patterns p are read from a file called "pattern_ben.txt". Each
# line is used as a separate p
import sys
from time import sleep
# animation of search
animation_speed = 0.15
# open files, read the database string s and search patterns
with open("database_ben.txt") as f:
s = f.readline().rstrip()
patterns = []
with open("pattern_ben.txt") as g:
for line in g:
patterns.append(line.rstrip())
alphabet = ["A","C","G","T"]
#alphabet = [chr(i) for i in range(32,127)] #most standard characters
def naive_search(pattern, string):
p, s = pattern, string
total_shifts = 0
for i in range(len(s) - len(p) + 1) : #need +1 for last position
#animation:
sys.stdout.write(i * " " + p + "\r")
sleep(animation_speed); sys.stdout.flush()
perfect_match = True
for j in range(len(p)):
if p[j] != s[i+j]:
perfect_match = False
total_shifts += 1
break #no need to check more after first mismatch
if perfect_match:
return i, total_shifts #we get first match only
return -1, total_shifts
def bad_character_search(pattern,string):
"""From a given position search right to left. If we hit a mismatched
character, look up the shift in the precomputed shifts dictionary for the
bad character rule, which is indexed by the character in the database that
does not match, and then the position of the mismatch"""
p, s = pattern, string
bad_char_shifts_dict = compute_bad_character_shifts(pattern)
i = 0
total_shifts = 0
while i < (len(s) - len(p) + 1):
#animation:
sys.stdout.write(i * " " + p + "\r")
sleep(animation_speed); sys.stdout.flush()
perfect_match = True
for j in reversed(range(len(p))):
if p[j] != s[i + j]:
perfect_match = False
i += bad_char_shifts_dict[s[i+j]][j]
total_shifts += 1
break
if perfect_match:
return i, total_shifts
return -1, total_shifts
def good_suffix_search(pattern,string):
"""From a given position search right to left. If we hit a mismatched
character, look up the shift in the precomputed shifts dictionary for the
good suffix rule, which is indexed by the position of the pattern that
is causing the mismatch"""
p, s = pattern, string
good_suffix_shifts_dict = compute_good_suffix_shifts(p)
i = 0
total_shifts = 0
while i < (len(s) - len(p) + 1):
#animation:
sys.stdout.write(i * " " + p + "\r")
sleep(animation_speed); sys.stdout.flush()
perfect_match = True
for j in reversed(range(len(p))):
if p[j] != s[i + j]:
perfect_match = False
i += good_suffix_shifts_dict[j+1] #it's indexed to follow notes
total_shifts += 1
break
if perfect_match:
return i, total_shifts
return -1, total_shifts
def compute_bad_character_shifts(pattern):
"""For every position in the pattern, for every character in the alphabet,
calculate the distance, from right to left, to the next occurence of that
character. If the current position is the character in question, store a 0.
If the character never occurs, store the number of positions from the
character to the left end of the pattern."""
p = pattern
# initialize dictionary in which results will be stored with all zeros
bad_char_shifts_dict = {}
for char in alphabet:
bad_char_shifts_dict[char] = [0 for i in range(len(p))]
for i in range(len(p)):
for char in alphabet:
# look left from present position until hit first character of
# pattern or until hit alphabet character
j = 0
while True:
if p[i-j] == char:
bad_char_shifts_dict[char][i] = j
break
if j == i:
bad_char_shifts_dict[char][i] = i + 1
# plus one b/c if we are in leftmost position we still
# want to shift forward one (not zero)
break
j += 1
#print bad_char_shifts_dict
return bad_char_shifts_dict
def compute_good_suffix_shifts(pattern):
"""For each substring of a pattern, find the position in which it matches
the full pattern, starting with right alignment and sliding the substring
left by one iteratively. The leftmost character of the substring is treated
as not-that-character. The max possible offset is the length of the
full string minus one. Store the results in a dictionary where the keys are
the positions of the pattern to database mismatch that imply the substrings
and the values are the offsets."""
good_substring_shifts_dict = {}
# make all the substrings of the given pattern
substrings = []
for i in range(len(pattern)):
substrings.append(pattern[i:])
for substring in substrings:
offset = 0
while True:
# these start positions will handle lining up and overhang
# worked these out on paper hard to recap here
start_position_pattern = len(pattern) - len(substring) - offset
start_position_substring = start_position_pattern
if start_position_substring < 0: start_position_substring *= -1
else: start_position_substring = 0
if start_position_pattern < 0: start_position_pattern = 0
match = True
# first, check the first (not-) character which is NOT allowed to
# equal the corresponding character on the pattern
# Note that we don't always check the first position of the
# substring: when it is shifted leftwards enough, that position
# is out of the picture
if start_position_substring == 0:
if substring[0] == pattern[start_position_pattern]:
match = False
# since we checked, advance both patterns' start position
start_position_pattern += 1
start_position_substring += 1
# now check the remainder of the substring
# against the corresponding slice of the pattern
if substring[start_position_substring:] !=\
pattern[start_position_pattern:\
start_position_pattern+len(substring)-start_position_substring]:
match = False
if match:
good_substring_shifts_dict[len(pattern)-len(substring)+1] = \
offset
break
# if we don't find a match, stop when rightmost substring character
# is aligned with leftmost pattern character
if offset == len(pattern) - 1:
good_substring_shifts_dict[len(pattern)-len(substring)+1] = \
offset + 1
break
offset += 1
return good_substring_shifts_dict
if __name__ == "__main__":
#print compute_good_suffix_shifts("GCGACG")
for p in patterns:
print "---\n" + "naive search:\n" + s
result, shifts = naive_search(p,s)
print "\nMatch at position:", result, "Total shifts:", shifts
print "---\n" + "bad character search:\n" + s
result, shifts = bad_character_search(p,s)
print "\nMatch at position:", result, "Total shifts:", shifts
print "---\n" + "good suffix search:\n" + s
result, shifts = good_suffix_search(p,s)
print "\nMatch at position:", result, "Total shifts:", shifts