-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarkov.py
144 lines (106 loc) · 3.94 KB
/
markov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
'''
CAPP 30122 W'20: Markov models and hash tables
Marc Richardson
'''
import sys
import math
from hash_table import HashTable
HASH_CELLS = 57
class Markov:
'''Class representing a Markov model for text recognition'''
def __init__(self, k, s):
'''
Construct a new k-order Markov model using the statistics of string "s"
'''
self._k = k
self.uniq_char = len(set(s))
self.context_table = HashTable(HASH_CELLS, None)
self.learn(s)
def log_probability(self, s):
'''
Get the log probability of string "s", given the statistics of
character sequences modeled by this particular Markov model
This probability is *not* normalized by the length of the string.
'''
total = 0.
prev = s[-self._k:]
for char in s:
numerator = 1
denominator = self.uniq_char
char_table = self.context_table.lookup(prev)
if char_table != self.context_table.defval:
val = char_table.lookup(char)
denominator += sum(char_table.values())
numerator += val
total += math.log(numerator / denominator)
prev = prev[1:] + char
return total
def learn(self, s):
'''
Compute the string statistics for the Markov model for a string "s"
Inputs:
s (str): text to be processed
Returns:
HashTable mapping context sequences to trailing characters
'''
prev = s[-self._k:]
for char in s:
char_table = self.context_table.lookup(prev)
if char_table != self.context_table.defval:
val = char_table.lookup(char) + 1
char_table.update(char, val)
else:
char_table = HashTable(HASH_CELLS, 0)
char_table.update(char, 1)
self.context_table.update(prev, char_table)
prev = prev[1:] + char
def identify_speaker(speaker_a, speaker_b, unknown_speech, k):
'''
Given sample text from two speakers, and text from an unidentified speaker,
return a tuple with the *normalized* log probabilities of each of the
speakers uttering that text under a "k" order character-based Markov model,
and a conclusion of which speaker uttered the unidentified text
based on the two probabilities.
'''
s_len = len(unknown_speech)
model_a = Markov(k, speaker_a)
model_b = Markov(k, speaker_b)
log_probability_a = model_a.log_probability(unknown_speech) / s_len
log_probability_b = model_b.log_probability(unknown_speech) / s_len
if log_probability_a > log_probability_b:
prediction = 'A'
elif log_probability_b > log_probability_a:
prediction = 'B'
else:
prediction = 'B'
return (log_probability_a, log_probability_b, prediction)
def print_results(res_tuple):
'''
Given a tuple from identify_speaker, print formatted results to the screen
'''
(likelihood1, likelihood2, conclusion) = res_tuple
print("Speaker A: " + str(likelihood1))
print("Speaker B: " + str(likelihood2))
print("")
print("Conclusion: Speaker " + conclusion + " is most likely")
def go():
'''
Interprets command line arguments and runs the Markov analysis.
Useful for hand testing.
'''
num_args = len(sys.argv)
if num_args != 5:
print("usage: python3 " + sys.argv[0] + " <file name for speaker A> " +
"<file name for speaker B>\n <file name of text to identify> " +
"<order>")
sys.exit(0)
with open(sys.argv[1], "rU") as file1:
speech1 = file1.read()
with open(sys.argv[2], "rU") as file2:
speech2 = file2.read()
with open(sys.argv[3], "rU") as file3:
speech3 = file3.read()
res_tuple = identify_speaker(speech1, speech2, speech3, int(sys.argv[4]))
print_results(res_tuple)
if __name__ == "__main__":
go()