syntax_analyze.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
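"""An LR(1) syntax analyzer.

Reads a grammar file, computes FIRST sets for every symbol, builds the
LR(1) DFA and analysis table, and parses a token stream with it.
"""
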
from nfa_and_dfa import DFA, LRDFANode

class SyntaxAnalyze:

    def __init__(self):
        super().__init__()
        self.first_set = {}         # symbol -> FIRST set
        self.productions = []       # list of {left: [right-side symbols]}
        self.all_elem = set()       # every grammar symbol seen
        self.terminate = set()      # terminal symbols
        self.noterminate = set()    # non-terminal symbols
        self.productions_dict = {}  # left -> [(right tuple, production id)]
        self.lr_analyze_table = {}  # state id -> {lookahead symbol: action}
    def read_syntax_grammar(self, file_name):
        """Read one production per line, in the form 'left:sym1 sym2 ...'."""
        with open(file_name, 'r') as grammar_file:
            for line in grammar_file:
                line = line.rstrip('\n')
                if not line:
                    continue
                cur_left, cur_right = line.split(':', 1)
                # Splitting on spaces also covers a single-symbol right side.
                right_list = cur_right.split(' ')
                self.productions.append({cur_left: right_list})
    def get_terminate_noterminate(self):
        """Collect all symbols and split them into terminals/non-terminals."""
        for index, production in enumerate(self.productions):
            for left in production:
                if left not in self.productions_dict:
                    self.productions_dict[left] = []
                self.productions_dict[left].append(
                    (tuple(production[left]), index))
                self.all_elem.add(left)
                self.noterminate.add(left)
                for right in production[left]:
                    self.all_elem.add(right)
        # A terminal is any symbol that never appears on a left-hand side.
        self.terminate = self.all_elem - self.noterminate
    def __get_first_set(self, cur_status, all_elem):
        """Recursively compute FIRST(cur_status); all_elem guards recursion."""
        if cur_status in self.first_set:
            return self.first_set[cur_status]
        all_elem.add(cur_status)
        cur_status_set = set()
        for right_list in self.productions_dict[cur_status]:
            for right in right_list[0]:
                if right in all_elem:
                    # Already on the current recursion path; skip to avoid
                    # infinite recursion on left-recursive productions.
                    continue
                if right in self.first_set:
                    right_set = self.first_set[right]
                else:
                    right_set = self.__get_first_set(right, all_elem)
                cur_status_set |= right_set
                # Only move on to the next symbol if this one can derive
                # the empty string ('$').
                if '$' not in right_set:
                    break
        return cur_status_set
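    # Example: given the productions  A : $  and  A : a ,
    # FIRST(A) = {'$', 'a'}; for a right side  X Y , FIRST(Y) only
    # contributes when FIRST(X) contains '$' (X can derive epsilon).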
    def init_first_set(self):
        """FIRST of a terminal is itself; derive FIRST of non-terminals."""
        for terminate in self.terminate:
            self.first_set[terminate] = {terminate}
        for noterminate in self.noterminate:
            self.first_set[noterminate] = self.__get_first_set(
                noterminate, set())
    def create_lr_dfa(self):
        """Build the LR(1) DFA and fill in the shift/reduce/goto table."""
        all_status = {}      # state id -> LRDFANode
        all_object_set = {}  # frozenset of items -> state id
        self.DFA = DFA(set())

        def create_get_lr_dfa_node(set_id):
            # Reuse the node of an existing state, else create a new one.
            if set_id in all_status:
                return all_status[set_id]
            return LRDFANode(set_id=set_id)
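
        # An LR(1) item is encoded as a 5-tuple:
        #   (production id, left, right tuple, dot index, lookahead tuple)
        # e.g. (0, 'start1', ('start',), 0, ('#',)) is the augmented start
        # item  start1 -> . start  with end marker '#' as lookahead.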
        def expand_production(cur_production, ex_object_set):
            """Return the LR(1) closure of cur_production."""
            ex_object_set.add(cur_production)
            right = cur_production[2]
            point_index = cur_production[3]
            tail_set = cur_production[4]
            if (point_index < len(right)
                    and right[point_index] in self.noterminate):
                for pro_right in self.productions_dict[right[point_index]]:
                    # Lookaheads of the new item are FIRST of everything
                    # after the dotted symbol, plus the old lookaheads when
                    # that suffix can derive the empty string.
                    new_tail_set = set()
                    flag = True
                    for i in range(point_index + 1, len(right)):
                        cur_first_set = self.first_set[right[i]]
                        if '$' in cur_first_set:
                            new_tail_set |= cur_first_set - {'$'}
                        else:
                            flag = False
                            new_tail_set |= cur_first_set
                            break
                    if flag:
                        new_tail_set |= set(tail_set)
                    ex_new_production = (
                        pro_right[1], right[point_index], pro_right[0], 0,
                        tuple(new_tail_set))
                    if ex_new_production not in ex_object_set:
                        ex_object_set |= expand_production(
                            ex_new_production, ex_object_set)
            # Merge items sharing the same core (production id, left, right,
            # dot position) by taking the union of their lookaheads.
            new_ex_object_set = {}
            for eos in ex_object_set:
                pro_key = (eos[0], eos[1], eos[2], eos[3])
                if pro_key not in new_ex_object_set:
                    new_ex_object_set[pro_key] = set()
                new_ex_object_set[pro_key] |= set(eos[4])
            ex_object_set = set()
            for key in new_ex_object_set:
                ex_object_set.add(key + (tuple(new_ex_object_set[key]),))
            return ex_object_set
        # State 0 is the closure of the augmented start item; '#' marks the
        # end of input. Item sets are keyed by frozenset so that two equal
        # sets always map to the same state.
        set_id = 0
        new_node = create_get_lr_dfa_node(set_id)
        object_set = expand_production(
            (0, 'start1', ('start',), 0, '#'), set())
        new_node.add_object_set_by_set(object_set)
        all_object_set[frozenset(object_set)] = set_id
        all_status[set_id] = new_node
        object_set_queue = [new_node]
        while object_set_queue:
            top_object_node = object_set_queue.pop(0)
            old_set = top_object_node.object_set
            old_set_id = top_object_node.set_id
            # print('object_set_id =', old_set_id)
            for cur_production in old_set:
                # print(cur_production)
                pro_id = cur_production[0]
                left = cur_production[1]
                right = cur_production[2]
                point_index = cur_production[3]
                tail_set = cur_production[4]
                if point_index >= len(right) or '$' in right:
                    # Dot at the end (or an epsilon production): add a
                    # reduce entry for every lookahead.
                    if old_set_id not in self.lr_analyze_table:
                        self.lr_analyze_table[old_set_id] = {}
                    for tail in tail_set:
                        if tail in self.lr_analyze_table[old_set_id]:
                            # Two actions on one lookahead: a conflict.
                            print('The grammar is not an LR(1) grammar!')
                            return
                        self.lr_analyze_table[old_set_id][tail] = ('r', pro_id)
                else:
                    # Move the dot over the next symbol, close the resulting
                    # item set, and reuse the target state if it exists.
                    new_production = (pro_id, left, right,
                                      point_index + 1, tail_set)
                    new_object_set = expand_production(new_production, set())
                    if frozenset(new_object_set) in all_object_set:
                        tar_set_id = all_object_set[frozenset(new_object_set)]
                    else:
                        set_id += 1
                        tar_set_id = set_id
                        all_object_set[frozenset(new_object_set)] = set_id
                        new_node = create_get_lr_dfa_node(tar_set_id)
                        new_node.add_object_set_by_set(new_object_set)
                        all_status[tar_set_id] = new_node
                        object_set_queue.append(new_node)
                    if old_set_id not in self.lr_analyze_table:
                        self.lr_analyze_table[old_set_id] = {}
                    if right[point_index] in self.terminate:
                        # Shift on a terminal ...
                        self.lr_analyze_table[old_set_id][
                            right[point_index]] = ('s', tar_set_id)
                    else:
                        # ... goto on a non-terminal.
                        self.lr_analyze_table[old_set_id][
                            right[point_index]] = ('g', tar_set_id)
        self.DFA.status = all_status
    def run_on_lr_dfa(self, tokens):
        """Run the parser over tokens. Table entries are ('s', state) for
        shift, ('r', production id) for reduce, ('g', state) for goto."""
        status_stack = [0]
        symbol_stack = ['#']
        success = False
        # Reverse so the next input token is always tokens[-1].
        tokens.reverse()
        while not success:
            top = status_stack[-1]
            print('token =', tokens[-1])
            print(symbol_stack)
            if tokens[-1] in self.lr_analyze_table[top]:
                action = self.lr_analyze_table[top][tokens[-1]]
                if action[0] == 's':
                    # Shift: push the state and consume the token.
                    status_stack.append(action[1])
                    symbol_stack.append(tokens.pop())
                elif action[0] == 'r':
                    if action[1] == 0:
                        # Reduced by the augmented start production: done.
                        print('Syntax analysis succeeded!')
                        success = True
                        break
                    # Reduce: pop the right-hand side and push the left
                    # symbol back as the next input, so the goto entry of
                    # the uncovered state picks it up.
                    production = self.productions[action[1]]
                    left = next(iter(production))
                    right_len = len(production[left])
                    tokens.append(left)
                    if production[left] == ['$']:
                        # Epsilon production: nothing to pop.
                        continue
                    status_stack = status_stack[:-right_len]
                    symbol_stack = symbol_stack[:-right_len]
                else:
                    # Goto after a reduction.
                    status_stack.append(action[1])
                    symbol_stack.append(tokens.pop())
            else:
                print(self.lr_analyze_table[top])
                print('Syntax error!\n')
                break
    def read_and_analyze(self, fileName):
        """Read the token stream (first field of each line) and parse it."""
        tokens = []
        with open(fileName, 'r') as token_table:
            for line in token_table:
                line = line.rstrip('\n')
                if not line:
                    continue
                tokens.append(line.split(' ')[0])
        tokens.append('#')  # end-of-input marker
        self.run_on_lr_dfa(tokens)
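
# Expected input formats, inferred from the readers above:
#   syn_grammar.txt   one production per line, 'left:sym1 sym2 ...',
#                     with '$' standing for the empty string
#   token_table.data  one token per line; only the first space-separated
#                     field is used as the token name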
def main():
    syn_ana = SyntaxAnalyze()
    # syn_ana.read_syntax_grammar('sample_syn_grammar.txt')
    syn_ana.read_syntax_grammar('syn_grammar.txt')
    syn_ana.get_terminate_noterminate()
    syn_ana.init_first_set()
    syn_ana.create_lr_dfa()
    syn_ana.read_and_analyze('token_table.data')
    # syn_ana.read_and_analyze('sample_token_table.txt')
    # Debugging helpers:
    # for key in syn_ana.lr_analyze_table:
    #     print(key, ':', syn_ana.lr_analyze_table[key])
    # for pro in syn_ana.productions:
    #     print(syn_ana.productions.index(pro), pro)
    # for key in syn_ana.first_set:
    #     print('key =', key, '\nfirst =', syn_ana.first_set[key])
    # print(syn_ana.productions)
    # for left in syn_ana.productions_dict:
    #     print(left, ':', syn_ana.productions_dict[left])
    # print(syn_ana.terminate)
    # print(syn_ana.noterminate)


if __name__ == '__main__':
    main()