#!/usr/bin/env python3
"""Simple lexer for PL/0 using generators"""

import re

# Tokens can have multiple definitions if needed
TOKEN_DEFS = {
    'lparen': ['('],
    'rparen': [')'],
    'lspar': ['['],
    'rspar': [']'],
    'colon': [':'],
    'times': ['*'],
    'slash': ['/'],
    'plus': ['+'],
    'minus': ['-'],
    'eql': ['='],
    'neq': ['!='],
    'lss': ['<'],
    'leq': ['<='],
    'gtr': ['>'],
    'geq': ['>='],
    'callsym': ['call'],
    'beginsym': ['begin'],
    'semicolon': [';'],
    'endsym': ['end'],
    'ifsym': ['if'],
    'whilesym': ['while'],
    'becomes': [':='],
    'thensym': ['then'],
    'elsesym': ['else'],
    'dosym': ['do'],
    'constsym': ['const'],
    'comma': [','],
    'varsym': ['var'],
    'procsym': ['procedure'],
    'period': ['.'],
    'oddsym': ['odd'],
    'print': ['!', 'print'],
    'read': ['?', 'read']
}
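
# Illustrative sanity check (not from the original file): thanks to the
# length-descending sort in Lexer.__init__ below, ':=' lexes as a single
# 'becomes' token rather than 'colon' followed by an illegal '=', and both
# spellings of print ('!' and 'print') map to the same token identifier:
#
#   >>> list(Lexer('x := 1').tokens())
#   [('ident', 'x'), ('becomes', ':='), ('number', 1)]
#   >>> [t for t, _ in Lexer('! x').tokens()]
#   ['print', 'ident']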


class Lexer:
    """The lexer. Decomposes a string into tokens."""

    def __init__(self, text):
        self.text = text
        self.pos = 0
        # Flatten TOKEN_DEFS into (string, token) pairs and sort the strings
        # longest first, so ':=' is tried before ':' and '<=' before '<'.
        self.str_to_token = [(s, t) for t, ss in TOKEN_DEFS.items() for s in ss]
        self.str_to_token.sort(key=lambda a: -len(a[0]))

    def skip_whitespace(self):
        """Advances past whitespace and {comments}."""
        in_comment = False
        while self.pos < len(self.text) and (self.text[self.pos].isspace() or self.text[self.pos] == '{' or in_comment):
            if self.text[self.pos] == '{' and not in_comment:
                in_comment = True
            elif in_comment and self.text[self.pos] == '}':
                in_comment = False
            self.pos += 1

    def check_symbol(self):
        """Tries to match one of the fixed symbols or keywords (keywords are
        case-insensitive) at the current position. Returns a
        (token identifier, matched string) pair, or (None, None)."""
        for s, t in self.str_to_token:
            if self.text[self.pos:self.pos + len(s)].lower() == s:
                self.pos += len(s)
                return t, s
        return None, None
    def check_regex(self, regex):
        """Tries to match the given regex at the current position.
        Returns the matched string, or None."""
        match = re.match(regex, self.text[self.pos:])
        if not match:
            return None
        found = match.group(0)
        self.pos += len(found)
        return found
    def tokens(self):
        """Returns a generator which will produce a stream of
        (token identifier, token value) pairs."""
        while self.pos < len(self.text):
            self.skip_whitespace()
            if self.pos >= len(self.text):
                # Trailing whitespace or a comment brought us to the end of
                # the input; stop instead of reporting a spurious token.
                break
            t, s = self.check_symbol()
            if s:
                yield t, s
                continue
            t = self.check_regex(r'[0-9]+')
            if t:
                yield 'number', int(t)
                continue
            t = self.check_regex(r'\w+')
            if t:
                yield 'ident', t
                continue
            # Nothing matched: report the offending character and stop.
            yield 'illegal', self.text[self.pos]
            break
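
# Minimal consumption sketch (illustrative, not part of the original file):
# because tokens() is a generator, a recursive-descent parser can pull tokens
# lazily with next() instead of materialising the whole list up front.
#
#   lex = Lexer('begin x := 1 end.').tokens()
#   tok = next(lex, ('eof', None))   # ('beginsym', 'begin')
#   rest = list(lex)                 # remaining tokens, produced on demand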

# Test support
__test_program = '''VAR x, y, squ;
VAR arr[5]: char;
var multid[5][5]: short;
{This is a comment. You can write anything you want in a comment}
PROCEDURE square;
VAR test;
BEGIN
    test := 1234;
    squ := x * x
END;
BEGIN
    x := -1;
    read x;
    if x > 100 then begin
        print -x
    end else begin
        print x
    end;
    x := 1;
    WHILE x <= 10 DO
    BEGIN
        CALL square;
        x:=x+1;
        !squ
    END;
    x := 101;
    while x <= 105 do begin
        arr[x-100] := x;
        !arr[x-100];
        x := x + 1
    end;
    x := 1;
    y := 1;
    while x <= 5 do begin
        while y <= 5 do begin
            multid[x][y] := arr[x];
            !multid[x][y];
            x := x + 1;
            y := y + 1
        end
    end
END.'''

if __name__ == '__main__':
    for t, w in Lexer(__test_program).tokens():
        print(t, w)