-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathLexicalAnalyzer.py
75 lines (67 loc) · 2.83 KB
/
LexicalAnalyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import re
class LexicalAnalyzer:
    """Regex-based lexer for a small C-like language.

    Splits source text into (token, lexeme, row, column) streams using a
    single alternation of named-group patterns, tried in declaration order.
    """

    # Current line number (1-based). Reset at the start of every tokenize()
    # call so the analyzer can be reused on multiple inputs.
    lin_num = 1

    def tokenize(self, code):
        """Tokenize *code* and return four parallel lists.

        Parameters
        ----------
        code : str
            Source text to analyze.

        Returns
        -------
        tuple[list, list, list, list]
            (token types, lexemes, 1-based rows, 0-based columns).

        Raises
        ------
        RuntimeError
            On any character that matches no rule (MISMATCH).
        """
        # Keyword patterns end with \b so that e.g. 'integer' lexes as one
        # ID rather than INT followed by ID('eger'). Order matters: longer
        # / more specific patterns (==, <=, FLOAT_CONST) come before their
        # prefixes (=, <, INTEGER_CONST).
        rules = [
            ('MAIN', r'main\b'),            # main
            ('INT', r'int\b'),              # int
            ('FLOAT', r'float\b'),          # float
            ('IF', r'if\b'),                # if
            ('ELSE', r'else\b'),            # else
            ('WHILE', r'while\b'),          # while
            ('READ', r'read\b'),            # read
            ('PRINT', r'print\b'),          # print
            ('LBRACKET', r'\('),            # (
            ('RBRACKET', r'\)'),            # )
            ('LBRACE', r'\{'),              # {
            ('RBRACE', r'\}'),              # }
            ('COMMA', r','),                # ,
            ('PCOMMA', r';'),               # ;
            ('EQ', r'=='),                  # ==
            ('NE', r'!='),                  # !=
            ('LE', r'<='),                  # <=
            ('GE', r'>='),                  # >=
            ('OR', r'\|\|'),                # ||
            ('AND', r'&&'),                 # &&
            ('ATTR', r'='),                 # =
            ('LT', r'<'),                   # <
            ('GT', r'>'),                   # >
            ('PLUS', r'\+'),                # +
            ('MINUS', r'-'),                # -
            ('MULT', r'\*'),                # *
            ('DIV', r'/'),                  # /
            ('ID', r'[a-zA-Z]\w*'),         # identifiers
            ('FLOAT_CONST', r'\d+\.\d+'),   # float literal
            ('INTEGER_CONST', r'\d+'),      # int literal
            ('NEWLINE', r'\n'),             # line break
            ('SKIP', r'[ \t]+'),            # spaces and tabs
            ('MISMATCH', r'.'),             # any other character
        ]
        tokens_join = '|'.join('(?P<%s>%s)' % pair for pair in rules)

        # Fresh per-call state: without this reset, a second tokenize() on
        # the same instance would keep counting lines from the previous run.
        self.lin_num = 1
        lin_start = 0  # offset of the current line's first character

        # Parallel output lists.
        token = []
        lexeme = []
        row = []
        column = []

        for m in re.finditer(tokens_join, code):
            token_type = m.lastgroup
            token_lexeme = m.group(token_type)
            if token_type == 'NEWLINE':
                lin_start = m.end()
                self.lin_num += 1
            elif token_type == 'SKIP':
                continue
            elif token_type == 'MISMATCH':
                raise RuntimeError('%r unexpected on line %d' % (token_lexeme, self.lin_num))
            else:
                col = m.start() - lin_start  # column relative to line start
                column.append(col)
                token.append(token_type)
                lexeme.append(token_lexeme)
                row.append(self.lin_num)
                # To print information about a Token
                print('Token = {0}, Lexeme = \'{1}\', Row = {2}, Column = {3}'.format(token_type, token_lexeme, self.lin_num, col))
        return token, lexeme, row, column