-
Notifications
You must be signed in to change notification settings - Fork 0
/
JackTokenizer.cs
148 lines (134 loc) · 5.08 KB
/
JackTokenizer.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace JackCompiler
{
/// <summary>
/// Splits Jack source lines into tokens and exposes them one at a time through a cursor.
/// Each token is classified as a keyword, symbol, identifier, integer constant or string constant.
/// </summary>
/// <remarks>
/// All input is tokenized eagerly by the constructor. Matching is done line by line, so a token
/// never spans two input lines. This class does not recognize comments: comment text present in
/// the input is tokenized like ordinary source (NOTE(review): callers presumably strip comments
/// before constructing the tokenizer - confirm against the caller).
/// </remarks>
class JackTokenizer
{
    // Keyword table. GetKeyword casts an index into this array straight to Keyword, so the order
    // here is expected to match the declaration order of the Keyword enum (declared elsewhere).
    static readonly string[] Keywords =
    {
        "class", "method", "function", "constructor", "int", "boolean", "char", "void",
        "var", "static", "field", "let", "do", "if", "else", "while", "return", "true", "false", "null", "this"
    };

    // Lexical building blocks (unanchored). The symbol class contains exactly:
    // { } ( ) [ ] . , ; + - * / & | < > = ~
    const string SymbolPattern = @"[{}()\[\].,;+\-*/&|<>=~]";
    const string IntPattern = "[0-9]+";
    // A string constant: opening quote, any characters except a quote or newline, closing quote.
    const string StringPattern = "\"[^\"\n]*\"";
    const string IdentifierPattern = @"[a-zA-Z_]\w*";

    // Scanner used to cut a line into tokens. The alternatives are mutually exclusive on their
    // first character, so their order does not affect the result. Keywords need no alternative of
    // their own: every keyword is also matched (in full) by the identifier pattern.
    static readonly Regex TokenRegex = new Regex(
        StringPattern + "|" + IdentifierPattern + "|" + IntPattern + "|" + SymbolPattern);

    // Classifiers. Each one must match the WHOLE token; an unanchored search would, for example,
    // find the keyword "this" inside the string constant "do this" and misreport it as a keyword.
    static readonly Regex StringRegex = WholeToken(StringPattern);
    static readonly Regex SymbolRegex = WholeToken(SymbolPattern);
    static readonly Regex IdentifierRegex = WholeToken(IdentifierPattern);
    static readonly Regex IntRegex = WholeToken(IntPattern);

    // All tokens of the input, in source order.
    readonly List<string> tokens;
    // Index of the current token. Starts at 0, i.e. on the first token.
    int currentToken = 0;

    /// <summary>
    /// Tokenizes the given lines of a .jack file and positions the cursor on the first token.
    /// </summary>
    /// <param name="jackLines">The source text, one array element per line.</param>
    /// <exception cref="ArgumentNullException"><paramref name="jackLines"/> is null.</exception>
    public JackTokenizer(string[] jackLines)
    {
        if (jackLines == null)
        {
            throw new ArgumentNullException(nameof(jackLines));
        }

        tokens = Tokenize(jackLines);
    }

    /// <summary>
    /// Wraps a pattern so that it only succeeds when it matches an entire input string.
    /// </summary>
    static Regex WholeToken(string pattern)
    {
        // \A and \z anchor to the absolute start and end of the input.
        return new Regex(@"\A(?:" + pattern + @")\z");
    }

    /// <summary>
    /// Scans every line and collects the matched tokens in order. Text that matches none of the
    /// token patterns (for example whitespace) is skipped.
    /// </summary>
    static List<string> Tokenize(string[] lines)
    {
        var result = new List<string>();
        foreach (string line in lines)
        {
            foreach (Match match in TokenRegex.Matches(line))
            {
                result.Add(match.Value);
            }
        }

        return result;
    }

    /// <summary>
    /// Reports whether the cursor currently points at a token, i.e. whether the token getters
    /// may be called. Returns false once the cursor has moved past the last token, or when the
    /// input contained no tokens at all.
    /// </summary>
    public bool HasMoreTokens()
    {
        return currentToken < tokens.Count;
    }

    /// <summary>
    /// Moves the cursor to the next token.
    /// The cursor starts on the first token, so that token is available before any call to
    /// this method. No bounds check is performed here; use <see cref="HasMoreTokens"/> to find
    /// out whether the cursor still points at a token.
    /// </summary>
    public void Advance()
    {
        currentToken++;
    }

    /// <summary>
    /// Returns the type of the current token, as a constant.
    /// The decision is based on the whole token text, so a string constant that happens to
    /// contain a keyword or a symbol is still reported as a string constant.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The current token matches none of the known token types.
    /// </exception>
    public TokenType GetTokenType()
    {
        string token = tokens[currentToken];

        // Keywords are checked before identifiers because every keyword is also a
        // syntactically valid identifier.
        if (Array.IndexOf(Keywords, token) >= 0)
        {
            return TokenType.KEYWORD;
        }

        if (StringRegex.IsMatch(token))
        {
            return TokenType.STRING_CONST;
        }

        if (SymbolRegex.IsMatch(token))
        {
            return TokenType.SYMBOL;
        }

        if (IdentifierRegex.IsMatch(token))
        {
            return TokenType.IDENTIFIER;
        }

        if (IntRegex.IsMatch(token))
        {
            return TokenType.INT_CONST;
        }

        throw new InvalidOperationException("Unknown token");
    }

    /// <summary>
    /// Returns the keyword which is the current token, as a constant.
    /// This method should be called only if the token type is KEYWORD; for any token that is not
    /// in the keyword table it returns <c>Keyword.NULL</c>.
    /// </summary>
    public Keyword GetKeyword()
    {
        int index = Array.IndexOf(Keywords, tokens[currentToken]);
        if (index < 0)
        {
            return Keyword.NULL;
        }

        // The table index doubles as the numeric value of the Keyword constant.
        return (Keyword)index;
    }

    /// <summary>
    /// Returns the character which is the current token (the first character of its text).
    /// This method should be called only if the token type is SYMBOL.
    /// </summary>
    public char GetSymbol()
    {
        return tokens[currentToken][0];
    }

    /// <summary>
    /// Returns the identifier which is the current token (the token text, unchanged).
    /// Should be called only if the token type is IDENTIFIER.
    /// </summary>
    public string GetIdentifier()
    {
        return tokens[currentToken];
    }

    /// <summary>
    /// Returns the integer value of the current token.
    /// Should be called only if the token type is INT_CONST; parsing any other token, or a
    /// literal too large for <see cref="int"/>, makes <c>int.Parse</c> throw.
    /// </summary>
    public int GetIntVal()
    {
        return int.Parse(tokens[currentToken]);
    }

    /// <summary>
    /// Returns the string value of the current token, without the two enclosing double quotes.
    /// Should be called only if the token type is STRING_CONST.
    /// </summary>
    public string GetStringVal()
    {
        // A string token cannot contain inner quotes, so trimming removes exactly the delimiters.
        return tokens[currentToken].Trim('"');
    }
}
}