-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexer.go
271 lines (259 loc) · 9.18 KB
/
lexer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
package schego
import (
	"bytes"
	"encoding/binary"
	"math"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
)
// TokenType identifies the lexical category of a Token.
type TokenType int

const (
	// TokenNone is the zero value: no token type has been assigned yet.
	TokenNone TokenType = iota
	// TokenRParen is a right parenthesis: )
	TokenRParen
	// TokenLParen is a left parenthesis: (
	TokenLParen
	// TokenIdent is an identifier/symbol.
	TokenIdent
	// TokenIntLiteral is an integer literal; its Value holds a varint encoding
	// (see bufferStringToNum).
	TokenIntLiteral
	// TokenFloatLiteral is a float literal; its Value holds the little-endian
	// IEEE-754 bit pattern (see bufferStringToNum).
	TokenFloatLiteral
	// TokenStringLiteral is a double-quoted string literal (quotes stripped).
	TokenStringLiteral
	// TokenBoolLiteral is a #t/#f boolean; its Value is a single byte, 1 or 0.
	TokenBoolLiteral
	// TokenDot is a standalone . token.
	TokenDot
	// TokenOp is an operator such as +, -, <, <=.
	TokenOp
	// TokenChar is any single character the lexer could not otherwise classify;
	// the parser decides what to do with it.
	TokenChar
)
// overrideType tracks whether the lexer is inside a delimited region where
// the normal tokenization rules are suspended.
type overrideType int

const (
	// overrideNone: normal lexing rules apply.
	overrideNone overrideType = iota
	// overrideIdent: inside a |...|-delimited ident (R7RS allows nearly any
	// character there).
	overrideIdent
	// overrideString: inside a "..." string literal.
	overrideString
)
// Token is a single lexed token: its lexical category plus its raw value
// bytes. The interpretation of Value depends on Type (plain text for idents
// and strings, encoded numbers for int/float literals, a 1/0 byte for bools).
type Token struct {
	Type  TokenType
	Value bytes.Buffer
}
// NewTokenString is a convenience constructor that builds a Token of the
// given type whose value is the given string.
func NewTokenString(tokenType TokenType, tokenString string) *Token {
	var value bytes.Buffer
	value.WriteString(tokenString)
	return &Token{Type: tokenType, Value: value}
}
// NewTokenRaw creates a new Token of the given type from a raw Buffer.
// The buffer's bytes are copied so that later writes to (or resets of) the
// source buffer cannot alias and overwrite this token's value.
func NewTokenRaw(tokenType TokenType, tokenBuffer bytes.Buffer) *Token {
	// make([]byte, n, n) had a redundant capacity argument; length alone
	// implies the same capacity.
	tokenSlice := make([]byte, tokenBuffer.Len())
	copy(tokenSlice, tokenBuffer.Bytes())
	tokenValue := bytes.NewBuffer(tokenSlice)
	return &Token{tokenType, *tokenValue}
}
// bufferStringToNum converts a buffer holding the textual form of a number
// into its binary encoding: floats become a little-endian IEEE-754 bit
// pattern, ints become a varint. The returned buffer is always
// binary.MaxVarintLen64 bytes long (unused trailing bytes stay zero).
// Parse errors are ignored; the lexer only accumulates digit characters.
func bufferStringToNum(tokenType TokenType, inputBuffer bytes.Buffer) *bytes.Buffer {
	text := inputBuffer.String()
	raw := make([]byte, binary.MaxVarintLen64)
	if tokenType == TokenFloatLiteral {
		value, _ := strconv.ParseFloat(text, 64)
		binary.LittleEndian.PutUint64(raw, math.Float64bits(value))
	} else {
		value, _ := strconv.ParseInt(text, 10, 64)
		binary.PutVarint(raw, value)
	}
	return bytes.NewBuffer(raw)
}
// flushAccumulator empties the contents of the given accumulator buffer into
// a new Token appended to tokenBuffer, then resets both the buffer and the
// accumulator type to TokenNone. Numeric accumulator types are first
// converted from text to their binary encoding. A convenience for LexExp.
func flushAccumulator(
	accumulatorType *TokenType,
	accumulatorBuffer *bytes.Buffer,
	tokenBuffer *[]*Token) {
	switch *accumulatorType {
	case TokenFloatLiteral, TokenIntLiteral:
		// numbers are stored in binary form, not as text
		numBuffer := bufferStringToNum(*accumulatorType, *accumulatorBuffer)
		*tokenBuffer = append(*tokenBuffer, NewTokenRaw(*accumulatorType, *numBuffer))
	default:
		*tokenBuffer = append(*tokenBuffer, NewTokenRaw(*accumulatorType, *accumulatorBuffer))
	}
	accumulatorBuffer.Reset()
	*accumulatorType = TokenNone
}
// peek returns the rune that follows the rune starting at byte offset
// currentIndex in input, or '\000' if that rune is the last one.
//
// The previous implementation read input[currentIndex+1] as a single byte,
// which returned a UTF-8 continuation byte (not a rune) whenever the current
// or next character was multi-byte, and its end-of-string check was wrong
// when the final rune was multi-byte. Decoding with unicode/utf8 fixes both;
// behavior on pure-ASCII input is unchanged.
func peek(input string, currentIndex int) rune {
	// width of the rune we are currently on
	_, width := utf8.DecodeRuneInString(input[currentIndex:])
	next := currentIndex + width
	// at the end of the string?
	if next >= len(input) {
		return '\000'
	}
	nextRune, _ := utf8.DecodeRuneInString(input[next:])
	return nextRune
}
// LexExp lexes an input string into Token objects. There are no possible user-facing
// errors from this process.
//
// The lexer is a single forward pass over the input's runes. Multi-character
// tokens (idents, numeric literals, operators, booleans) are gathered into
// accumulatorBuffer and emitted via flushAccumulator when a delimiter or a
// character of a different token class is seen. overrideState suspends the
// normal rules while inside |...| idents and "..." string literals.
func LexExp(input string) []*Token {
	var tokens []*Token
	// accumulation variables for multi-character tokens such as idents and literals
	accumulating := false
	var accumulatingType TokenType
	var accumulatorBuffer bytes.Buffer
	// characters that can be used in an ident asides from ., which has meaning outside
	// idents
	specialInitials := "!$%&*/:<=>?^_~"
	// flag as to whether or not the | character has taken effect
	// anything enclosed within | | is a valid ident in R7RS
	overrideState := overrideNone
	// operator characters
	operatorChars := "+-/*<=>"
	// index is a byte offset (range over a string), which is what peek expects
	for index, glyphRune := range input {
		glyph := string(glyphRune)
		if overrideState == overrideIdent {
			// inside |...|: every rune belongs to the ident, including the
			// closing | itself, which is written before the flush below
			accumulatorBuffer.WriteString(glyph)
			if glyph == "|" {
				flushAccumulator(&accumulatingType, &accumulatorBuffer, &tokens)
				accumulating = false
				overrideState = overrideNone
			}
		} else if overrideState == overrideString {
			// inside "...": accumulate until the closing quote, which is not
			// stored in the token value
			if glyph == "\"" {
				flushAccumulator(&accumulatingType, &accumulatorBuffer, &tokens)
				accumulating = false
				overrideState = overrideNone
			} else {
				accumulatorBuffer.WriteString(glyph)
			}
		} else if unicode.IsSpace(glyphRune) {
			// flush the accumulator if we were trying to accumulate beforehand
			// no multi-char token accepts a space
			if accumulating == true {
				flushAccumulator(&accumulatingType, &accumulatorBuffer, &tokens)
				accumulating = false
			}
			// flush the accumulator for newlines, as well
		} else if glyph == "\n" {
			// NOTE(review): this branch is unreachable — unicode.IsSpace('\n')
			// is true, so the branch above always handles newlines first
			flushAccumulator(&accumulatingType, &accumulatorBuffer, &tokens)
			accumulating = false
			// lparen
		} else if glyph == "(" {
			if accumulating == true {
				flushAccumulator(&accumulatingType, &accumulatorBuffer, &tokens)
				accumulating = false
			}
			tokens = append(tokens, NewTokenString(TokenLParen, glyph))
			// rparen
		} else if glyph == ")" {
			if accumulating == true {
				flushAccumulator(&accumulatingType, &accumulatorBuffer, &tokens)
				accumulating = false
			}
			tokens = append(tokens, NewTokenString(TokenRParen, glyph))
			// opening " of a string literal
			// the overrideState stuff takes care of the closing "
		} else if glyph == "\"" {
			if accumulating == true {
				flushAccumulator(&accumulatingType, &accumulatorBuffer, &tokens)
			}
			accumulating = true
			accumulatingType = TokenStringLiteral
			overrideState = overrideString
			// identify any operators
			// normally they'll be a single character, but >= and <= aren't
		} else if strings.ContainsAny(glyph, operatorChars) && (accumulatingType == TokenOp || accumulatingType == TokenNone) {
			// handle >= and <= correctly
			if (glyph == ">" || glyph == "<") && (peek(input, index) == '=') {
				// start accumulating: the '=' arrives on the next iteration
				accumulating = true
				accumulatingType = TokenOp
				accumulatorBuffer.WriteString(glyph)
			} else {
				// did we already accumulate > or < and are now on =?
				if accumulating == true && glyph == "=" {
					accumulatorBuffer.WriteString(glyph)
					flushAccumulator(&accumulatingType, &accumulatorBuffer, &tokens)
					accumulating = false
				} else {
					// simplest case if we found a single-character op, just inject it directly
					tokens = append(tokens, NewTokenString(TokenOp, glyph))
				}
			}
			// idents delimited with | can contain pretty much any character
		} else if glyph == "|" {
			if accumulating == true && accumulatingType != TokenIdent && accumulatingType != TokenStringLiteral {
				// a | ends whatever non-ident token was being built
				flushAccumulator(&accumulatingType, &accumulatorBuffer, &tokens)
			} else if accumulating == false {
				overrideState = overrideIdent
			}
			// NOTE(review): if we were already accumulating an ident, the |
			// is appended below without entering override mode — confirm this
			// is the intended handling of a mid-ident |
			accumulating = true
			accumulatorBuffer.WriteString(glyph)
			accumulatingType = TokenIdent
		} else if glyph == "." {
			// . is a valid character in an ident - add it to the accumulator
			// if we were building an ident
			if accumulating == true && accumulatingType == TokenIdent {
				accumulatorBuffer.WriteString(glyph)
				// we can't start an ident with . - are we building a floating point literal?
			} else if chr := peek(input, index); !unicode.IsSpace(chr) && unicode.IsNumber(chr) {
				accumulating = true
				accumulatingType = TokenFloatLiteral
				accumulatorBuffer.WriteString(glyph)
				// there's situations where a standalone . is valid
			} else {
				tokens = append(tokens, NewTokenString(TokenDot, glyph))
			}
			// boolean literals
		} else if glyph == "#" || (accumulating == true && accumulatingType == TokenBoolLiteral) {
			// make sure we didn't find a standalone #
			if chr := peek(input, index); chr == 't' || chr == 'f' {
				// semi-hacky way way of using the accumulator buffer to skip processing
				// of the current glyph
				accumulating = true
				accumulatingType = TokenBoolLiteral
			} else if accumulating == true {
				// represent true as 1 and false as 0 (doh)
				// (this iteration's glyph is the t/f that followed the #)
				if glyph == "t" {
					accumulatorBuffer.WriteByte(1)
				} else {
					accumulatorBuffer.WriteByte(0)
				}
				flushAccumulator(&accumulatingType, &accumulatorBuffer, &tokens)
				accumulating = false
			} else {
				// handle the case of just having a # hanging out all by itself
				tokens = append(tokens, NewTokenString(TokenChar, glyph))
			}
			// ident
		} else if unicode.IsLetter(glyphRune) {
			// were we building a number literal beforehand?
			if accumulating == true && accumulatingType != TokenIdent {
				flushAccumulator(&accumulatingType, &accumulatorBuffer, &tokens)
			}
			accumulating = true
			accumulatingType = TokenIdent
			accumulatorBuffer.WriteString(glyph)
			// were we building an ident and are now trying to add a special initial?
		} else if strings.ContainsAny(glyph, specialInitials) {
			if accumulating == true && accumulatingType == TokenIdent {
				accumulatorBuffer.WriteString(glyph)
			} else {
				// special initials outside an ident are passed through as chars
				tokens = append(tokens, NewTokenString(TokenChar, glyph))
			}
			// number literal
		} else if unicode.IsNumber(glyphRune) {
			if accumulating == true && accumulatingType == TokenIdent {
				flushAccumulator(&accumulatingType, &accumulatorBuffer, &tokens)
			}
			accumulating = true
			// only declare that we are accumulating an int if we didn't see a . already
			if accumulatingType != TokenFloatLiteral {
				accumulatingType = TokenIntLiteral
			}
			accumulatorBuffer.WriteString(glyph)
			// we're not sure what this character is, let the parser deal with it
		} else {
			tokens = append(tokens, NewTokenString(TokenChar, glyph))
		}
	}
	// corner case if the input string while we're still accumulating
	// should never happen in proper Scheme, but still...
	if accumulating == true {
		flushAccumulator(&accumulatingType, &accumulatorBuffer, &tokens)
		accumulating = false
	}
	return tokens
}