-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLex.x
311 lines (253 loc) · 12.1 KB
/
Lex.x
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
-- -*- haskell -*-
-- Copyright (C) 2018-2024 Jun Zhang <zhangjunphy[at]gmail[dot]com>
--
-- This file is a part of decafc.
--
-- decafc is free software: you can redistribute it and/or modify it under the
-- terms of the MIT (X11) License as described in the LICENSE file.
--
-- decafc is distributed in the hope that it will be useful, but WITHOUT ANY
-- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-- FOR A PARTICULAR PURPOSE. See the X11 license for more details.
-- Decaf scanner
{
{-# OPTIONS_GHC -w #-}
module Lexer.Lex ( Alex(..)
, alexMonadScan
, runAlex
, AlexState(..)
, AlexUserState(..)
, getAlexState
, alexError
, addError
) where
import Lexer.Token
import qualified Util.SourceLoc as SL
import Types
import Control.Monad.State
import Data.ByteString.Lazy (ByteString)
import qualified Data.ByteString.Lazy as BS
import qualified Data.ByteString.Lazy.Char8 as C8
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Enc
}
%wrapper "monadUserState-bytestring"
----------------------------------- Tokens ------------------------------------
-- Macro section. Names starting with `$` are character sets, names starting
-- with `@` are regular-expression macros.
-- binary operators
$arithOp = [\+ \- \* \/ \%]
@relOp = \<\= | \>\= | [\< \>]
@eqOp = \=\= | \!\=
@condOp = \&\& | \|\|
-- @binOp is not referenced by any rule below; each operator class has its own rule.
@binOp = $arithOp | @relOp | @eqOp | @condOp
-- alphabet and digits
$alpha = [a-zA-Z]
$digit = [0-9]
$alphaNum = [$alpha $digit]
$hexDigit = [$digit a-fA-F]
-- @specialChar and @char are only used by the commented-out literal macros
-- further down; string and character literals are scanned with the
-- inString / inChar start codes instead.
@specialChar = \\\" | \\\' | \\\\ | \\t | \\n
@char = $printable # [\" \' \\] | @specialChar
-- literals
@decimalLiteral = $digit+
@hexLiteral = 0x $hexDigit+
@boolLiteral = true | false
@intLiteral = @decimalLiteral | @hexLiteral
-- @charLiteral = \'@char\'
-- @stringLiteral = \"@char*\"
-- @literal = @intLiteral | @charLiteral | @boolLiteral
-- assign and increment
$assignOp = \=
@compoundAssignOp = \+\= | \-\=
@incrementOp = \+\+ | \-\-
-- identifiers and keywords
@id = [$alpha _] [$alphaNum _]*
@keyword = bool | break | import | continue | else | for | while | if
| int | return | len | void
-- whitespaces
$white2 = $white # \f -- we want the scanner to error on '\f' (form feed) characters
-- keyword and identifier separators
-- (used as the left context of the keyword and identifier rules)
$syntaxChars = [\; \, \: \? \! \{\} \[\] \(\) \= \+ \- \* \/ \& \| \% $white2 \> \<]
-- invalid characters for better error handling
-- (any printable character that cannot start or continue a token; the last
-- rule reports these as errors)
$invalid = $printable # [a-z A-Z 0-9 _ $syntaxChars \' \"]
-- rules
tokens :-
-- Start codes used below: 0 (normal code), inComment, inString, inChar.
-- Whitespace (form feed excluded) and line comments produce no token.
<0> $white2+ ;
<0> "//".* ;
-- Block comments. "/*" is also recognised while already inside a comment, and
-- the actions keep a depth counter in the user state, so comments nest.
<0, inComment> "/*" { enterComment `andBegin` inComment }
<inComment> "*/" { exitComment }
<inComment> [.\n] ;
-- String literals: the contents are accumulated in the user state and a
-- single StringLiteral token is produced at the closing quote.
-- NOTE(review): the quoted patterns in the inString / inChar rules ("\\",
-- "\n", "\t", "\'", "\") should be checked against Alex's escaping rules for
-- quoted strings; confirm they match the two-character escape sequences of
-- the source language rather than the raw characters.
<0> \" { enterString `andBegin` inString }
<inString> "\\" { addToString '\\'}
<inString> "\n" { addToString '\n'}
<inString> "\t" { addToString '\t'}
<inString> "\'" { addToString '\''}
<inString> \\\" { addToString '"'}
<inString> \" { exitString }
<inString> [.\n] { addCurrentToString }
-- Character literals: same scheme as strings, but the actions report an
-- error when a second character is added before the closing quote.
<0> "'" { enterChar `andBegin` inChar }
<inChar> "\\" { addToChar '\\'}
<inChar> "\n" { addToChar '\n'}
<inChar> "\t" { addToChar '\t'}
<inChar> "\'" { addToChar '\''}
<inChar> \\\" { addToChar '"'}
<inChar> "'" { exitChar }
<inChar> [.] { addCurrentToChar }
<inString, inChar> "\" { scannerError $ \_ -> "invalid escape sequence" }
-- Literals, keywords and identifiers. `$syntaxChars ^` is a left context:
-- the keyword / identifier rules only apply when the previous character is a
-- separator.
<0> @intLiteral { stringToken IntLiteral }
<0> @boolLiteral { stringToken BooleanLiteral }
<0> $syntaxChars ^ @keyword { stringToken Keyword }
<0> $syntaxChars ^ @id { stringToken Identifier }
-- Operators. Tokens built with stringToken carry their matched text; tokens
-- built with plainToken do not.
<0> $assignOp { plainToken AssignOp }
<0> @compoundAssignOp { stringToken CompoundAssignOp }
<0> @incrementOp { stringToken IncrementOp }
<0> $arithOp { stringToken ArithmeticOp }
<0> @relOp { stringToken RelationOp }
<0> @eqOp { stringToken EquationOp }
<0> @condOp { stringToken ConditionOp }
-- Punctuation.
<0> \{ { plainToken LCurly }
<0> \} { plainToken RCurly }
<0> \( { plainToken LParen }
<0> \) { plainToken RParen }
<0> \[ { plainToken LBrack }
<0> \] { plainToken RBrack }
<0> \? { plainToken Choice }
<0> \: { plainToken Colon }
<0> \; { plainToken Semicolon }
<0> \, { plainToken Comma }
<0> \! { plainToken Negate }
-- Anything else printable is recorded as an error and scanning continues.
<0> $invalid { scannerError $ \s -> BS.concat ["invalid character: ", s] }
{
---------------------------- Alex interface -----------------------------
-- The code below is generated by the Alex wrapper; it is reproduced here for reference only.
{-
data AlexPosn = AlexPn !Int -- absolute character offset
!Int -- line number
!Int -- column number
type AlexInput = (AlexPosn, -- current position,
Char, -- previous char
ByteString.ByteString, -- current input string
Int64) -- bytes consumed so far
data AlexState = AlexState {
alex_pos :: !AlexPosn, -- position at current input location
alex_inp :: ByteString.ByteString, -- the current input
alex_chr :: !Char, -- the character before the input
alex_scd :: !Int, -- the current startcode
alex_ust :: AlexUserState -- AlexUserState will be defined in the user program
}
newtype Alex a = Alex { unAlex :: AlexState
-> Either String (AlexState, a) }
runAlex :: ByteString.ByteString -> Alex a -> Either String a
alexError :: String -> Alex a
alexMonadScan :: Alex a
-- token :: (ByteString.ByteString -> Int -> token) -> AlexAction token
-}
---------------------------- Helper functions for scanning -----------------------------
-- | User-defined part of the lexer state: tracks block-comment nesting, the
-- literal currently being scanned, and the errors recorded so far.
data AlexUserState = AlexUserState
  { lexerCommentDepth :: Int
    -- ^ Nesting depth of block comments; raised by 'enterComment' and
    -- lowered by 'exitComment'.
  , lexerStringState :: Bool
    -- ^ 'True' from 'enterString' until 'exitString'.
  , lexerCharState :: Bool
    -- ^ 'True' from 'enterChar' until 'exitChar'.
  , lexerStringValue :: ByteString
    -- ^ Bytes collected for the string or character literal in progress.
    -- String contents are stored in reverse order ('exitString' reverses
    -- them back).
  , errors :: [CompileError]
    -- ^ Errors recorded through 'addError', oldest first.
  }
-- | Initial user state: not inside any comment or literal, empty literal
-- buffer, no errors.
alexInitUserState :: AlexUserState
alexInitUserState =
  AlexUserState
    { lexerCommentDepth = 0
    , lexerStringState = False
    , lexerCharState = False
    , lexerStringValue = ""
    , errors = []
    }
-- | Convert an Alex position into an 'SL.Posn'. The offset is kept as is;
-- the line and column numbers are each lowered by one.
posnFromAlex :: AlexPosn -> SL.Posn
posnFromAlex (AlexPn off line column) = SL.Posn off (shift line) (shift column)
  where
    shift = subtract 1
-- | Attach the source range delimited by two Alex positions to a value.
locatedAt :: AlexPosn -> AlexPosn -> a -> SL.Located a
locatedAt start stop = SL.LocatedAt range
  where
    range = SL.Range (posnFromAlex start) (posnFromAlex stop)
-- | Return the complete current lexer state without modifying it.
getAlexState :: Alex AlexState
getAlexState = Alex (\st -> Right (st, st))
-- | Token produced at end of input: an 'EOF' whose range starts and ends at
-- the current position.
alexEOF :: Alex (SL.Located Token)
alexEOF = Alex $ \st ->
  let here = alex_pos st
  in Right (st, locatedAt here here EOF)
-- | Read the current block-comment nesting depth from the user state.
getLexerCommentDepth :: Alex Int
getLexerCommentDepth = Alex $ \st -> Right (st, lexerCommentDepth (alex_ust st))
-- | Overwrite the block-comment nesting depth in the user state.
setLexerCommentDepth :: Int -> Alex ()
setLexerCommentDepth depth = Alex $ \st ->
  let ust' = (alex_ust st) { lexerCommentDepth = depth }
  in Right (st { alex_ust = ust' }, ())
-- | Read the literal buffer from the user state.
getLexerStringValue :: Alex ByteString
getLexerStringValue = Alex $ \st -> Right (st, lexerStringValue (alex_ust st))
-- | Overwrite the literal buffer in the user state.
setLexerStringValue :: ByteString -> Alex ()
setLexerStringValue value = Alex $ \st ->
  let ust' = (alex_ust st) { lexerStringValue = value }
  in Right (st { alex_ust = ust' }, ())
-- | Read the "inside a string literal" flag from the user state.
getLexerStringState :: Alex Bool
getLexerStringState = Alex $ \st -> Right (st, lexerStringState (alex_ust st))
-- | Overwrite the "inside a string literal" flag in the user state.
setLexerStringState :: Bool -> Alex ()
setLexerStringState state = Alex $ \st ->
  let ust' = (alex_ust st) { lexerStringState = state }
  in Right (st { alex_ust = ust' }, ())
-- | Read the "inside a character literal" flag from the user state.
getLexerCharState :: Alex Bool
getLexerCharState = Alex $ \st -> Right (st, lexerCharState (alex_ust st))
-- | Overwrite the "inside a character literal" flag in the user state.
setLexerCharState :: Bool -> Alex ()
setLexerCharState state = Alex $ \st ->
  let ust' = (alex_ust st) { lexerCharState = state }
  in Right (st { alex_ust = ust' }, ())
-- | Record a compile error in the user state. The new error is appended
-- after the ones already recorded, so the list stays in order of occurrence.
addError :: CompileError -> Alex ()
addError err = Alex $ \st ->
  let ust = alex_ust st
      recorded = errors ust ++ [err]
  in Right (st { alex_ust = ust { errors = recorded } }, ())
-- | Compute the position reached after consuming the given token text from
-- the given start position. The bytes are decoded as UTF-8 and the position
-- is advanced with 'alexMove' once per decoded character.
getTokenStop :: AlexPosn -> ByteString -> AlexPosn
getTokenStop posn inp = Text.foldl' alexMove posn decoded
  where
    decoded = Enc.decodeUtf8 (BS.toStrict inp)
----- Scanning functions ------
-- | Type shared by all rule actions in this file: an action receives the
-- lexer input and a length (used as a byte count with @BS.take@ by the
-- actions below) and produces a located token in the 'Alex' monad.
type Action = AlexInput -> Int64 -> Alex (SL.Located Token)
-- | Build an action for tokens that carry their matched text: the first
-- @len@ bytes of the input are decoded as UTF-8, passed to the token
-- constructor, and the result is tagged with the range of the match.
stringToken :: (Text -> Token) -> Action
stringToken tok (start, _, inp, _) len = return (locatedAt start stop (tok matchedText))
  where
    matched = BS.take len inp
    stop = getTokenStop start matched
    matchedText = Enc.decodeUtf8 (BS.toStrict matched)
-- | Build an action for tokens without a payload: the given token is tagged
-- with the range covered by the first @len@ bytes of the input.
plainToken :: Token -> Action
plainToken tok (start, _, inp, _) len = return (locatedAt start stop tok)
  where
    stop = getTokenStop start (BS.take len inp)
-- | Action for an opening block-comment marker: increase the nesting depth
-- by one and continue scanning without producing a token.
enterComment :: Action
enterComment inp len = do
  getLexerCommentDepth >>= setLexerCommentDepth . (+ 1)
  skip inp len
-- | Action for a closing block-comment marker: decrease the nesting depth by
-- one, return to start code 0 when the outermost comment has just been
-- closed, and continue scanning without producing a token.
exitComment :: Action
exitComment inp len = do
  depth <- getLexerCommentDepth
  let remaining = depth - 1
  setLexerCommentDepth remaining
  -- Only leave comment mode once no enclosing comment remains.
  when (remaining == 0) $ alexSetStartCode 0
  skip inp len
-- | Action for an opening double quote: mark the lexer as being inside a
-- string literal, clear the literal buffer, and continue scanning.
enterString :: Action
enterString inp len =
  setLexerStringState True
    >> setLexerStringValue ""
    >> skip inp len
-- | Action for a closing double quote: clear the string flag, return to
-- start code 0 and emit a 'StringLiteral' holding the collected contents.
-- The buffer is stored reversed, so it is reversed back before decoding.
-- The token's range starts and ends at the position passed to this action.
exitString :: Action
exitString (pos, _, _, _) _ = do
  reversed <- getLexerStringValue
  setLexerStringState False
  alexSetStartCode 0
  let contents = Enc.decodeUtf8 (BS.toStrict (BS.reverse reversed))
  return (locatedAt pos pos (StringLiteral contents))
-- | Action that adds one fixed character to the string literal in progress.
-- The character is prepended, so the buffer holds the contents in reverse.
addToString :: Char -> Action
addToString c inp len = do
  setLexerStringValue . C8.cons c =<< getLexerStringValue
  skip inp len
-- | Action that adds the text matched by the current rule to the string
-- literal in progress.
--
-- All @len@ matched bytes are stored, not just the first one: a non-ASCII
-- character occupies several bytes in the UTF-8 input, and keeping only its
-- first byte would leave an invalid sequence in the buffer that fails when
-- the literal is decoded. The buffer is kept in reverse byte order (it is
-- reversed back when the literal is closed), so the matched bytes are
-- reversed before being prepended.
addCurrentToString :: Action
addCurrentToString inp@(_, _, str, _) len = do
  value <- getLexerStringValue
  setLexerStringValue $ BS.append (BS.reverse (BS.take len str)) value
  skip inp len
-- | Action for an opening single quote: mark the lexer as being inside a
-- character literal, clear the literal buffer, and continue scanning.
enterChar :: Action
enterChar inp len =
  setLexerCharState True
    >> setLexerStringValue ""
    >> skip inp len
-- | Action for a closing single quote: clear the character flag, return to
-- start code 0 and emit a 'CharLiteral' holding the buffered contents.
-- The token's range starts and ends at the position passed to this action.
exitChar :: Action
exitChar (pos, _, _, _) _ = do
  buffered <- getLexerStringValue
  setLexerCharState False
  alexSetStartCode 0
  let contents = Enc.decodeUtf8 (BS.toStrict buffered)
  return (locatedAt pos pos (CharLiteral contents))
-- | Action that stores one fixed character as the contents of the character
-- literal in progress. If the buffer already holds something, a
-- "character literal not closed" error is recorded instead and the buffer is
-- left untouched.
addToChar :: Char -> Action
addToChar c inp len = do
  value <- getLexerStringValue
  if BS.null value
    then setLexerStringValue (C8.pack [c]) >> skip inp len
    else scannerError (\_ -> "character literal not closed") inp len
-- | Action that stores the text matched by the current rule as the contents
-- of the character literal in progress.
--
-- All @len@ matched bytes are stored, not just the first one: a non-ASCII
-- character occupies several bytes in the UTF-8 input, and keeping only its
-- first byte would leave an invalid sequence in the buffer that fails when
-- the literal is decoded. As with 'addToChar', a
-- "character literal not closed" error is recorded when the buffer already
-- holds a character.
addCurrentToChar :: Action
addCurrentToChar inp@(_, _, str, _) len = do
  value <- getLexerStringValue
  if BS.null value
    then setLexerStringValue (BS.take len str) >> skip inp len
    else scannerError (\_ -> "character literal not closed") inp len
-- | Build an action that records a compile error for the matched text and
-- then continues scanning. The supplied function turns the matched bytes
-- into the error message; the error's range covers the match.
scannerError :: (ByteString -> ByteString) -> Action
scannerError fn inp@(start, _, text, _) len = do
  addError (CompileError (Just range) message)
  skip inp len
  where
    content = BS.take len text
    stop = getTokenStop start content
    range = SL.Range (posnFromAlex start) (posnFromAlex stop)
    message = Enc.decodeUtf8 (BS.toStrict (fn content))
}