Skip to content

Commit 1f3d20a

Browse files
committed
Parse numbers in Alex's parser, not tokenizer
In different contexts within Alex's surface syntax, something like "2340898" might be a string of characters or a number. The contexts are only distinguished at the grammar level, not the token level, so this more or less (we could do very layer-violation-y tricks) precludes lexing entire number literals. Instead of a number token, we have a digit token. This we treat as a "sub-token", making a `DIGIT | CHAR` non-terminal we use everywhere we want to parse a character. For number literals, we just parse a non-empty string of digits, and the left recursion makes the `* 10` elegant. Fixes #197
1 parent e4843f2 commit 1f3d20a

File tree

4 files changed

+64
-17
lines changed

4 files changed

+64
-17
lines changed

Diff for: src/Parser.y

+17-14
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,8 @@ import Data.Char
5353
BIND { T _ (BindT $$) }
5454
ID { T _ (IdT $$) }
5555
CODE { T _ (CodeT _) }
56+
DIGIT { T _ (DigitT $$) }
5657
CHAR { T _ (CharT $$) }
57-
NUM { T _ (NumT $$) }
5858
SMAC { T _ (SMacT _) }
5959
RMAC { T _ (RMacT $$) }
6060
SMAC_DEF { T _ (SMacDefT $$) }
@@ -174,15 +174,9 @@ rep :: { RExp -> RExp }
174174
: '*' { Star }
175175
| '+' { Plus }
176176
| '?' { Ques }
177-
-- Single digits are CHAR, not NUM.
178-
-- TODO: these don't check for digits
179-
-- properly.
180-
| '{' CHAR '}' { repeat_rng (digit $2) Nothing }
181-
| '{' CHAR ',' '}' { repeat_rng (digit $2) (Just Nothing) }
182-
| '{' CHAR ',' CHAR '}' { repeat_rng (digit $2) (Just (Just (digit $4))) }
183-
| '{' NUM '}' { repeat_rng $2 Nothing }
184-
| '{' NUM ',' '}' { repeat_rng $2 (Just Nothing) }
185-
| '{' NUM ',' NUM '}' { repeat_rng $2 (Just (Just $4)) }
177+
| '{' natnum '}' { repeat_rng $2 Nothing }
178+
| '{' natnum ',' '}' { repeat_rng $2 (Just Nothing) }
179+
| '{' natnum ',' natnum '}' { repeat_rng $2 (Just (Just $4)) }
186180
187181
rexp0 :: { RExp }
188182
: '(' ')' { Eps }
@@ -197,8 +191,8 @@ set :: { CharSet }
197191
| set0 { $1 }
198192
199193
set0 :: { CharSet }
200-
: CHAR { charSetSingleton $1 }
201-
| CHAR '-' CHAR { charSetRange $1 $3 }
194+
: char { charSetSingleton $1 }
195+
| char '-' char { charSetRange $1 $3 }
202196
| smac {% lookupSMac $1 }
203197
| '[' sets ']' { foldr charSetUnion emptyCharSet $2 }
204198
@@ -222,15 +216,24 @@ smac :: { (AlexPosn,String) }
222216
: '.' { (tokPosn $1, ".") }
223217
| SMAC { case $1 of T p (SMacT s) -> (p, s) }
224218
219+
char :: { Char }
220+
: DIGIT { $1 }
221+
| CHAR { $1 }
222+
223+
-- A non-empty run of DIGIT tokens read as a base-10 natural number.
-- The rule is left-recursive: $1 is the value of the digits consumed so
-- far (the more significant prefix) and $2 is the next, least significant
-- digit, so the prefix is the operand that gets scaled by 10.
natnum :: { Int }
224+
: digit { $1 }
225+
| natnum digit { $1 * 10 + $2 }
226+
227+
digit :: { Int }
228+
: DIGIT { digitToInt $1 }
229+
225230
{
226231
happyError :: P a
227232
happyError = failP "parse error"
228233
229234
-- -----------------------------------------------------------------------------
230235
-- Utils
231236
232-
digit c = ord c - ord '0'
233-
234237
repeat_rng :: Int -> Maybe (Maybe Int) -> (RExp->RExp)
235238
repeat_rng n (Nothing) re = foldr (:%%) Eps (replicate n re)
236239
repeat_rng n (Just Nothing) re = foldr (:%%) (Star re) (replicate n re)

Diff for: src/Scan.x

+3-3
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ alex :-
5656
<0> \\ x $hexdig+ { hexch }
5757
<0> \\ o $octal+ { octch }
5858
<0> \\ $printable { escape }
59+
<0> $digit { num } -- should be before char
5960
<0> $nonspecial # [\<] { char } -- would also match a single digit, but the $digit rule is placed before this one so digits are emitted as DigitT
60-
<0> $digit+ { num } -- should be after char
6161
<0> @smac { smac }
6262
<0> @rmac { rmac }
6363
@@ -92,12 +92,12 @@ data Tkn
9292
| IdT String
9393
| StringT String
9494
| BindT String
95+
| DigitT Char
9596
| CharT Char
9697
| SMacT String
9798
| RMacT String
9899
| SMacDefT String
99100
| RMacDefT String
100-
| NumT Int
101101
| WrapperT
102102
| EncodingT
103103
| ActionTypeT
@@ -121,7 +121,7 @@ decch (p,_,str) ln = return $ T p (CharT (do_ech 10 ln (take (ln-1) (tail st
121121
hexch (p,_,str) ln = return $ T p (CharT (do_ech 16 ln (take (ln-2) (drop 2 str))))
122122
octch (p,_,str) ln = return $ T p (CharT (do_ech 8 ln (take (ln-2) (drop 2 str))))
123123
char (p,_,str) _ = return $ T p (CharT (head str))
124-
num (p,_,str) ln = return $ T p $ NumT $ parseInt 10 $ take ln str
124+
num (p,_,str) _ = return $ T p (DigitT (head str))
125125
smac (p,_,str) ln = return $ T p (SMacT (mac ln str))
126126
rmac (p,_,str) ln = return $ T p (RMacT (mac ln str))
127127
smacdef (p,_,str) ln = return $ T p (SMacDefT (macdef ln str))

Diff for: tests/Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ TESTS = \
5555
issue_71.x \
5656
issue_119.x \
5757
issue_141.x \
58+
issue_197.x \
5859
monad_typeclass.x \
5960
monad_typeclass_bytestring.x \
6061
monadUserState_typeclass.x \

Diff for: tests/issue_197.x

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
{
2+
-- Issue #197
3+
-- reported 2022-01-21 by https://github.com/Commelina
4+
-- fixed 2022-01-23 by Andreas Abel & John Ericson
5+
--
6+
-- Problem was:
7+
-- Surface syntax regressed and could no longer handle character strings
8+
-- that looked like numbers.
9+
10+
module Main (main) where
11+
12+
import System.Exit
13+
}
14+
15+
%wrapper "posn"
16+
%token "Token"
17+
18+
@iec60559suffix = (32|64|128)[x]?
19+
@any = [0-9]+[x]?
20+
21+
:-
22+
23+
$white+ ;
24+
@iec60559suffix { \ _ -> Good }
25+
@any { \ _ -> Bad }
26+
27+
{
28+
data Token = Good String | Bad String
29+
deriving (Eq, Show)
30+
31+
input = "32 32x 99 99x 128x"
32+
expected_result = [Good "32", Good "32x", Bad "99", Bad "99x", Good "128x"]
33+
34+
main :: IO ()
35+
main
36+
| result == expected_result = do
37+
exitWith ExitSuccess
38+
| otherwise = do
39+
print result
40+
exitFailure
41+
where
42+
result = alexScanTokens input
43+
}

0 commit comments

Comments
 (0)