Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse numbers in Alex's parser, not tokenizer #200

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 17 additions & 14 deletions src/Parser.y
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ import Data.Char
BIND { T _ (BindT $$) }
ID { T _ (IdT $$) }
CODE { T _ (CodeT _) }
DIGIT { T _ (DigitT $$) }
CHAR { T _ (CharT $$) }
NUM { T _ (NumT $$) }
SMAC { T _ (SMacT _) }
RMAC { T _ (RMacT $$) }
SMAC_DEF { T _ (SMacDefT $$) }
Expand Down Expand Up @@ -174,15 +174,9 @@ rep :: { RExp -> RExp }
: '*' { Star }
| '+' { Plus }
| '?' { Ques }
-- Single digits are CHAR, not NUM.
-- TODO: these don't check for digits
-- properly.
| '{' CHAR '}' { repeat_rng (digit $2) Nothing }
| '{' CHAR ',' '}' { repeat_rng (digit $2) (Just Nothing) }
| '{' CHAR ',' CHAR '}' { repeat_rng (digit $2) (Just (Just (digit $4))) }
| '{' NUM '}' { repeat_rng $2 Nothing }
| '{' NUM ',' '}' { repeat_rng $2 (Just Nothing) }
| '{' NUM ',' NUM '}' { repeat_rng $2 (Just (Just $4)) }
| '{' natnum '}' { repeat_rng $2 Nothing }
| '{' natnum ',' '}' { repeat_rng $2 (Just Nothing) }
| '{' natnum ',' natnum '}' { repeat_rng $2 (Just (Just $4)) }

rexp0 :: { RExp }
: '(' ')' { Eps }
Expand All @@ -197,8 +191,8 @@ set :: { CharSet }
| set0 { $1 }

set0 :: { CharSet }
: CHAR { charSetSingleton $1 }
| CHAR '-' CHAR { charSetRange $1 $3 }
: char { charSetSingleton $1 }
| char '-' char { charSetRange $1 $3 }
| smac {% lookupSMac $1 }
| '[' sets ']' { foldr charSetUnion emptyCharSet $2 }

Expand All @@ -222,15 +216,24 @@ smac :: { (AlexPosn,String) }
: '.' { (tokPosn $1, ".") }
| SMAC { case $1 of T p (SMacT s) -> (p, s) }

-- A single character as used in a character set: either a DIGIT token or a
-- CHAR token. Both tokens carry a Char, which is returned unchanged.
char :: { Char }
: DIGIT { $1 }
| CHAR { $1 }

-- A decimal natural number written as a sequence of digits, most significant
-- digit first.
--
-- The rule is left-recursive: in the second alternative $1 is the value of
-- all digits read so far and $2 is the newest (least significant) digit, so
-- the accumulated value is shifted one decimal place before the digit is
-- added.
--
-- NOTE(review): each digit is a separate token, so input such as `1 4` is
-- presumably also accepted as 14 if the scanner skips whitespace between
-- tokens -- confirm whether that is acceptable.
natnum :: { Int }
  : digit        { $1 }
  | natnum digit { $1 * 10 + $2 }
Copy link
Member

@andreasabel andreasabel Jan 23, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be $1 * 10 + $2.
Is this what breaks #141: https://github.com/simonmar/alex/runs/4911840593?check_suite_focus=true#step:22:116 ?
Anyway, fixed this in #201.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh yes, thanks.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I realized another problem with recognizing numbers in the parser: natnum happily accepts digits separated by spaces, so 1 4 in r{1 4,1 4} is now accepted as 14 repetitions of r.
I do not think we want to allow that.
How about a middle ground: recognizing numbers in the scanner, but storing them as String rather than Integer, so we can turn them back into character sequences? I'll play with this solution in #199 a bit...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can use lexer states to only lex numerals inside the multiplicity-brackets {nnn,mmm}.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, and both of those look interesting to me.


-- The numeric value of a single DIGIT token, obtained by applying
-- digitToInt to the character the token carries.
digit :: { Int }
: DIGIT { digitToInt $1 }

{
-- Parse-error handler: fails in the P monad via failP with the fixed
-- message "parse error".
happyError :: P a
happyError = failP "parse error"

-- -----------------------------------------------------------------------------
-- Utils

-- Numeric value of a decimal digit character, computed as the distance of
-- its code point from that of '0'. The argument is not checked to be a
-- digit, so other characters yield out-of-range results.
digit c = subtract (ord '0') (ord c)

repeat_rng :: Int -> Maybe (Maybe Int) -> (RExp->RExp)
repeat_rng n (Nothing) re = foldr (:%%) Eps (replicate n re)
repeat_rng n (Just Nothing) re = foldr (:%%) (Star re) (replicate n re)
Expand Down
6 changes: 3 additions & 3 deletions src/Scan.x
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ alex :-
<0> \\ x $hexdig+ { hexch }
<0> \\ o $octal+ { octch }
<0> \\ $printable { escape }
<0> $digit { digit } -- should be before char
<0> $nonspecial # [\<] { char } -- includes 1 digit numbers
<0> $digit+ { num } -- should be after char
<0> @smac { smac }
<0> @rmac { rmac }

Expand Down Expand Up @@ -92,12 +92,12 @@ data Tkn
| IdT String
| StringT String
| BindT String
| DigitT Char
| CharT Char
| SMacT String
| RMacT String
| SMacDefT String
| RMacDefT String
| NumT Int
| WrapperT
| EncodingT
| ActionTypeT
Expand All @@ -121,7 +121,7 @@ decch (p,_,str) ln = return $ T p (CharT (do_ech 10 ln (take (ln-1) (tail st
-- Token actions. Each one takes the lexer input triple, of which only the
-- position p and the input string str are used, plus a second argument ln,
-- and returns a token T p <payload>.
-- NOTE(review): ln looks like the length of the matched text (num takes
-- the first ln characters of str) -- confirm against the action type.

-- Escaped hex / octal character codes: the two leading characters of the
-- match (backslash plus x or o, per the rules that use these actions) are
-- dropped and the remaining ln-2 characters are passed to do_ech with the base.
hexch (p,_,str) ln = return $ T p (CharT (do_ech 16 ln (take (ln-2) (drop 2 str))))
octch (p,_,str) ln = return $ T p (CharT (do_ech 8 ln (take (ln-2) (drop 2 str))))
-- A plain character: the first character of the input.
char (p,_,str) _ = return $ T p (CharT (head str))
-- A number: the first ln characters of the input, converted by parseInt in base 10.
num (p,_,str) ln = return $ T p $ NumT $ parseInt 10 $ take ln str
-- A single digit, kept as a character rather than converted to a number.
digit (p,_,str) _ = return $ T p (DigitT (head str))
-- Macro uses and macro definitions; the name is extracted by mac / macdef.
smac (p,_,str) ln = return $ T p (SMacT (mac ln str))
rmac (p,_,str) ln = return $ T p (RMacT (mac ln str))
smacdef (p,_,str) ln = return $ T p (SMacDefT (macdef ln str))
Expand Down
1 change: 1 addition & 0 deletions tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ TESTS = \
issue_71.x \
issue_119.x \
issue_141.x \
issue_197.x \
monad_typeclass.x \
monad_typeclass_bytestring.x \
monadUserState_typeclass.x \
Expand Down
43 changes: 43 additions & 0 deletions tests/issue_197.x
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{
-- Issue #197
-- reported 2022-01-21 by https://github.com/Commelina
-- fixed 2022-01-23 by Andreas Abel & John Ericson
--
-- Problem was:
-- Surface syntax regressed and could no longer handle character strings
-- that looked like numbers.

module Main (main) where

import System.Exit
}

-- Use the "posn" wrapper and declare Token as the token type.
%wrapper "posn"
%token "Token"

-- 32, 64 or 128, optionally followed by x.
@iec60559suffix = (32|64|128)[x]?
-- Any run of decimal digits, optionally followed by x.
@any = [0-9]+[x]?

:-

-- Whitespace produces no token.
$white+ ;
-- Both macros can match the same text (e.g. 32); the test expects such input
-- to be tagged Good, so this rule has to win over the @any rule below it.
-- Each action ignores its first argument and wraps the matched string.
@iec60559suffix { \ _ -> Good }
@any { \ _ -> Bad }

{
-- | Result of scanning one lexeme: the matched text, tagged 'Good' when the
-- @iec60559suffix rule accepted it and 'Bad' when the @any rule did.
data Token
  = Good String
  | Bad String
  deriving (Eq, Show)

-- | The text fed to the scanner: five lexemes separated by single spaces.
input :: String
input = unwords ["32", "32x", "99", "99x", "128x"]
-- | The tokens the scanner must produce for 'input', in order.
expected_result :: [Token]
expected_result =
  [ Good "32"
  , Good "32x"
  , Bad "99"
  , Bad "99x"
  , Good "128x"
  ]

-- | Scan 'input' with the generated scanner. Exit successfully when the
-- tokens equal 'expected_result'; otherwise print the actual tokens and exit
-- with a failure code.
main :: IO ()
main =
  if tokens == expected_result
    then exitSuccess
    else print tokens >> exitFailure
  where
    tokens = alexScanTokens input
}