From d4790ca1a0fa4908b3e55a0f5eebe19d460df8d9 Mon Sep 17 00:00:00 2001 From: jared <> Date: Tue, 17 Oct 2023 02:46:46 -0600 Subject: [PATCH 01/10] Restructured frontend parser - Uniform handling of whitespace - Added a syntactic specification in the comments - Updated test suite to reflect the changes --- .../src/LambdaBuffers/Frontend/Parsec.hs | 460 +++++++++++++----- .../src/LambdaBuffers/Frontend/Syntax.hs | 31 ++ .../test/Test/LambdaBuffers/Frontend.hs | 2 +- .../Test/LambdaBuffers/Frontend/Parsec.hs | 93 ++-- 4 files changed, 431 insertions(+), 155 deletions(-) diff --git a/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs b/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs index b2e93cf8..6d02d139 100644 --- a/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs +++ b/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs @@ -13,52 +13,263 @@ module LambdaBuffers.Frontend.Parsec ( parseDerive, parseClassDef, parseClassSups, + junk, ) where import Control.Applicative (Alternative ((<|>))) -import Control.Monad (MonadPlus (mzero), void) +import Control.Monad (MonadPlus (mzero), void, when) import Data.Char qualified as Char import Data.Kind (Type) import Data.Maybe (fromJust, isJust) import Data.String (IsString (fromString)) import LambdaBuffers.Compiler.NamingCheck (pClassName, pConstrName, pFieldName, pModuleNamePart, pTyName) -import LambdaBuffers.Frontend.Syntax (ClassConstraint (ClassConstraint), ClassDef (ClassDef), ClassName (ClassName), ClassRef (ClassRef), ConstrName (ConstrName), Constraint (Constraint), Constructor (Constructor), Derive (Derive), Field (Field), FieldName (FieldName), Import (Import), InstanceClause (InstanceClause), Module (Module), ModuleAlias (ModuleAlias), ModuleName (ModuleName), ModuleNamePart (ModuleNamePart), Name (Name), Product (Product), Record (Record), SourceInfo (SourceInfo), SourcePos (SourcePos), Statement (StClassDef, StDerive, StInstanceClause, StTyDef), Sum (Sum), Ty (TyApp, 
TyRef', TyVar), TyArg (TyArg), TyBody (Opaque, ProductBody, RecordBody, SumBody), TyDef (TyDef), TyName (TyName), TyRef (TyRef), VarName (VarName), kwClassDef, kwDerive, kwInstance, kwTyDefOpaque, kwTyDefProduct, kwTyDefRecord, kwTyDefSum) -import Text.Parsec (ParseError, ParsecT, SourceName, Stream, between, char, endOfLine, eof, getPosition, label, lower, many, many1, optionMaybe, optional, runParserT, satisfy, sepBy, sepEndBy, sourceColumn, sourceLine, sourceName, space, string, try) +import LambdaBuffers.Frontend.Syntax (ClassConstraint (ClassConstraint), ClassDef (ClassDef), ClassName (ClassName), ClassRef (ClassRef), ConstrName (ConstrName), Constraint (Constraint), Constructor (Constructor), Derive (Derive), Field (Field), FieldName (FieldName), Import (Import), InstanceClause (InstanceClause), Module (Module), ModuleAlias (ModuleAlias), ModuleName (ModuleName), ModuleNamePart (ModuleNamePart), Name (Name), Product (Product), Record (Record), SourceInfo (SourceInfo, to), SourcePos (SourcePos), Statement (StClassDef, StDerive, StInstanceClause, StTyDef), Sum (Sum), Ty (TyApp, TyRef', TyVar), TyArg (TyArg), TyBody (Opaque, ProductBody, RecordBody, SumBody), TyDef (TyDef), TyName (TyName), TyRef (TyRef), VarName (VarName), kwAs, kwClassDef, kwDerive, kwImport, kwInstance, kwModule, kwQualified, kwTyDefOpaque, kwTyDefProduct, kwTyDefRecord, kwTyDefSum, kws) +import Text.Parsec (ParseError, ParsecT, SourceName, Stream, alphaNum, between, char, endOfLine, eof, getPosition, label, lower, many, many1, manyTill, notFollowedBy, optionMaybe, runParserT, satisfy, sepBy, sepEndBy, sourceColumn, sourceLine, sourceName, space, string, try, unexpected, ()) type Parser :: Type -> (Type -> Type) -> Type -> Type type Parser s m a = ParsecT s () m a +-- Note: Syntactic Form of Lambda Buffer Files. +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- The notational conventions used to present the syntax is based off of [1]. 
+-- So, these notational conventions are used for presenting syntax. +-- +-- - [ pattern ] optional +-- +-- - { pattern } zero or more repetitions +-- +-- - ( pattern ) grouping +-- +-- - pat1 | pat2 choice +-- +-- - pat1\pat2 difference -- elements generated by pat1 except those +-- generated by pat2. +-- +-- - 'terminal' terminal syntax +-- +-- - // comment comment +-- +-- Productions will be of the form +-- - nonterm -> alt1 | ... | altn +-- +-- +-- Tokens form the vocabulary of Lambda Buffer Files. There are classes of +-- tokens (keyword, modulename, longmodulename, tyname, longtyname, varname, +-- punctuation, fieldname, classname, longclassname) as follows. +-- +-- keyword -> 'module' | 'sum' | 'prod' | 'record' | 'opaque' | 'class' | 'instance' | 'derive' | 'import' | 'qualified' | 'as' +-- modulename -> upperCamelCase +-- longmodulename -> long modulename +-- tyname -> upperCamelCase +-- fieldname -> lowerCamelCase // keywords are currently accepted as field names +-- longtyname -> long tyname +-- varname -> lowers\keyword +-- punctuation -> '<=' | ',' | '.' | '(' | ')' | '{' | '}' | ':' | ':-' | '=' | '|' +-- classname -> upperCamelCase +-- longclassname -> long upperCamelCase +-- +-- upperCamelCase -> upper { alphaNum } +-- lowerCamelCase -> lower { alphaNum } +-- long -> { upperCamelCase '.' } +-- upper -> // upper case or title case alphabetic unicode characters (letters) +-- lower -> // lower case alphabetic unicode characters (letters) +-- lowers -> lower { lower } +-- alphaNum -> // alphabetic or numeric unicode characters +-- +-- Input files are broken into *tokens* which are delimited by whitespace or +-- line comments. At each point, the longest possible token satisfying the +-- token definitions is read. +-- +-- Finally, the grammar for Lambda Buffer Files is as follows. 
+-- +-- module -> 'module' longmodulename imports statements +-- +-- import -> 'import' [ 'qualified' ] longmodulename +-- [ 'as' longmodulename ] +-- [ '(' [ { tyname ',' } tyname [','] ] ')' ] +-- imports -> { import } +-- +-- statements -> { statement } +-- statement -> tydef +-- | classdef +-- | instanceclause +-- | derivedef +-- +-- tydef -> sumtydef | prodtydef | recordtydef | opaquetydef +-- +-- sumtydef -> 'sum' tyname { varname } '=' sum +-- sum -> sumconstructor { '|' sumconstructor } +-- sumconstructor -> tyname prod +-- +-- prodtydef -> 'prod' tyname { varname } '=' prod +-- prod -> { tyexpr } +-- tyexpr -> varname +-- | longtyname +-- | '(' prod ')' +-- +-- recordtydef -> 'record' tyname { varname } '=' record +-- record -> '{' [ field { ',' field } ] '}' +-- field -> fieldname ':' prod +-- +-- opaquetydef -> 'opaque' tyname { varname } +-- +-- classdef -> 'class' [ classexps '<=' ] classname { varname } +-- // Warning: this part makes it not LL(1)! +-- // In the future, we should shift to some form of +-- // an LALR(1) parser. +-- classexp -> classref { varname } +-- | '(' classexps ')' +-- classexps -> [ classexp { ',' classexp } ] +-- +-- instanceclause -> 'instance' constraint [ ':-' classexps ] +-- constraint -> classref { tyexpr } +-- +-- derivedef -> 'derive' constraint +-- +-- References. +-- [1] Haskell 2010 Language Report by Simon Marlow +-- +-- Note: Parser Implementation. +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- We use Parsec [1] to parse the grammar. +-- +-- We have the following invariant. +-- +-- - Whitespace Invariant: Each parser @pa@ assumes that it starts at a +-- non-whitespace character where whitespace is defined by the parser 'junk' +-- i.e., whitespace or comments. +-- +-- Remark. +-- The Whitespace Invariant is originally from [2]. +-- +-- For the Whitespace Invariant to be initially true, @'runParser' pa@ calls +-- 'junk', then @pa@, then 'Text.Parsec.eof' which ensures: +-- +-- 1. 
the Whitespace Invariant is initially true for the parser @pa@; and +-- +-- 2. the entire input is consumed. +-- +-- Then, to ensure that all "subparsers" maintain the Whitespace Invariant, we +-- introduce the parser combinator 'token' for which @'token' pa@ runs @pa@, +-- then runs 'junk' to ensure that any following parsers will start at a non +-- whitespace character. +-- +-- Thus, if we want to parse the string @"pomeranian"@ we should write +-- +-- > token (Text.Parsec.string "pomeranian") +-- +-- instead of +-- +-- > -- do NOT do this since it will NOT maintain the Whitespace Invariant. +-- > Text.Parsec.string "pomeranian" +-- +-- References. +-- +-- [1] Parsec: Direct Style Monadic Parser Combinators For the Real World by +-- Daan Leijen and Erik Meijer +-- +-- [2] Monadic Parser Combinators by Graham Hutton and Erik Meijer + +-- * Primitives + +{- | @'token' pa@ runs the parser @pa@ with 'try' followed by 'junk' to remove + whitespace. Moreover, this gets the SourceInfo of the parsed token w/o the + whitespace + + See [Note: Parser Implementation]. +-} +token :: Stream s m Char => Parser s m a -> Parser s m (SourceInfo, a) +token pa = withSourceInfo (try $ fmap (\a srcInfo -> (srcInfo, a)) pa) <* junk + +token' :: Stream s m Char => Parser s m a -> Parser s m a +token' = fmap snd . token + +{- | 'junk' skips whitespace and comments. + + See [Note: Parser Implementation]. +-} +junk :: forall s m. Stream s m Char => Parser s m () +junk = void (many (spaces1 <|> comment)) + where + spaces1 :: Parser s m () + spaces1 = void $ many1 (space "") + + comment :: Parser s m () + comment = + void $ + try (string "--" "") + *> + -- Note: the 'try' for 'endOfLine' is necessary because of the + -- overlapping instances of both parsers as we may note that + -- 'endOfLine' parses \r\n and \n. + manyTill (satisfy Char.isPrint) (try endOfLine) + +{- | 'keyword' parses the provided keyword ensuring that the keyword does *not* + overlap with varname tokens. 
+-} +keyword :: Stream s m Char => String -> Parser s m () +keyword k = void $ string k *> notFollowedBy alphaNum + runParser :: (Stream s IO Char) => Parser s IO a -> SourceName -> s -> IO (Either ParseError a) -runParser p = runParserT (p <* eof) () +runParser p = runParserT (junk *> p <* eof) () + +-- * Lexical elements parseModuleNamePart :: Stream s m Char => Parser s m (ModuleNamePart SourceInfo) parseModuleNamePart = withSourceInfo . label' "module part name" $ ModuleNamePart <$> pModuleNamePart parseModuleName :: Stream s m Char => Parser s m (ModuleName SourceInfo) -parseModuleName = withSourceInfo . label' "module name" $ ModuleName <$> sepBy (try parseModuleNamePart) (try $ char '.') +parseModuleName = withSourceInfo . label' "module name" $ ModuleName <$> sepBy parseModuleNamePart (char '.') + +tokenModuleName :: Stream s m Char => Parser s m (ModuleName SourceInfo) +tokenModuleName = token' parseModuleName parseTyVarName :: Stream s m Char => Parser s m (VarName SourceInfo) -parseTyVarName = withSourceInfo . label' "type variable name" $ VarName . fromString <$> many1 lower +parseTyVarName = withSourceInfo . label' "type variable name" $ do + v <- many1 lower + notKeyword v + return . VarName . fromString $ v + +-- | 'notKeyword' tests if the string is not a keyword -- failing otherwise. +notKeyword :: Stream s m Char => String -> Parser s m () +notKeyword v = when (v `elem` kws) $ unexpected "keyword" +-- | 'parseName' is a class or a type name parseName :: Stream s m Char => Parser s m (Name SourceInfo) parseName = withSourceInfo . label' "either class or type name" $ Name <$> pTyName +tokenName :: Stream s m Char => Parser s m (Name SourceInfo) +tokenName = token' parseName + parseTyName :: Stream s m Char => Parser s m (TyName SourceInfo) parseTyName = withSourceInfo . 
label' "type name" $ TyName <$> pTyName +tokenTyName :: Stream s m Char => Parser s m (TyName SourceInfo) +tokenTyName = token' parseTyName + parseClassName :: Stream s m Char => Parser s m (ClassName SourceInfo) parseClassName = withSourceInfo . label' "class name" $ ClassName <$> pClassName +tokenClassName :: Stream s m Char => Parser s m (ClassName SourceInfo) +tokenClassName = token' parseClassName + parseModuleAliasInRef :: Stream s m Char => Parser s m (ModuleAlias SourceInfo) parseModuleAliasInRef = withSourceInfo . label' "module alias in type or class reference" $ ModuleAlias <$> do + -- some awkwardness with the 'try' here. + -- Ideally, we should use the @.@ to be the first set to determine when + -- to stop parsing this... but oh well... ps <- many1 (try (parseModuleNamePart <* char '.')) withSourceInfo . return $ ModuleName ps parseModuleAliasInImport :: Stream s m Char => Parser s m (ModuleAlias SourceInfo) parseModuleAliasInImport = withSourceInfo . label' "module alias in module import" $ ModuleAlias <$> parseModuleName +tokenModuleAliasInImport :: Stream s m Char => Parser s m (ModuleAlias SourceInfo) +tokenModuleAliasInImport = token' parseModuleAliasInImport + parseTyRef' :: Stream s m Char => Parser s m (TyRef SourceInfo) parseTyRef' = withSourceInfo . label' "type reference" $ do mayAlias <- optionMaybe parseModuleAliasInRef @@ -67,9 +278,53 @@ parseTyRef' = withSourceInfo . label' "type reference" $ do parseTyVar :: Stream s m Char => Parser s m (Ty SourceInfo) parseTyVar = label' "type variable" $ TyVar <$> parseTyVarName +tokenTyVar :: Stream s m Char => Parser s m (Ty SourceInfo) +tokenTyVar = token' parseTyVar + parseTyRef :: Stream s m Char => Parser s m (Ty SourceInfo) parseTyRef = withSourceInfo . 
label' "type reference" $ TyRef' <$> parseTyRef' +tokenTyRef :: Stream s m Char => Parser s m (Ty SourceInfo) +tokenTyRef = token' parseTyRef + +parseFieldName :: Stream s m Char => Parser s m (FieldName SourceInfo) +parseFieldName = + withSourceInfo . label' "record field name" $ + -- TODO: technically, we should have the following implementation, but the + -- test suite wants keywords to be allowed to use as field names... + -- > v <- pFieldName + -- > notKeyword $ Data.Text.unpack v + -- > return $ FieldName v + -- TODO: fix the documentation to reflect this + FieldName <$> pFieldName + +tokenFieldName :: Stream s m Char => Parser s m (FieldName SourceInfo) +tokenFieldName = token' parseFieldName + +parseConstructorName :: Stream s m Char => Parser s m (ConstrName SourceInfo) +parseConstructorName = withSourceInfo . label' "sum constructor name" $ ConstrName <$> pConstrName + +tokenConstructorName :: Stream s m Char => Parser s m (ConstrName SourceInfo) +tokenConstructorName = token' parseConstructorName + +parseTyArg :: Stream s m Char => Parser s m (TyArg SourceInfo) +parseTyArg = withSourceInfo . label' "type argument" $ do + VarName vn _ <- parseTyVarName + return $ TyArg vn + +tokenTyArg :: Stream s m Char => Parser s m (TyArg SourceInfo) +tokenTyArg = token' parseTyArg + +parseClassRef :: Stream s m Char => Parser s m (ClassRef SourceInfo) +parseClassRef = withSourceInfo . label' "class reference" $ do + mayAlias <- optionMaybe parseModuleAliasInRef + ClassRef mayAlias <$> parseClassName + +tokenClassRef :: Stream s m Char => Parser s m (ClassRef SourceInfo) +tokenClassRef = token' parseClassRef + +-- * Grammar + {- | Inner type expression. Valid examples: @@ -88,16 +343,16 @@ parseTyTopLevel = label' "top level type expression" $ parseTys >>= tysToTy -- | Sexp :- var | TyRef | (Sexp) parseSexp :: forall s m. 
Stream s m Char => Parser s m (Ty SourceInfo) -parseSexp = label' "s-expression" $ between parseLineSpaces parseLineSpaces (parseSexpList <|> parseSexpAtom) +parseSexp = label' "s-expression" $ parseSexpAtom <|> parseSexpList parseSexpAtom :: forall s m. Stream s m Char => Parser s m (Ty SourceInfo) -parseSexpAtom = try parseTyRef <|> try parseTyVar +parseSexpAtom = tokenTyRef <|> tokenTyVar parseTys :: forall s m. Stream s m Char => Parser s m [Ty SourceInfo] parseTys = many parseSexp parseSexpList :: forall s m. Stream s m Char => Parser s m (Ty SourceInfo) -parseSexpList = between (char '(') (char ')') (parseTys >>= tysToTy) +parseSexpList = between (token (char '(')) (token (char ')')) (parseTys >>= tysToTy) tysToTy :: Stream s m Char => [Ty SourceInfo] -> Parser s m (Ty SourceInfo) tysToTy tys = withSourceInfo $ case tys of @@ -110,11 +365,15 @@ parseSum = withSourceInfo . label' "sum type expression" $ do cs <- sepBy parseSumConstructor - (parseLineSpaces >> char '|' >> parseLineSpaces) + (token (char '|')) return $ Sum cs parseSumConstructor :: Stream s m Char => Parser s m (Constructor SourceInfo) -parseSumConstructor = withSourceInfo . label' "sum type constructor" $ Constructor <$> parseConstructorName <*> (parseLineSpaces >> parseProduct) +parseSumConstructor = + withSourceInfo . label' "sum type constructor" $ + Constructor + <$> tokenConstructorName + <*> parseProduct parseProduct :: Stream s m Char => Parser s m (Product SourceInfo) parseProduct = withSourceInfo . label' "product type expression" $ Product <$> parseTys @@ -123,25 +382,17 @@ parseRecord :: Stream s m Char => Parser s m (Record SourceInfo) parseRecord = withSourceInfo . 
label' "record type expression" $ do fields <- between - (char '{' >> parseLineSpaces) - (parseLineSpaces >> char '}') - $ sepBy parseField (parseLineSpaces >> char ',' >> parseLineSpaces) + (token $ char '{') + (token $ char '}') + $ sepBy parseField (token $ char ',') return $ Record fields parseField :: Stream s m Char => Parser s m (Field SourceInfo) parseField = withSourceInfo . label' "record field" $ do - fn <- parseFieldName - parseLineSpaces1 - _ <- char ':' - parseLineSpaces1 + fn <- tokenFieldName + _ <- token $ char ':' Field fn <$> parseTyTopLevel -parseFieldName :: Stream s m Char => Parser s m (FieldName SourceInfo) -parseFieldName = withSourceInfo . label' "record field name" $ FieldName <$> pFieldName - -parseConstructorName :: Stream s m Char => Parser s m (ConstrName SourceInfo) -parseConstructorName = withSourceInfo . label' "sum constructor name" $ ConstrName <$> pConstrName - parseTyDef :: Stream s m Char => Parser s m (TyDef SourceInfo) parseTyDef = label' "type definition" $ parseSumTyDef <|> parseProdTyDef <|> parseRecordTyDef <|> parseOpaqueTyDef @@ -156,49 +407,30 @@ parseRecordTyDef = parseTyDef' kwTyDefRecord (RecordBody <$> parseRecord) parseTyDef' :: Stream s m Char => String -> Parser s m (TyBody SourceInfo) -> Parser s m (TyDef SourceInfo) parseTyDef' kw parseBody = withSourceInfo . label' (kw <> " type definition") $ do - _ <- string kw - _ <- parseLineSpaces1 - tyN <- parseTyName - _ <- parseLineSpaces1 - args <- sepEndBy parseTyArg parseLineSpaces1 - _ <- char '=' - _ <- parseLineSpaces1 + _ <- token $ keyword kw + tyN <- tokenTyName + args <- many tokenTyArg + _ <- token $ char '=' TyDef tyN args <$> parseBody parseOpaqueTyDef :: Stream s m Char => Parser s m (TyDef SourceInfo) parseOpaqueTyDef = withSourceInfo . 
label' "opaque type definition" $ do - _ <- string kwTyDefOpaque - _ <- parseLineSpaces1 - tyN <- parseTyName - maySpace <- optionMaybe parseLineSpace - args <- case maySpace of - Nothing -> parseLineSpaces >> return [] - Just _ -> do - _ <- parseLineSpaces - sepBy parseTyArg parseLineSpaces1 + _ <- token (keyword kwTyDefOpaque) + tyN <- tokenTyName + args <- many tokenTyArg return $ TyDef tyN args Opaque -parseTyArg :: Stream s m Char => Parser s m (TyArg SourceInfo) -parseTyArg = withSourceInfo . label' "type argument" $ do - VarName vn _ <- parseTyVarName - return $ TyArg vn - -parseClassRef :: Stream s m Char => Parser s m (ClassRef SourceInfo) -parseClassRef = withSourceInfo . label' "class reference" $ do - mayAlias <- optionMaybe parseModuleAliasInRef - ClassRef mayAlias <$> parseClassName - parseConstraint :: Stream s m Char => Parser s m (Constraint SourceInfo) -parseConstraint = withSourceInfo . label' "constraint" $ Constraint <$> parseClassRef <*> parseTys +parseConstraint = withSourceInfo . label' "constraint" $ Constraint <$> tokenClassRef <*> parseTys parseDerive :: Stream s m Char => Parser s m (Derive SourceInfo) -parseDerive = label' "derive statement" $ string kwDerive >> parseLineSpaces >> Derive <$> parseConstraint +parseDerive = label' "derive statement" $ token (keyword kwDerive) >> Derive <$> parseConstraint parseInstanceClause :: Stream s m Char => Parser s m (InstanceClause SourceInfo) parseInstanceClause = withSourceInfo . label' "instance clause" $ do - _ <- string kwInstance - clauseHead <- between parseLineSpaces parseLineSpaces parseConstraint - mayBodyFollows <- optionMaybe (string ":-") + _ <- token (keyword kwInstance) + clauseHead <- parseConstraint + mayBodyFollows <- optionMaybe (token (string ":-")) case mayBodyFollows of Nothing -> return $ InstanceClause clauseHead [] Just _ -> InstanceClause clauseHead <$> parseInstanceBody @@ -208,56 +440,57 @@ parseInstanceBody = parseConstraints -- | Constraints sexp. 
parseConstraintSexp :: Stream s m Char => Parser s m [Constraint SourceInfo] -parseConstraintSexp = between parseLineSpaces parseLineSpaces (parseConstraintList <|> parseConstraintAtom) +parseConstraintSexp = parseConstraintList <|> parseConstraintAtom parseConstraintAtom :: Stream s m Char => Parser s m [Constraint SourceInfo] parseConstraintAtom = pure <$> parseConstraint parseConstraintList :: Stream s m Char => Parser s m [Constraint SourceInfo] -parseConstraintList = between (char '(') (char ')') parseConstraints +parseConstraintList = between (token (char '(')) (token (char ')')) parseConstraints parseConstraints :: Stream s m Char => Parser s m [Constraint SourceInfo] -parseConstraints = concat <$> sepBy parseConstraintSexp (char ',') +parseConstraints = concat <$> sepBy parseConstraintSexp (token (char ',')) parseClassDef :: Stream s m Char => Parser s m (ClassDef SourceInfo) parseClassDef = withSourceInfo . label' "class definition" $ do - _ <- string kwClassDef + _ <- token (keyword kwClassDef) maySups <- optionMaybe + -- TODO: parsing this is problematic for LL(1) parsers, hence the + -- rather large 'try'. + -- We really are abusing the infinite look ahead here... ( try $ do sups <- parseClassSups - _ <- string "<=" + _ <- token (string "<=") return sups ) - _ <- parseLineSpaces1 - clName <- parseClassName - clArgs <- fromJust <$> optionMaybe (parseLineSpaces >> parseClassArgs) + clName <- tokenClassName + clArgs <- parseClassArgs case maySups of Nothing -> return $ ClassDef clName clArgs [] Just sups -> return $ ClassDef clName clArgs sups parseClassArgs :: Stream s m Char => Parser s m [TyArg SourceInfo] -parseClassArgs = label' "class args" $ sepBy parseTyArg (try parseLineSpaces1) +parseClassArgs = label' "class args" $ many tokenTyArg -- | ClassCnstrs sexp. 
parseClassCnstrSexp :: Stream s m Char => Parser s m [ClassConstraint SourceInfo] -parseClassCnstrSexp = between parseLineSpaces parseLineSpaces (parseClassCnstrList <|> parseClassCnstrAtom) +parseClassCnstrSexp = parseClassCnstrList <|> parseClassCnstrAtom parseClassCnstrAtom :: Stream s m Char => Parser s m [ClassConstraint SourceInfo] parseClassCnstrAtom = pure <$> parseClassCnstr parseClassCnstrList :: Stream s m Char => Parser s m [ClassConstraint SourceInfo] -parseClassCnstrList = between (char '(') (char ')') parseClassSups +parseClassCnstrList = between (token $ char '(') (token $ char ')') parseClassSups -- FIXME(bladyjoker): Should accept "Eq a " parseClassSups :: Stream s m Char => Parser s m [ClassConstraint SourceInfo] -parseClassSups = concat <$> sepBy parseClassCnstrSexp (char ',') +parseClassSups = concat <$> sepBy parseClassCnstrSexp (token (char ',')) parseClassCnstr :: Stream s m Char => Parser s m (ClassConstraint SourceInfo) -parseClassCnstr = label' "class constraint" $ do - ref <- parseClassRef - args <- fromJust <$> optionMaybe (parseLineSpaces >> parseClassArgs) - return $ ClassConstraint ref args +parseClassCnstr = + label' "class constraint" $ + ClassConstraint <$> tokenClassRef <*> parseClassArgs parseStatement :: Stream s m Char => Parser s m (Statement SourceInfo) parseStatement = @@ -266,62 +499,45 @@ parseStatement = <|> (StInstanceClause <$> parseInstanceClause) <|> (StDerive <$> parseDerive) -parseStatements :: Stream s m Char => Parser s m [Statement SourceInfo] -parseStatements = sepEndBy parseStatement (many1 parseNewLine) - parseModule :: Stream s m Char => Parser s m (Module SourceInfo) parseModule = withSourceInfo . 
label' "module definition" $ do - _ <- string "module" - _ <- parseLineSpaces1 - modName <- parseModuleName - _ <- parseLineSpaces - _ <- many1 parseNewLine - imports <- sepEndBy parseImport (many1 parseNewLine) - stmnts <- parseStatements - _ <- many space + _ <- token $ keyword kwModule + modName <- tokenModuleName + imports <- many parseImport + stmnts <- many parseStatement return $ Module modName imports stmnts parseImport :: Stream s m Char => Parser s m (Import SourceInfo) -parseImport = withSourceInfo . label' "import statement" $ do - _ <- string "import" - _ <- parseLineSpaces1 - isQual <- isJust <$> optionMaybe (string "qualified" >> parseLineSpaces1) - modName <- parseModuleName - may <- - optionMaybe - ( do - mayModAlias <- optionMaybe (try $ parseLineSpaces1 >> string "as" >> parseLineSpaces1 *> parseModuleAliasInImport) - mayNames <- - optionMaybe - ( try $ do - parseLineSpaces1 >> char '(' >> parseLineSpaces - names <- sepEndBy parseName (char ',' >> parseLineSpaces) - _ <- try parseLineSpaces >> char ')' - return names - ) - _ <- try parseLineSpaces - return (mayModAlias, mayNames) - ) - case may of - Nothing -> return $ Import isQual modName Nothing Nothing - Just (mayModAlias, mayNames) -> return $ Import isQual modName mayNames mayModAlias - -parseNewLine :: Stream s m Char => Parser s m () -parseNewLine = label' "lb new line" $ void endOfLine <|> try parseComment - -parseComment :: Stream s m Char => Parser s m () -parseComment = label' "comment" $ void $ between (string "--") endOfLine (many (char ' ' <|> satisfy Char.isPrint)) - -parseLineSpace :: Stream s m Char => Parser s m () -parseLineSpace = label' "line space" $ void $ try $ do - optional endOfLine - char ' ' <|> char '\t' - -parseLineSpaces1 :: Stream s m Char => Parser s m () -parseLineSpaces1 = void $ try $ many1 parseLineSpace - -parseLineSpaces :: Stream s m Char => Parser s m () -parseLineSpaces = void $ try $ many parseLineSpace +parseImport = label' "import statement" $ do + -- 
Getting the starting position + (srcInfo, _) <- token $ keyword kwImport + + isQual <- isJust <$> optionMaybe (token $ keyword kwQualified) + modName@(ModuleName _ nameSrcInfo) <- tokenModuleName + + mayModAlias <- optionMaybe $ token (keyword kwAs) *> tokenModuleAliasInImport + + mayBracketSrcInfoAndNames <- optionMaybe $ do + _ <- token $ char '(' + names <- sepEndBy tokenName (token $ char ',') + (bracketSrcInfo, _) <- token $ char ')' + return (bracketSrcInfo, names) + + let mayBracketSrcInfo = fmap fst mayBracketSrcInfoAndNames + mayNames = fmap snd mayBracketSrcInfoAndNames + + return $ + Import isQual modName mayNames mayModAlias $ -- Get the rightmost position of the rightmost parsed token + srcInfo + { to = + fromJust $ + fmap to mayBracketSrcInfo + <|> ( case mayModAlias of + Just (ModuleAlias _ modAliasSrcInfo) -> Just $ to modAliasSrcInfo + _ -> Nothing + ) + <|> fmap to (Just nameSrcInfo) + } getSourcePosition :: Stream s m Char => Parser s m SourcePos getSourcePosition = do diff --git a/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Syntax.hs b/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Syntax.hs index fcc35154..daa959a3 100644 --- a/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Syntax.hs +++ b/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Syntax.hs @@ -38,6 +38,11 @@ module LambdaBuffers.Frontend.Syntax ( kwDerive, kwClassDef, kwInstance, + kwImport, + kwQualified, + kwAs, + kwModule, + kws, ) where import Data.Text (Text) @@ -56,6 +61,32 @@ kwInstance :: String kwInstance = "instance" :: String kwClassDef :: String kwClassDef = "class" :: String +kwImport :: String +kwImport = "import" :: String +kwQualified :: String +kwQualified = "qualified" :: String +kwAs :: String +kwAs = "as" :: String +kwModule :: String +kwModule = "module" :: String + +{- | 'kws' is a list of all keywords. 
+ Warning: this invariant must be maintained manually +-} +kws :: [String] +kws = + [ kwTyDefSum + , kwTyDefProduct + , kwTyDefRecord + , kwTyDefOpaque + , kwDerive + , kwInstance + , kwClassDef + , kwImport + , kwModule + , kwQualified + , kwAs + ] tyBodyToTyDefKw :: TyBody info -> String tyBodyToTyDefKw (SumBody _) = kwTyDefSum diff --git a/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend.hs b/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend.hs index 4ab20342..7ac37978 100644 --- a/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend.hs +++ b/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend.hs @@ -59,7 +59,7 @@ frontendErrorTests dataDir = fileIn = workDir "A.lbf" fileErr = fileIn errOrMod <- runFrontend [workDir] [fileIn] - assertError ("[" <> fileErr <> ":(3:1)] \nunexpected 't'\nexpecting lb new line, import statement, type definition, class definition, instance clause, derive statement, space or end of input") errOrMod + assertError ("[" <> fileErr <> ":(3:1)] \nunexpected 't'\nexpecting import statement, type definition, class definition, instance clause, derive statement or end of input") errOrMod , testCase "Multiple modules found" $ do let workDir = dataDir "multiple_modules_found" fileIn = workDir "A.lbf" diff --git a/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend/Parsec.hs b/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend/Parsec.hs index 1a7d8f98..4c1446b5 100644 --- a/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend/Parsec.hs +++ b/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend/Parsec.hs @@ -4,7 +4,7 @@ import Test.Tasty (TestTree, testGroup) import Control.Monad (void) import Data.Set qualified as Set -import LambdaBuffers.Frontend.Parsec (parseClassDef, parseClassSups, parseConstraint, parseDerive, parseInstanceBody, parseInstanceClause, parseProduct, parseRecord, parseSum, parseTyInner, parseTyTopLevel) +import LambdaBuffers.Frontend.Parsec (junk, parseClassDef, parseClassSups, 
parseConstraint, parseDerive, parseInstanceBody, parseInstanceClause, parseProduct, parseRecord, parseSum, parseTyInner, parseTyTopLevel) import LambdaBuffers.Frontend.Syntax (ClassConstraint, Constraint, SourceInfo) import Test.Tasty.HUnit (assertFailure, testCase) import Text.Parsec (Parsec, eof, runParser) @@ -34,7 +34,16 @@ testInnerTypeExpression = "parses" [ parsesEq ["a", " a", "a ", " a ", "(a)", "( a )", "( (a ) )"] parseTyInner , parsesEq ["Int", " Int", "Int ", "(Int)", "( Int)", "(Int )", " (Int)", "(Int) ", "((Int))"] parseTyInner - , parsesEq + , -- TODO: this test case is screwed.. there's problems with the data + -- representation for why this won't pass e.g. @A a a@ is @A@ applied + -- to the list @[a,a]@; so breaking this down to the left associative + -- chain of applications really is broken. + -- , parsesEq + -- [ "(A.B.A a a a)" + -- , "((A.B.A a) a a)" + -- ] + -- parseTyInner + parsesEq [ "(Maybe a)" , " (Maybe a)" , "(Maybe a) " @@ -179,6 +188,9 @@ testTopLevelTypeExpression = , parses "( Maybe ( Maybe ( Maybe (Maybe a ))))" parseTyTopLevel , parses "(Maybe (A a) b (c) (d) )" parseTyTopLevel , parses "Maybe a Int b String" parseTyTopLevel + , parses "Maybe\na" parseTyTopLevel + , parses "Maybe \na" parseTyTopLevel + , parses "Maybe a\n" parseTyTopLevel ] , testGroup "fails" @@ -186,9 +198,6 @@ testTopLevelTypeExpression = , fails "( a ))" parseTyTopLevel , fails "( (a ) ))" parseTyTopLevel , fails "(Int))" parseTyTopLevel - , fails "Maybe\na" parseTyTopLevel - , fails "Maybe \na" parseTyTopLevel - , fails "Maybe a\n" parseTyTopLevel ] ] @@ -205,23 +214,23 @@ testRecordExpression = , parsesEq ["{x : Either a b}", "{ x : Either a b}", "{x : Either a b }", "{ x : Either a b }", "{x : (Either a b)}"] parseRecord , parsesEq ["{x : a, y : Int, z : Maybe a}", "{ x : a,y : Int , z : Maybe a }", "{\n x : a,\n y : Int ,\n z : Maybe a\n }"] parseRecord , parsesEq ["{x : a, y : Prelude.Numeric.Int, z : Prelude.Maybe a}", "{ x : a,y : 
Prelude.Numeric.Int , z : Prelude.Maybe a }", "{\n x : a,\n y : Prelude.Numeric.Int ,\n z : Prelude.Maybe a\n }"] parseRecord + , parses "{x:y}" parseRecord + , parses "{ x:y }" parseRecord + , parses "{ x: y}" parseRecord + , parses "{ x :y}" parseRecord + , parses "{x :y}" parseRecord + , parses "{x: y}" parseRecord + , parses "{\nx : a}" parseRecord + , parses "{x\n: a}" parseRecord + , parses "{x :\na}" parseRecord + , parses "{x : a\n}" parseRecord + , parses " {}" parseRecord ] , testGroup "fails" - [ fails " {}" parseRecord - , fails "{x}" parseRecord + [ fails "{x}" parseRecord , fails "{ x }" parseRecord - , fails "{x:y}" parseRecord - , fails "{ x:y }" parseRecord , fails "{ x: }" parseRecord - , fails "{ x: y}" parseRecord - , fails "{ x :y}" parseRecord - , fails "{x :y}" parseRecord - , fails "{x: y}" parseRecord - , fails "{\nx : a}" parseRecord - , fails "{x\n: a}" parseRecord - , fails "{x :\na}" parseRecord - , fails "{x : a\n}" parseRecord ] ] @@ -243,14 +252,15 @@ testProductExpression = , parses "a Int (Maybe a)" parseProduct , parses " a y Int z (Maybe a) " parseProduct , parses "Maybe\n Int" parseProduct + , parses "\n" parseProduct + , parses "\nMaybe Int" parseProduct + , parses "Maybe \nInt" parseProduct + , parses "Maybe Int\n" parseProduct ] , testGroup "fails" - [ fails "\n" parseProduct - , fails "\nMaybe Int" parseProduct - , fails "Maybe \nInt" parseProduct - , fails "Maybe Int\n" parseProduct - , fails "()" parseProduct + [ fails "()" parseProduct + , fails "( ) -- dog" parseProduct ] ] @@ -271,11 +281,11 @@ testSumExpression = , parses "A a b | B b a | C c d" parseSum , parses "A ((a) b) | B (b a) | C (c) (d)" parseSum , parses "A Int (Maybe Int String) | B (Prelude.Maybe a) | C Prelude.Numeric.Int Prelude.Numeric.String" parseSum + , parses "\n" parseSum ] , testGroup "fails" - [ fails "\n" parseSum - , fails "A |" parseSum + [ fails "A |" parseSum , fails "A ()| B" parseSum , fails "A | B ()" parseSum , fails "A (B | C)" parseSum 
@@ -317,6 +327,7 @@ testInstanceBodyExpression = [ testGroup "parses" [ parsesEq ["", "()"] parseIB -- TODO(bladyjoker): Figure out (). + , parsesEq ["Eq a, Show a", "(Eq a, Show a)"] parseIB , parsesEq ["Eq a", "Eq a", "Eq a ", " Eq a", "\n Eq a", "Eq\n a"] parseIB , parsesEq ["Eq a, Eq b", "Eq a , Eq b", "Eq a\n , Eq b", "Eq a\n , Eq b, ()"] parseIB , parses "Eq Int" parseIB @@ -337,11 +348,11 @@ testInstanceBodyExpression = , "Eq a, (Show b, Json c), MPTC (Maybe a) (Either a Int) c" ] parseIB + , parses "\n" parseIB ] , testGroup "fails" - [ fails "\n" parseIB - , fails "eq a" parseIB + [ fails "eq a" parseIB , fails "a" parseIB , fails "Eq a," parseIB , fails "Eq a, " parseIB @@ -405,6 +416,14 @@ testClassSups = [ parses "" parseCS , parses "Eq a" parseCS , parses " Eq a" parseCS + , parses " Eq a, Show a, Eq b" parseCS + , parsesEq + [ " Eq a, Show a, Eq b" + , " Eq a, (Show a, Eq b)" + , " (Eq a, Show a, Eq b)" + , " (Eq a, Show a), Eq b" + ] + parseCS , -- FIX(bladyjoker): parses "Eq a " parseCS parsesEq [ "Eq a" @@ -417,10 +436,12 @@ testClassSups = , "(Eq a)" ] parseCS + , parses "\n" parseCS ] , testGroup "fails" - [ fails "\n" parseCS + [ fails "Eq Int" parseCS + , fails "Eq Int, show Int" parseCS ] ] where @@ -444,6 +465,12 @@ testClassDef = , "class ( Eq a) <= Ord a" ] parseClassDef + , parsesEq + [ "class (Eq a), Eq b <= Ord a" + , "class (Eq a, Eq b) <= Ord a" + , "class Eq a , Eq b <= Ord a" + ] + parseClassDef , parsesEq [ "class Trivial" , "class Trivial" @@ -456,15 +483,17 @@ testClassDef = , "class ((MPTC1 b a, MPTC2 c b a)) <= MPTC a b c" ] parseClassDef + , parses " class Eq a" parseClassDef + , parses "class Eq a " parseClassDef + , parses "class () <= Eq a" parseClassDef ] , testGroup "fails" [ fails "\n" parseClassDef , fails "" parseClassDef - , fails " class Eq a" parseClassDef - , fails "class Eq a " parseClassDef , fails "class Eq a <=" parseClassDef , fails "class Eq a <= " parseClassDef + , fails "class Eq a, Eq a <= Eq a<= Eq a" 
parseClassDef , fails "class (Eq a)" parseClassDef ] ] @@ -472,7 +501,7 @@ testClassDef = parsesEq :: forall a info. (Functor a, Show (a ()), Ord (a ())) => [String] -> Parsec String () (a info) -> TestTree parsesEq inputs parser = testCase (show inputs <> " should parse the same") $ - let ress = runParser (parser <* eof) () "test" <$> inputs + let ress = runParser (junk *> parser <* eof) () "test" <$> inputs in case foldr ( \res (errs, ps) -> case res of Left err -> (err : errs, ps) @@ -484,11 +513,11 @@ parsesEq inputs parser = (errs, ps) -> assertFailure $ show ("Wanted all to parse the same" :: String, errs, ps) parses :: String -> Parsec String () a -> TestTree -parses input parser = testCase (show input) $ case runParser (parser <* eof) () "test" input of +parses input parser = testCase (show input) $ case runParser (junk *> parser <* eof) () "test" input of Left err -> assertFailure (show err) Right _ -> return () fails :: Show a => String -> Parsec String () a -> TestTree -fails input parser = testCase (show input) $ case runParser (parser <* eof) () "test" input of +fails input parser = testCase (show input) $ case runParser (junk *> parser <* eof) () "test" input of Left _ -> return () Right res -> assertFailure (show res) From e8c328c656f88dad0985424733543b436bf42850 Mon Sep 17 00:00:00 2001 From: jared <> Date: Tue, 17 Oct 2023 03:05:25 -0600 Subject: [PATCH 02/10] Added front end parser test cases --- .../data/good_instance/GoodInstance.lbf | 11 +++++++++++ .../ModuleDocumentation.lbf | 13 +++++++++++++ .../test/Test/LambdaBuffers/Frontend.hs | 10 ++++++++++ .../Test/LambdaBuffers/Frontend/Parsec.hs | 19 +++++++++++++++++++ 4 files changed, 53 insertions(+) create mode 100644 lambda-buffers-frontend/data/good_instance/GoodInstance.lbf create mode 100644 lambda-buffers-frontend/data/good_module_documentation/ModuleDocumentation.lbf diff --git a/lambda-buffers-frontend/data/good_instance/GoodInstance.lbf 
b/lambda-buffers-frontend/data/good_instance/GoodInstance.lbf new file mode 100644 index 00000000..e95d4812 --- /dev/null +++ b/lambda-buffers-frontend/data/good_instance/GoodInstance.lbf @@ -0,0 +1,11 @@ +module GoodInstance + +instance MyClass A + +class MyClass a + +sum A = A + +-- if we're wondering why this test case is here, previous parser versions +-- confused 'instance' with 'import' and reported an unexpected 'n' in the +-- 'instance' keyword. diff --git a/lambda-buffers-frontend/data/good_module_documentation/ModuleDocumentation.lbf b/lambda-buffers-frontend/data/good_module_documentation/ModuleDocumentation.lbf new file mode 100644 index 00000000..a369b17d --- /dev/null +++ b/lambda-buffers-frontend/data/good_module_documentation/ModuleDocumentation.lbf @@ -0,0 +1,13 @@ + +-- Some documentation here + +module ModuleDocumentation + +-- More documentation +sum A = A + + +-- Woo hoo, documentation is great +-- (who reads it anyways) + +-- dog pomeranian yorkie maltese diff --git a/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend.hs b/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend.hs index 7ac37978..92ca207c 100644 --- a/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend.hs +++ b/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend.hs @@ -115,6 +115,16 @@ frontendSuccessTests dataDir = fileIn = workDir "BadFormat.lbf" errOrMod' <- runFrontend [workDir] [fileIn] assertSuccess ["A", "BadFormat"] errOrMod' + , testCase "good_module_documentation/ModuleDocumentation.lbf also compiles" $ do + let workDir = dataDir "good_module_documentation" + fileIn = workDir "ModuleDocumentation.lbf" + errOrMod' <- runFrontend [workDir] [fileIn] + assertSuccess ["ModuleDocumentation"] errOrMod' + , testCase "good_instance/GoodInstance.lbf also compiles" $ do + let workDir = dataDir "good_instance" + fileIn = workDir "GoodInstance.lbf" + errOrMod' <- runFrontend [workDir] [fileIn] + assertSuccess ["GoodInstance"] errOrMod' ] ] diff --git 
a/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend/Parsec.hs b/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend/Parsec.hs index 4c1446b5..5cc83644 100644 --- a/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend/Parsec.hs +++ b/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend/Parsec.hs @@ -471,6 +471,17 @@ testClassDef = , "class Eq a , Eq b <= Ord a" ] parseClassDef + , parsesEq + [ "class (Eq a), Eq b, Eq c <= Ord a" + , "class (Eq a, Eq b), Eq c <= Ord a" + , "class Eq a, (Eq b, Eq c) <= Ord a" + , "class ((Eq a), Eq b), Eq c <= Ord a" + , "class Eq a, (Eq b, (Eq c)) <= Ord a" + , "class (Eq a, (Eq b)), Eq c <= Ord a" + , "class Eq a, ((Eq b), Eq c) <= Ord a" + , "class (Eq a, ((Eq b), Eq c)) <= Ord a" + ] + parseClassDef , parsesEq [ "class Trivial" , "class Trivial" @@ -498,6 +509,14 @@ testClassDef = ] ] +-- * Parsing testing functions + +-- Note: when testing parses, since all parsers assume the invariant that they +-- _must_ start at a non whitespace character, we always run 'junk' +-- before running the parser. When the parser finishes, we of course run 'eof' +-- to ensure it consumes the entire input. +-- See [Note: Parser Implementation] in "LambdaBuffers.Frontend.Parsec" for details + parsesEq :: forall a info.
(Functor a, Show (a ()), Ord (a ())) => [String] -> Parsec String () (a info) -> TestTree parsesEq inputs parser = testCase (show inputs <> " should parse the same") $ From 2013699b7016de003b4116d85c6e220ebf54a4a3 Mon Sep 17 00:00:00 2001 From: jared <> Date: Tue, 17 Oct 2023 11:23:32 -0600 Subject: [PATCH 03/10] Improved documentation for the parser --- .../src/LambdaBuffers/Frontend/Parsec.hs | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs b/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs index 6d02d139..6b865c7b 100644 --- a/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs +++ b/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs @@ -53,9 +53,10 @@ type Parser s m a = ParsecT s () m a -- - nonterm -> alt1 | ... | altn -- -- --- Tokens form the vocabulary of Lambda Buffer Files. There are classes of --- tokens (keyword, modulename, longmodulename, tyname, longtyname, varname, --- punctuation, fieldname, classname, longclassname) as follows. +-- Tokens form the vocabulary of Lambda Buffer Files. The classes of *tokens* +-- (keyword, modulename, longmodulename, tyname, longtyname, varname, +-- punctuation, fieldname, classname, longclassname) are as follows. +-- Note that some of the tokens overlap but may be distinguished via parsing. -- -- keyword -> 'module' | 'sum' | 'prod' | 'record' | 'opaque' | 'class' | 'instance' | 'import' | 'qualified' | 'as' -- modulename -> upperCamelCase @@ -80,8 +81,14 @@ type Parser s m a = ParsecT s () m a -- line comments. At each point, the longest possible token satisfying the -- token definitions is read. -- +-- A *line comment* is any sequence of characters which begins with '--' +-- followed by zero or more printable Unicode character to the first end of +-- line ('\n' or '\r\n'). +-- -- Finally, the grammar for Lambda Buffer Files is as follows. 
-- +-- start -> module +-- -- module -> 'module' modulename imports statements -- -- import -> 'import' [ 'qualified' ] longmodulename @@ -89,7 +96,7 @@ type Parser s m a = ParsecT s () m a -- [ '(' [ { tyname ',' } tyname [','] ] ')' ] -- imports -> { import } -- --- statements -> [ { statement newlines1 } statement [ newlines1 ] ] +-- statements -> { statement } -- statement -> tydef -- | classdef -- | instanceclause @@ -114,7 +121,7 @@ type Parser s m a = ParsecT s () m a -- opaquetydef -> 'opaque' tyname { varname } -- -- classdef -> 'class' [ classexps '<=' ] classname { varname } --- // Warning: this part makes it not LL(1)! +-- // Warning: this is not LL(1)! -- // In the future, we should shift to some form of -- // an LALR(1) parser. -- classexp -> classref { varname } @@ -174,8 +181,8 @@ type Parser s m a = ParsecT s () m a -- * Primitives {- | @'token' pa@ runs the parser @pa@ with 'try' followed by 'junk' to remove - whitespace. Moreover, this gets the SourceInfo of the parsed token w/o the - whitespace + whitespace. Moreover, this gets the 'SourceInfo' of the parsed token without + the whitespace See [Note: Parser Implementation]. -} @@ -216,6 +223,12 @@ runParser p = runParserT (junk *> p <* eof) () -- * Lexical elements +-- +-- - Functions which have @parse@ as a prefix simply parse the token +-- +-- - Functions which have @token@ as a prefix wrap the corresponding @parse@ +-- function with the 'token' function. + parseModuleNamePart :: Stream s m Char => Parser s m (ModuleNamePart SourceInfo) parseModuleNamePart = withSourceInfo . label' "module part name" $ ModuleNamePart <$> pModuleNamePart @@ -391,6 +404,11 @@ parseField :: Stream s m Char => Parser s m (Field SourceInfo) parseField = withSourceInfo . 
label' "record field" $ do fn <- tokenFieldName _ <- token $ char ':' + -- TODO: strictly speaking, there's a bug with this when parsing + -- > record A a = { fieldName :-- a } + -- since this will parse the @:--@ as @:@ and @--@ will start a comment. + -- Technically, the specification says that this should parse as the token + -- @:-@, and then the remaining @-@ should parse error. Field fn <$> parseTyTopLevel parseTyDef :: Stream s m Char => Parser s m (TyDef SourceInfo) @@ -527,9 +545,11 @@ parseImport = label' "import statement" $ do mayNames = fmap snd mayBracketSrcInfoAndNames return $ - Import isQual modName mayNames mayModAlias $ -- Get the rightmost position of the rightmost parsed token + Import isQual modName mayNames mayModAlias $ srcInfo { to = + -- Get the rightmost position of the rightmost parsed token + -- Note: the 'fromJust' clearly never fails. fromJust $ fmap to mayBracketSrcInfo <|> ( case mayModAlias of From e3a40b6bedb5b233ff6946444f7a619dee6ef72c Mon Sep 17 00:00:00 2001 From: jared <> Date: Wed, 18 Oct 2023 21:16:07 -0600 Subject: [PATCH 04/10] Parser improvements - Changed parsing test case to no longer use keywords as a field name (as per the specification) - Improved `LambdaBuffers/Frontend/Parsec.hs` documentation - Updated TODO in `Test/LambdaBuffers/Frontend/Parsec.hs` - Fixed `LambdaBuffers/Frontend/Parsec.hs` incorrectly parsing `:--` --- .../data/goldens/good/LambdaBuffers.lbf | 6 +- .../src/LambdaBuffers/Frontend/Parsec.hs | 134 ++---------------- .../Test/LambdaBuffers/Frontend/Parsec.hs | 2 +- 3 files changed, 19 insertions(+), 123 deletions(-) diff --git a/lambda-buffers-frontend/data/goldens/good/LambdaBuffers.lbf b/lambda-buffers-frontend/data/goldens/good/LambdaBuffers.lbf index b98ea211..451da56e 100644 --- a/lambda-buffers-frontend/data/goldens/good/LambdaBuffers.lbf +++ b/lambda-buffers-frontend/data/goldens/good/LambdaBuffers.lbf @@ -52,7 +52,7 @@ record ClassDef = { name : ClassName derive Eq ClassDef -record 
ClassConstraint = { class : ClassRef, args : List TyArg } +record ClassConstraint = { classRef : ClassRef, args : List TyArg } derive Eq ClassConstraint @@ -64,7 +64,7 @@ prod Derive = Constraint derive Eq Derive -record Constraint = { class : ClassRef, args : List Ty } +record Constraint = { classRef : ClassRef, args : List Ty } derive Eq Constraint @@ -111,4 +111,4 @@ derive Eq ModuleNamePart prod ClassName = Text -derive Eq ClassName \ No newline at end of file +derive Eq ClassName diff --git a/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs b/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs index 6b865c7b..b8aef8d7 100644 --- a/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs +++ b/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs @@ -22,6 +22,7 @@ import Data.Char qualified as Char import Data.Kind (Type) import Data.Maybe (fromJust, isJust) import Data.String (IsString (fromString)) +import Data.Text qualified as Text import LambdaBuffers.Compiler.NamingCheck (pClassName, pConstrName, pFieldName, pModuleNamePart, pTyName) import LambdaBuffers.Frontend.Syntax (ClassConstraint (ClassConstraint), ClassDef (ClassDef), ClassName (ClassName), ClassRef (ClassRef), ConstrName (ConstrName), Constraint (Constraint), Constructor (Constructor), Derive (Derive), Field (Field), FieldName (FieldName), Import (Import), InstanceClause (InstanceClause), Module (Module), ModuleAlias (ModuleAlias), ModuleName (ModuleName), ModuleNamePart (ModuleNamePart), Name (Name), Product (Product), Record (Record), SourceInfo (SourceInfo, to), SourcePos (SourcePos), Statement (StClassDef, StDerive, StInstanceClause, StTyDef), Sum (Sum), Ty (TyApp, TyRef', TyVar), TyArg (TyArg), TyBody (Opaque, ProductBody, RecordBody, SumBody), TyDef (TyDef), TyName (TyName), TyRef (TyRef), VarName (VarName), kwAs, kwClassDef, kwDerive, kwImport, kwInstance, kwModule, kwQualified, kwTyDefOpaque, kwTyDefProduct, kwTyDefRecord, kwTyDefSum, kws) import 
Text.Parsec (ParseError, ParsecT, SourceName, Stream, alphaNum, between, char, endOfLine, eof, getPosition, label, lower, many, many1, manyTill, notFollowedBy, optionMaybe, runParserT, satisfy, sepBy, sepEndBy, sourceColumn, sourceLine, sourceName, space, string, try, unexpected, ()) @@ -31,110 +32,7 @@ type Parser s m a = ParsecT s () m a -- Note: Syntactic Form of Lambda Buffer Files. -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --- The notational conventions used to present the syntax is based off of [1]. --- So, these notational conventions are used for presenting syntax. --- --- - [ pattern ] optional --- --- - { pattern } zero or more repetitions --- --- - ( pattern ) grouping --- --- - pat1 | pat2 choice --- --- - pat1\pat2 difference -- elements generated by pat1 except those --- generated by pat2. --- --- - 'terminal' terminal syntax --- --- - // comment comment --- --- Productions will be of the form --- - nonterm -> alt1 | ... | altn --- --- --- Tokens form the vocabulary of Lambda Buffer Files. The classes of *tokens* --- (keyword, modulename, longmodulename, tyname, longtyname, varname, --- punctuation, fieldname, classname, longclassname) are as follows. --- Note that some of the tokens overlap but may be distinguished via parsing. --- --- keyword -> 'module' | 'sum' | 'prod' | 'record' | 'opaque' | 'class' | 'instance' | 'import' | 'qualified' | 'as' --- modulename -> upperCamelCase --- longmodulename -> long modulename --- tyname -> upperCamelCase --- fieldname -> lowerCamelCase\keyword --- longtyname -> long tyname --- varname -> lowers\keyword --- punctuation -> '<=' | ',' | '.' | '(' | ')' | '{' | '}' | ':' | ':-' | '=' | '|' --- classname -> upperCamelCase --- longclassname -> long upperCamelCase --- --- upperCamelCase -> upper { alphaNum } --- lowerCamelCase -> lower { alphaNum } --- long -> { upperCamelCase '.' 
} --- upper -> // upper case or title case alphabetic unicode characters (letters) --- lower -> // lower case alphabetic unicode characters (letters) --- lowers -> lower { lower } --- alphaNum -> // alphabetic or numeric unicode characters --- --- Input files are broken into *tokens* which are delimited by whitespace or --- line comments. At each point, the longest possible token satisfying the --- token definitions is read. --- --- A *line comment* is any sequence of characters which begins with '--' --- followed by zero or more printable Unicode character to the first end of --- line ('\n' or '\r\n'). --- --- Finally, the grammar for Lambda Buffer Files is as follows. --- --- start -> module --- --- module -> 'module' modulename imports statements --- --- import -> 'import' [ 'qualified' ] longmodulename --- [ 'as' longmodulename ] --- [ '(' [ { tyname ',' } tyname [','] ] ')' ] --- imports -> { import } --- --- statements -> { statement } --- statement -> tydef --- | classdef --- | instanceclause --- | derivedef --- --- tydef -> sumtydef | prodtydef | recordtydef | opaquetydef --- --- sumtydef -> 'sum' tyname { varname } '=' sum --- sum -> sumconstructor { '|' sumconstructor } --- sumconstructor -> tyname prod --- --- prodtydef -> 'prod' tyname { varname } '=' prod --- prod -> { tyexpr } --- tyexpr -> varname --- | longtyname --- | '(' prod ')' --- --- recordtydef -> 'record' tyname { varname } '=' record --- record -> '{' [ field { ',' field } ] '}' --- field -> fieldname ':' prod --- --- opaquetydef -> 'opaque' tyname { varname } --- --- classdef -> 'class' [ classexps '<=' ] classname { varname } --- // Warning: this is not LL(1)! --- // In the future, we should shift to some form of --- // an LALR(1) parser. 
--- classexp -> classref { varname } --- | '(' classexps ')' --- classexps -> [ classexp { ',' classexp } ] --- --- instanceclause -> 'instance' constraint [ ':-' classexps ] --- constraint -> classref { tyexpr } --- --- derivedef -> 'derive' constraint --- --- References. --- [1] Haskell 2010 Language Report by Simon Marlow +-- See docs/syntax.md -- -- Note: Parser Implementation. -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -213,7 +111,7 @@ junk = void (many (spaces1 <|> comment)) manyTill (satisfy Char.isPrint) (try endOfLine) {- | 'keyword' parses the provided keyword ensuring that the keyword does *not* - overlap with varname tokens. + overlap with varname tokens and fieldname tokens. -} keyword :: Stream s m Char => String -> Parser s m () keyword k = void $ string k *> notFollowedBy alphaNum @@ -302,14 +200,11 @@ tokenTyRef = token' parseTyRef parseFieldName :: Stream s m Char => Parser s m (FieldName SourceInfo) parseFieldName = - withSourceInfo . label' "record field name" $ - -- TODO: technically, we should have the following implementation, but the - -- test suite wants keywords to be allowed to use as field names... - -- > v <- pFieldName - -- > notKeyword $ Data.Text.unpack v - -- > return $ FieldName v - -- TODO: fix the documentation to reflect this - FieldName <$> pFieldName + withSourceInfo . label' "record field name" $ do + v <- pFieldName + -- Recall in the lexical specification that fieldnames are disjoint from keywords + notKeyword $ Text.unpack v + return $ FieldName v tokenFieldName :: Stream s m Char => Parser s m (FieldName SourceInfo) tokenFieldName = token' parseFieldName @@ -403,12 +298,13 @@ parseRecord = withSourceInfo . label' "record type expression" $ do parseField :: Stream s m Char => Parser s m (Field SourceInfo) parseField = withSourceInfo . 
label' "record field" $ do fn <- tokenFieldName - _ <- token $ char ':' - -- TODO: strictly speaking, there's a bug with this when parsing + _ <- token $ char ':' *> notFollowedBy (char '-') + -- Why is the @'notFollowedBy'@ here? + -- Consider: -- > record A a = { fieldName :-- a } - -- since this will parse the @:--@ as @:@ and @--@ will start a comment. - -- Technically, the specification says that this should parse as the token - -- @:-@, and then the remaining @-@ should parse error. + -- We want to parse the @:--@ as @:-@ and @-@ (the specification says this), + -- but without the @'notFollowedBy'@, this would parse as @:@ and @--@ will + -- start a comment. Field fn <$> parseTyTopLevel parseTyDef :: Stream s m Char => Parser s m (TyDef SourceInfo) @@ -474,7 +370,7 @@ parseClassDef = withSourceInfo . label' "class definition" $ do _ <- token (keyword kwClassDef) maySups <- optionMaybe - -- TODO: parsing this is problematic for LL(1) parsers, hence the + -- Remark: parsing this is problematic for LL(1) parsers, hence the -- rather large 'try'. -- We really are abusing the infinite look ahead here... ( try $ do diff --git a/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend/Parsec.hs b/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend/Parsec.hs index 5cc83644..63f48b58 100644 --- a/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend/Parsec.hs +++ b/lambda-buffers-frontend/test/Test/LambdaBuffers/Frontend/Parsec.hs @@ -34,7 +34,7 @@ testInnerTypeExpression = "parses" [ parsesEq ["a", " a", "a ", " a ", "(a)", "( a )", "( (a ) )"] parseTyInner , parsesEq ["Int", " Int", "Int ", "(Int)", "( Int)", "(Int )", " (Int)", "(Int) ", "((Int))"] parseTyInner - , -- TODO: this test case is screwed.. there's problems with the data + , -- TODO(jaredponn): this test case is screwed.. there's problems with the data -- representation for why this won't pass e.g. 
@A a a@ is @A@ applied -- to the list @[a,a]@; so breaking this down to the left associative -- chain of applications really is broken. From 24c602fa6faec7a86d109ec53b16e170e2d7ea36 Mon Sep 17 00:00:00 2001 From: jared <> Date: Wed, 18 Oct 2023 22:53:10 -0600 Subject: [PATCH 05/10] Added chapter on syntactic forms of LambdaBuffers files. --- _typos.toml | 5 +- docs/SUMMARY.md | 1 + docs/syntax.md | 255 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 259 insertions(+), 2 deletions(-) create mode 100644 docs/syntax.md diff --git a/_typos.toml b/_typos.toml index e613a6f3..d1730be7 100644 --- a/_typos.toml +++ b/_typos.toml @@ -1,6 +1,7 @@ [default.extend-words] substituters = "substituters" -hask= "hask" +hask = "hask" +Nd = "Nd" [type.pdf] extend-glob = ["*.pdf"] @@ -8,4 +9,4 @@ check-file = false [type.png] extend-glob = ["*.png"] -check-file = false \ No newline at end of file +check-file = false diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 58a4b6ab..2b85f9a7 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -6,6 +6,7 @@ - [LambdaBuffers to Purescript](purescript.md) - [Design](design.md) - [API](api.md) +- [LambdaBuffers file](syntax.md) - [Compiler](compiler.md) - [Codegen](codegen.md) - [Command line interface](command-line-interface.md) diff --git a/docs/syntax.md b/docs/syntax.md new file mode 100644 index 00000000..cb85c520 --- /dev/null +++ b/docs/syntax.md @@ -0,0 +1,255 @@ +# LambdaBuffers file + +The input to LambdaBuffers is a text file which contains a module that defines + a specification of the types you want to generate. +This section gives the exact syntax of a LambdaBuffers file, and informally describes meaning of the syntactic constructs. + +The name of a LambdaBuffers file must end with `.lbf`. + +## Notation +In the following description of a LambdaBuffers file's syntax, we use + a similar BNF syntax from [Section 10.1 of the Haskell Report](https://www.haskell.org/onlinereport/haskell2010/). 
+So, the following notational conventions are used for presenting syntax. + +| Syntax | Description | +| ------------- | --------------------------------------------------------------------------- | +| `[pattern]` | optional | +| `{pattern}` | zero or more repetitions | +| `(pattern)` | grouping | +| `pat1⎮pat2` | choice | +| `pat1\pat2` | difference -- elements generated by `pat1` except those generated by `pat2` | +| `'terminal'` | terminal syntax surrounded by single quotes | + + + +Note that the terminal syntax permits C-style escape sequences e.g. + `'\n'` denotes line feed (newline), and `'\r'` denotes carriage return. + +Productions will be of the form + +```text +nonterm -> alt1 | ... | altn +``` + +## Input file representation +The input file is Unicode text where the encoding is subject to the system locale. +We will often use the unqualified term *character* to refer to a Unicode code point in the input file. + +## Characters +The following terms are used to denote specific Unicode character categories: + +- `upper` denotes a Unicode code point categorized as an uppercase letter or titlecase letter (i.e., with General Category value Lt or Lu). + +- `lower` denotes a Unicode code point categorized as a lower-case letter (i.e., with General Category value Ll). + +- `alphanum` denotes either `upper` or `lower`; or a Unicode code point categorized as a modifier letter, other letter, decimal digit number, letter number, or other number (i.e., with General Category value Lt, Lu, Ll, Lm, Lo, Nd, Nl or No). + +- `space` denotes a Unicode code point categorized as a separator space (i.e., with General Category value Zs), or any of the control characters `'\t'`, `'\n'`, `'\r'`, `'\f'`, or `'\v'`. + +Interested readers may find details of Unicode character categories in [Section 4.5 of The Unicode Standard 15.1.0](https://www.unicode.org/versions/Unicode15.1.0/), and the [Unicode Character Database](https://unicode.org/ucd/). 
+ +## Lexical syntax + +Tokens form the vocabulary of LambdaBuffers files. +The classes of tokens are defined as follows. + +```text +keyword -> 'module' | 'sum' | 'prod' | 'record' + | 'opaque' | 'class' | 'instance' | 'import' + | 'qualified' | 'as' +modulename -> uppercamelcase +longmodulename -> long modulename +tyname -> uppercamelcase +fieldname -> lowercamelcase\keyword +longtyname -> long tyname +varname -> lowers\keyword +punctuation -> '<=' | ',' | '(' | ')' | '{' | '}' + | ':' | ':-' | '=' | '|' +classname -> uppercamelcase +longclassname -> long uppercamelcase +``` + +where + +```text +uppercamelcase -> upper { alphanum } +lowercamelcase -> lower { alphanum } +long -> { uppercamelcase '.' } +lowers -> lower { lower } +``` + +Input files are broken into *tokens* which use the *maximal munch* rule i.e., + at each point, the next token is the longest sequence of characters that + form a valid token. +`space`s or line comments are ignored except as it separates tokens that + would otherwise combine into a single token. + +### Line comments +A *line comment* starts with the terminal `'--'` followed by zero or more printable Unicode characters stopping at the first end of line (`'\n'` or `'\r\n'`). + +## Syntax of LambdaBuffers files +A LambdaBuffers file defines a module that is a collection of data types, classes, instance clauses, and derive clauses. + +The overall layout of a LambdaBuffers file is: + +```text +module -> 'module' longmodulename { import } { statement } +``` + +The file must specify the module's `longmodulename` where its `modulename` must match the file's name not including the `.lbf` extension. +After, the file may contain a sequence of `import`s followed by a sequence of `statement`s. + +### Import +Imports bring *entities* (types and classes) of other modules into scope. 
+ +```text +import -> 'import' [ 'qualified' ] longmodulename [ 'as' longmodulename ] [ importspec ] +importspec -> '(' [ { tyname ',' } tyname [','] ] ')' +``` + +If `importspec` is omitted, then all entities specified in the module are imported; otherwise only the specified entities are imported. + +### Statement + +Statements define types, classes, instance clauses, and derive clauses. + +```text +statement -> typedef + | classdef + | instanceclause + | deriveclause +``` + +#### Type definitions +Types may be either sum types, product types, record types, or opaque types. + +```text +typedef -> prodtypedef | sumtypedef | recordtypedef | opaquetypedef +``` + +##### Product type definition +A product type definition defines a new product type. + +```text +prodtypedef -> 'prod' tyname { varname } '=' prod +prod -> { tyexpr } +tyexpr -> varname + | longtyname + | '(' prod ')' +``` + +Product type definitions instruct the code generator to generate a product type for the target language. + +##### Sum type definition +A sum type definition defines a new sum type. + +```text +sumtypedef -> 'sum' tyname { varname } '=' sum +sum -> sumconstructor { '|' sumconstructor } +sumconstructor -> tyname prod +``` + +Sum type definitions instruct the code generator to generate a sum type for the target language. + +##### Record type definition +A record type definition defines a new record type. + +```text +recordtypedef -> 'record' tyname { varname } '=' record +record -> '{' [ field { ',' field } ] '}' +field -> fieldname ':' prod +```` + +Record type definitions instruct the code generator to generate a record type for the target language. + +##### Opaque type +An opaque type definition defines a new opaque type. + +```text +opaquetypedef -> 'opaque' tyname { varname } +``` + +Opaque type definitions do not instruct the code generator to generate code, and an opaque type must be instead implemented in the target language. 
+ +#### Class definition +A class definition introduces a new class. + +```text +classdef -> 'class' [ constraintexps '<=' ] classname { varname } +constraintexp -> classref { varname } + | '(' constraintexps ')' +constraintexps -> [ constraintexp { ',' constraintexp } ] +``` + +Class definitions do not instruct the code generator to generate code, but + instead provide a means to communicate with the code generator the + instances one would like to generate (via a derive clause). + +#### Instance clause +An instance clause specifies a type is an instance of a class. + +```text +instanceclause -> 'instance' constraint [ ':-' constraintexps ] +constraint -> classref { tyexpr } +``` + +Instance clauses do not instruct the code generator to generate code, but + instead instruct the compiler (semantic checking) that the target language + provides instances for the given type provided that the given `constraintexps` + have instances. + +#### Derive clause +Derive clauses instruct the code generator to generate code for a type so that it is an instance of a class. + +```text +deriveclause -> 'derive' constraint +``` + +Note the code generation of a type for a class is implemented via builtin derivation rules (which developers may extend). + +### Syntax reference +The summarized productions of a LambdaBuffers file are as follows.
+ +```text +module -> 'module' longmodulename { import } { statement } + +import -> 'import' [ 'qualified' ] longmodulename [ 'as' longmodulename ] [ importspec ] +importspec -> '(' [ { tyname ',' } tyname [','] ] ')' + +statement -> typedef + | classdef + | instanceclause + | deriveclause + +typedef -> prodtypedef | sumtypedef | recordtypedef | opaquetypedef + +prodtypedef -> 'prod' tyname { varname } '=' prod +prod -> { tyexpr } +tyexpr -> varname + | longtyname + | '(' prod ')' + +sumtypedef -> 'sum' tyname { varname } '=' sum +sum -> sumconstructor { '|' sumconstructor } +sumconstructor -> tyname prod + +recordtypedef -> 'record' tyname { varname } '=' record +record -> '{' [ field { ',' field } ] '}' +field -> fieldname ':' prod + +opaquetypedef -> 'opaque' tyname { varname } + +classdef -> 'class' [ constraintexps '<=' ] classname { varname } +constraintexp -> classref { varname } + | '(' constraintexps ')' +constraintexps -> [ constraintexp { ',' constraintexp } ] + +instanceclause -> 'instance' constraint [ ':-' constraintexps ] +constraint -> classref { tyexpr } + +deriveclause -> 'derive' constraint +``` From ed6abb26539a0e9ef3d556ced74b9ea661884fa8 Mon Sep 17 00:00:00 2001 From: jared <> Date: Wed, 18 Oct 2023 22:56:09 -0600 Subject: [PATCH 06/10] Lined up arrows in grammar documentation --- docs/syntax.md | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/syntax.md b/docs/syntax.md index cb85c520..7c2d26d0 100644 --- a/docs/syntax.md +++ b/docs/syntax.md @@ -135,11 +135,11 @@ typedef -> prodtypedef | sumtypedef | recordtypedef | opaquetypedef A product type definition defines a new product type. 
```text -prodtypedef -> 'prod' tyname { varname } '=' prod -prod -> { tyexpr } -tyexpr -> varname - | longtyname - | '(' prod ')' +prodtypedef -> 'prod' tyname { varname } '=' prod +prod -> { tyexpr } +tyexpr -> varname + | longtyname + | '(' prod ')' ``` Product type definitions instruct the code generator to generate a product type for the target language. @@ -148,9 +148,9 @@ Product type definitions instruct the code generator to generate a product type A sum type definition defines a new sum type. ```text -sumtypedef -> 'sum' tyname { varname } '=' sum -sum -> sumconstructor { '|' sumconstructor } -sumconstructor -> tyname prod +sumtypedef -> 'sum' tyname { varname } '=' sum +sum -> sumconstructor { '|' sumconstructor } +sumconstructor -> tyname prod ``` Sum type definitions instruct the code generator to generate a sum type for the target language. @@ -159,9 +159,9 @@ Sum type definitions instruct the code generator to generate a sum type for the A record type definition defines a new record type. ```text -recordtypedef -> 'record' tyname { varname } '=' record -record -> '{' [ field { ',' field } ] '}' -field -> fieldname ':' prod +recordtypedef -> 'record' tyname { varname } '=' record +record -> '{' [ field { ',' field } ] '}' +field -> fieldname ':' prod ```` Record type definitions instruct the code generator to generate a record type for the target language. 
@@ -227,19 +227,19 @@ statement -> typedef typedef -> prodtypedef | sumtypedef | recordtypedef | opaquetypedef -prodtypedef -> 'prod' tyname { varname } '=' prod -prod -> { tyexpr } -tyexpr -> varname - | longtyname - | '(' prod ')' +prodtypedef -> 'prod' tyname { varname } '=' prod +prod -> { tyexpr } +tyexpr -> varname + | longtyname + | '(' prod ')' -sumtypedef -> 'sum' tyname { varname } '=' sum -sum -> sumconstructor { '|' sumconstructor } -sumconstructor -> tyname prod +sumtypedef -> 'sum' tyname { varname } '=' sum +sum -> sumconstructor { '|' sumconstructor } +sumconstructor -> tyname prod -recordtypedef -> 'record' tyname { varname } '=' record -record -> '{' [ field { ',' field } ] '}' -field -> fieldname ':' prod +recordtypedef -> 'record' tyname { varname } '=' record +record -> '{' [ field { ',' field } ] '}' +field -> fieldname ':' prod opaquetypedef -> 'opaque' tyname { varname } From 51667a9e83d7548946e7925c93ee0b5974236d75 Mon Sep 17 00:00:00 2001 From: jared <> Date: Wed, 18 Oct 2023 22:59:32 -0600 Subject: [PATCH 07/10] Readded warning about the non LL(1) part of the grammar --- .../src/LambdaBuffers/Frontend/Parsec.hs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs b/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs index b8aef8d7..b0eeb34a 100644 --- a/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs +++ b/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs @@ -34,6 +34,12 @@ type Parser s m a = ParsecT s () m a -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- See docs/syntax.md -- +-- Warning: In the production +-- classdef -> 'class' [ constraintexps '<=' ] classname { varname } +-- this is not LL(1)! Either we live with what we currently have which has a +-- large 'try' around parsing @[ constraintexps '<=' ]@, or we move to an +-- LALR(1) parser generator which should have no issues with parsing this. +-- -- Note: Parser Implementation.
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- From e1756cf709ab05c2034a29e3ad44a6e6545666f2 Mon Sep 17 00:00:00 2001 From: jared <> Date: Wed, 18 Oct 2023 23:16:38 -0600 Subject: [PATCH 08/10] Undo `fieldnames` should be disjoint from `keywords` --- .../data/goldens/good/LambdaBuffers.lbf | 6 +++--- .../src/LambdaBuffers/Frontend/Parsec.hs | 17 ++++++++++++----- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/lambda-buffers-frontend/data/goldens/good/LambdaBuffers.lbf b/lambda-buffers-frontend/data/goldens/good/LambdaBuffers.lbf index 451da56e..b98ea211 100644 --- a/lambda-buffers-frontend/data/goldens/good/LambdaBuffers.lbf +++ b/lambda-buffers-frontend/data/goldens/good/LambdaBuffers.lbf @@ -52,7 +52,7 @@ record ClassDef = { name : ClassName derive Eq ClassDef -record ClassConstraint = { classRef : ClassRef, args : List TyArg } +record ClassConstraint = { class : ClassRef, args : List TyArg } derive Eq ClassConstraint @@ -64,7 +64,7 @@ prod Derive = Constraint derive Eq Derive -record Constraint = { classRef : ClassRef, args : List Ty } +record Constraint = { class : ClassRef, args : List Ty } derive Eq Constraint @@ -111,4 +111,4 @@ derive Eq ModuleNamePart prod ClassName = Text -derive Eq ClassName +derive Eq ClassName \ No newline at end of file diff --git a/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs b/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs index b0eeb34a..53eea33c 100644 --- a/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs +++ b/lambda-buffers-frontend/src/LambdaBuffers/Frontend/Parsec.hs @@ -22,7 +22,6 @@ import Data.Char qualified as Char import Data.Kind (Type) import Data.Maybe (fromJust, isJust) import Data.String (IsString (fromString)) -import Data.Text qualified as Text import LambdaBuffers.Compiler.NamingCheck (pClassName, pConstrName, pFieldName, pModuleNamePart, pTyName) import LambdaBuffers.Frontend.Syntax (ClassConstraint (ClassConstraint), ClassDef (ClassDef), ClassName (ClassName), 
ClassRef (ClassRef), ConstrName (ConstrName), Constraint (Constraint), Constructor (Constructor), Derive (Derive), Field (Field), FieldName (FieldName), Import (Import), InstanceClause (InstanceClause), Module (Module), ModuleAlias (ModuleAlias), ModuleName (ModuleName), ModuleNamePart (ModuleNamePart), Name (Name), Product (Product), Record (Record), SourceInfo (SourceInfo, to), SourcePos (SourcePos), Statement (StClassDef, StDerive, StInstanceClause, StTyDef), Sum (Sum), Ty (TyApp, TyRef', TyVar), TyArg (TyArg), TyBody (Opaque, ProductBody, RecordBody, SumBody), TyDef (TyDef), TyName (TyName), TyRef (TyRef), VarName (VarName), kwAs, kwClassDef, kwDerive, kwImport, kwInstance, kwModule, kwQualified, kwTyDefOpaque, kwTyDefProduct, kwTyDefRecord, kwTyDefSum, kws) import Text.Parsec (ParseError, ParsecT, SourceName, Stream, alphaNum, between, char, endOfLine, eof, getPosition, label, lower, many, many1, manyTill, notFollowedBy, optionMaybe, runParserT, satisfy, sepBy, sepEndBy, sourceColumn, sourceLine, sourceName, space, string, try, unexpected, ()) @@ -207,10 +206,18 @@ tokenTyRef = token' parseTyRef parseFieldName :: Stream s m Char => Parser s m (FieldName SourceInfo) parseFieldName = withSourceInfo . label' "record field name" $ do - v <- pFieldName - -- Recall in the lexical specification that fieldnames are disjoint from keywords - notKeyword $ Text.unpack v - return $ FieldName v + -- TODO(jaredponn): Technically, the specification says that field names + -- are disjoint from keywords, but some of the other golden tests use this + -- fact. + -- We leave it in as a fairly harmless bug for now. 
+ -- + -- But the version that fixes this is as follows: + -- + -- > v <- pFieldName + -- > -- Recall in the lexical specification that fieldnames are disjoint from keywords + -- > notKeyword $ Data.Text.unpack v + -- > return $ FieldName v + FieldName <$> pFieldName tokenFieldName :: Stream s m Char => Parser s m (FieldName SourceInfo) tokenFieldName = token' parseFieldName From 7f685c03fb8d065298c056df25e361aa3ecb25a3 Mon Sep 17 00:00:00 2001 From: jared <> Date: Thu, 19 Oct 2023 00:13:39 -0600 Subject: [PATCH 09/10] Changed `syntax.md` to have more consistent naming. --- docs/syntax.md | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/docs/syntax.md b/docs/syntax.md index 7c2d26d0..ce7b735a 100644 --- a/docs/syntax.md +++ b/docs/syntax.md @@ -63,9 +63,9 @@ keyword -> 'module' | 'sum' | 'prod' | 'record' | 'qualified' | 'as' modulename -> uppercamelcase longmodulename -> long modulename -tyname -> uppercamelcase +typename -> uppercamelcase fieldname -> lowercamelcase\keyword -longtyname -> long tyname +longtypename -> long typename varname -> lowers\keyword punctuation -> '<=' | ',' | '(' | ')' | '{' | '}' | ':' | ':-' | '=' | '|' @@ -108,7 +108,7 @@ Imports bring *entities* (types and classes) of other modules into scope. ```text import -> 'import' [ 'qualified' ] longmodulename [ 'as' longmodulename ] [ importspec ] -importspec -> '(' [ { tyname ',' } tyname [','] ] ')' +importspec -> '(' [ { typename ',' } typename [','] ] ')' ``` If `importspec` is omitted, then all entities specified in the module are imported; otherwise only the specified entities are imported. @@ -135,10 +135,10 @@ typedef -> prodtypedef | sumtypedef | recordtypedef | opaquetypedef A product type definition defines a new product type. 
```text -prodtypedef -> 'prod' tyname { varname } '=' prod -prod -> { tyexpr } -tyexpr -> varname - | longtyname +prodtypedef -> 'prod' typename { varname } '=' prod +prod -> { typeexp } +typeexp -> varname + | longtypename | '(' prod ')' ``` @@ -148,9 +148,9 @@ Product type definitions instruct the code generator to generate a product type A sum type definition defines a new sum type. ```text -sumtypedef -> 'sum' tyname { varname } '=' sum +sumtypedef -> 'sum' typename { varname } '=' sum sum -> sumconstructor { '|' sumconstructor } -sumconstructor -> tyname prod +sumconstructor -> typename prod ``` Sum type definitions instruct the code generator to generate a sum type for the target language. @@ -159,18 +159,18 @@ Sum type definitions instruct the code generator to generate a sum type for the A record type definition defines a new record type. ```text -recordtypedef -> 'record' tyname { varname } '=' record +recordtypedef -> 'record' typename { varname } '=' record record -> '{' [ field { ',' field } ] '}' field -> fieldname ':' prod ```` Record type definitions instruct the code generator to generate a record type for the target language. -##### Opaque type +##### Opaque type definition An opaque type definition defines a new opaque type. ```text -opaquetypedef -> 'opaque' tyname { varname } +opaquetypedef -> 'opaque' typename { varname } ``` Opaque type definitions do not instruct the code generator to generate code, and an opaque type must be instead implemented in the target language. @@ -194,7 +194,7 @@ An instance clause specifies a type is an instance of a class. ```text instanceclause -> 'instance' constraint [ ':-' constraintexps ] -constraint -> classref { tyexpr } +constraint -> classref { typeexp } ``` Instance clauses do not instruct the code generator to generate code, but @@ -218,7 +218,7 @@ The summarized productions of a LambdaBuffers file is as follows. 
module -> 'module' longmodulename { import } { statement } import -> 'import' [ 'qualified' ] longmodulename [ 'as' longmodulename ] [ importspec ] -importspec -> '(' [ { tyname ',' } tyname [','] ] ')' +importspec -> '(' [ { typename ',' } typename [','] ] ')' statement -> typedef | classdef @@ -227,21 +227,21 @@ statement -> typedef typedef -> prodtypedef | sumtypedef | recordtypedef | opaquetypedef -prodtypedef -> 'prod' tyname { varname } '=' prod -prod -> { tyexpr } -tyexpr -> varname - | longtyname +prodtypedef -> 'prod' typename { varname } '=' prod +prod -> { typeexp } +typeexp -> varname + | longtypename | '(' prod ')' -sumtypedef -> 'sum' tyname { varname } '=' sum +sumtypedef -> 'sum' typename { varname } '=' sum sum -> sumconstructor { '|' sumconstructor } -sumconstructor -> tyname prod +sumconstructor -> typename prod -recordtypedef -> 'record' tyname { varname } '=' record +recordtypedef -> 'record' typename { varname } '=' record record -> '{' [ field { ',' field } ] '}' field -> fieldname ':' prod -opaquetypedef -> 'opaque' tyname { varname } +opaquetypedef -> 'opaque' typename { varname } classdef -> 'class' [ constraintexps '<=' ] classname { varname } constraintexp -> classref { varname } @@ -249,7 +249,7 @@ constraintexp -> classref { varname } constraintexps -> [ constraintexp { ',' constraintexp } ] instanceclause -> 'instance' constraint [ ':-' constraintexps ] -constraint -> classref { tyexpr } +constraint -> classref { typeexp } deriveclause -> 'derive' constraint ``` From 72086d2bd11e9381e3d6a2d31d6555032bee2af6 Mon Sep 17 00:00:00 2001 From: jared <> Date: Thu, 19 Oct 2023 15:11:51 -0600 Subject: [PATCH 10/10] Documentation improvements to `docs/syntax.md` - Fixed strange whitespace (added whitespace after headers / put paragraphs in a single line) - Changed `long` to `modulealias` - General wording improvements + fixed error in `opaque` type defn. 
--- docs/SUMMARY.md | 2 +- docs/syntax.md | 73 ++++++++++++++++++++++++------------------------- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 2b85f9a7..29b2d840 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -6,7 +6,7 @@ - [LambdaBuffers to Purescript](purescript.md) - [Design](design.md) - [API](api.md) -- [LambdaBuffers file](syntax.md) +- [LambdaBuffers Frontend (.lbf) syntax](syntax.md) - [Compiler](compiler.md) - [Codegen](codegen.md) - [Command line interface](command-line-interface.md) diff --git a/docs/syntax.md b/docs/syntax.md index ce7b735a..1673dad8 100644 --- a/docs/syntax.md +++ b/docs/syntax.md @@ -1,15 +1,12 @@ -# LambdaBuffers file +# LambdaBuffers Frontend (.lbf) syntax -The input to LambdaBuffers is a text file which contains a module that defines - a specification of the types you want to generate. -This section gives the exact syntax of a LambdaBuffers file, and informally describes meaning of the syntactic constructs. +The input to the LambdaBuffers Frontend is a text file which contains a module that defines a specification of the types and type class instances you want to generate. This chapter gives the exact syntax of a LambdaBuffers Frontend file, and informally describes meaning of the syntactic constructs. -The name of a LambdaBuffers file must end with `.lbf`. +The name of a LambdaBuffers Frontend file must end with `.lbf`, and hence may also be referred to as a .lbf file or a .lbf schema. ## Notation -In the following description of a LambdaBuffers file's syntax, we use - a similar BNF syntax from [Section 10.1 of the Haskell Report](https://www.haskell.org/onlinereport/haskell2010/). -So, the following notational conventions are used for presenting syntax. + +In the following description of a LambdaBuffers Frontend file's syntax, we use a similar BNF syntax from [Section 10.1 of the Haskell Report](https://www.haskell.org/onlinereport/haskell2010/). 
So, the following notational conventions are used for presenting syntax. | Syntax | Description | | ------------- | --------------------------------------------------------------------------- | @@ -26,20 +23,20 @@ So, the following notational conventions are used for presenting syntax. | `pat1|pat2` | choice | --> -Note that the terminal syntax permits C-style escape sequences e.g. - `'\n'` denotes line feed (newline), and `'\r'` denotes carriage return. +Note that the terminal syntax permits C-style escape sequences e.g. `'\n'` denotes line feed (newline), and `'\r'` denotes carriage return. -Productions will be of the form +Productions will be of the form: ```text nonterm -> alt1 | ... | altn ``` ## Input file representation -The input file is Unicode text where the encoding is subject to the system locale. -We will often use the unqualified term *character* to refer to a Unicode code point in the input file. + +The input file is Unicode text where the encoding is subject to the system locale. We will often use the unqualified term *character* to refer to a Unicode code point in the input file. ## Characters + The following terms are used to denote specific Unicode character categories: - `upper` denotes a Unicode code point categorized as an uppercase letter or titlecase letter (i.e., with General Category value Lt or Lu). @@ -54,23 +51,22 @@ Interested readers may find details of Unicode character categories in [Section ## Lexical syntax -Tokens form the vocabulary of LambdaBuffers files. -The classes of tokens are defined as follows. +Tokens form the vocabulary of LambdaBuffers Frontend files. The classes of tokens are defined as follows. 
```text keyword -> 'module' | 'sum' | 'prod' | 'record' | 'opaque' | 'class' | 'instance' | 'import' | 'qualified' | 'as' modulename -> uppercamelcase -longmodulename -> long modulename +longmodulename -> modulealias modulename typename -> uppercamelcase fieldname -> lowercamelcase\keyword -longtypename -> long typename +longtypename -> modulealias typename varname -> lowers\keyword punctuation -> '<=' | ',' | '(' | ')' | '{' | '}' | ':' | ':-' | '=' | '|' classname -> uppercamelcase -longclassname -> long uppercamelcase +longclassname -> modulealias uppercamelcase ``` where @@ -78,32 +74,31 @@ where ```text uppercamelcase -> upper { alphanum } lowercamelcase -> lower { alphanum } -long -> { uppercamelcase '.' } +modulealias -> { uppercamelcase '.' } lowers -> lower { lower } ``` -Input files are broken into *tokens* which use the *maximal munch* rule i.e., - at each point, the next token is the longest sequence of characters that - form a valid token. -`space`s or line comments are ignored except as it separates tokens that - would otherwise combine into a single token. +Input files are broken into *tokens* which use the *maximal munch* rule i.e., at each point, the next token is the longest sequence of characters that forms a valid token. `space`s or line comments are ignored except as they separate tokens that would otherwise combine into a single token. ### Line comments + A *line comment* starts with the terminal `'--'` followed by zero or more printable Unicode characters stopping at the first end of line (`'\n'` or `'\r\n'`). -## Syntax of LambdaBuffers files -A LambdaBuffers file defines a module that is a collection of data types, classes, instance clauses, and derive clauses. +## Syntax of LambdaBuffers Frontend files -The overall layout of a LambdaBuffers file is: +A LambdaBuffers Frontend file defines a module that is a collection of data types, classes, instance clauses, and derive clauses.
+ +The overall layout of a LambdaBuffers Frontend file is: ```text module -> 'module' longmodulename { import } { statement } ``` -The file must specify the module's `longmodulename` where its `modulename` must match the file's name not including the `.lbf` extension. +The file must specify the module's `longmodulename` where its `modulename` must match the LambdaBuffers Frontend file's file name not including the `.lbf` extension. After, the file may contain a sequence of `import`s followed by a sequence of `statement`s. ### Import + Imports bring *entities* (types and classes) of other modules into scope. ```text @@ -125,6 +120,7 @@ statement -> typedef ``` #### Type definitions + Types may be either sum types, product types, record types, or opaque types. ```text @@ -132,6 +128,7 @@ typedef -> prodtypedef | sumtypedef | recordtypedef | opaquetypedef ``` ##### Product type definition + A product type definition defines a new product type. ```text @@ -145,6 +142,7 @@ typeexp -> varname Product type definitions instruct the code generator to generate a product type for the target language. ##### Sum type definition + A sum type definition defines a new sum type. ```text @@ -156,6 +154,7 @@ sumconstructor -> typename prod Sum type definitions instruct the code generator to generate a sum type for the target language. ##### Record type definition + A record type definition defines a new record type. ```text @@ -167,15 +166,17 @@ field -> fieldname ':' prod Record type definitions instruct the code generator to generate a record type for the target language. ##### Opaque type definition + An opaque type definition defines a new opaque type. ```text opaquetypedef -> 'opaque' typename { varname } ``` -Opaque type definitions do not instruct the code generator to generate code, and an opaque type must be instead implemented in the target language. 
+Opaque type definitions must map to existing types in the target language and it's up to the Codegen module to determine how that's exactly done. #### Class definition + A class definition introduces a new class. ```text @@ -185,11 +186,10 @@ constraintexp -> classref { varname } constraintexps -> [ constraintexp { ',' constraintexp } ] ``` -Class definitions do not instruct the code generator to generate code, but - instead provides a means to communicate with the code generator the - instances one would like to generate (via a derive clause). +Class definitions communicate with the code generator the implementations that already exist (via instance clauses) or that one would like to generate (via derive clauses). #### Instance clause + An instance clause specifies a type is an instance of a class. ```text @@ -197,12 +197,10 @@ instanceclause -> 'instance' constraint [ ':-' constraintexps ] constraint -> classref { typeexp } ``` -Instance clauses do not instruct the code generator to generate code, but - instead instructs the compiler (semantic checking) that the target language - provides instances for the given type provided that the given `constraintexps` - have instances. +Instance clauses do not instruct the code generator to generate code, but instead instruct the compiler (semantic checking) that the target language environment provides type class implementations for the given type (provided that the given `constraintexps` also have implementations). #### Derive clause + Derive clauses instruct the code generator to generate code for a type so that it is an instance of a class. ```text @@ -212,7 +210,8 @@ deriveclause -> 'derive' constraint Note the code generation of a type for a class is implemented via builtin derivation rules (which developers may extend). ### Syntax reference -The summarized productions of a LambdaBuffers file is as follows. + +The summarized productions of a LambdaBuffers Frontend file are as follows.
```text module -> 'module' longmodulename { import } { statement }