From 97460a6bd293bcb76d8fa12ba3d4ccb0dc9310f6 Mon Sep 17 00:00:00 2001 From: connorwalsh Date: Mon, 2 Apr 2018 21:55:58 -0400 Subject: [PATCH] begin working on tokenizing identifiers Git issue references: Gamelan music currently playing: Gendhing Rangu-Rangu Co-authored-by: Mouse Reeve --- forest.go | 6 +++++- lexer.go | 36 ++++++++++++++++++++++++++++++++++++ lexer_test.go | 32 ++++++++++++++++++++++++++++++++ symbols.go | 8 ++++++++ test/identifiers.doc | 1 + test/test_identifiers.doc | 1 + test/variable.doc | 4 ++++ token.go | 3 ++- 8 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 test/identifiers.doc create mode 100644 test/test_identifiers.doc create mode 100644 test/variable.doc diff --git a/forest.go b/forest.go index 7914733..9111ecc 100644 --- a/forest.go +++ b/forest.go @@ -28,6 +28,10 @@ type Expr struct { Globals map[string]AST } +// def f(): +// 1 + 1 +// return 2 + 2 + // evaluate an expression func (e *Expr) Eval() (string, error) { execData := &ExecutionData{ @@ -65,7 +69,7 @@ type Variable struct { func (v *Variable) Eval() (string, error) { return executer.Run( &ExecutionData{ - ComputationType: VARIABLE, + ComputationType: VARIABLE_IDENTIFIER, }, ) } diff --git a/lexer.go b/lexer.go index 06151d1..5b7b2a9 100644 --- a/lexer.go +++ b/lexer.go @@ -51,6 +51,9 @@ func (c *Compterpreter) GetNextToken() (Token, error) { case c.IsNumber(c.CurrentChar): // get full multidigit number token err = c.TokenizeNumber(c.CurrentChar) + case c.IsIdentifierFirstSymbol(c.CurrentChar): + // is it a keyword? + // is it a function/variable identifier? 
case c.IsPunctuation(c.CurrentChar): err = c.TokenizePunctuation(c.CurrentChar) default: @@ -82,6 +85,14 @@ func (c *Compterpreter) IsNumber(r rune) bool { return unicode.IsDigit(r) } +func (c *Compterpreter) IsIdentifierFirstSymbol(r rune) bool { + return VALID_IDENTIFIER_FIRST_SYMBOL.MatchString(string(r)) +} + +func (c *Compterpreter) IsIdentifier(r rune) bool { + return VALID_IDENTIFIER_SYMBOL.MatchString(string(r)) +} + func (c *Compterpreter) IsOperator(r rune) bool { for _, symbol := range c.Symbols.Operators { if string(r) == symbol { @@ -136,6 +147,31 @@ func (c *Compterpreter) TokenizeNumber(r rune) error { return nil } +func (c *Compterpreter) TokenizeIdentifier(r rune) error { + c.CurrentToken.Type = IDENTIFIER + c.CurrentToken.Value = c.CurrentToken.Value + string(r) + + // check to see if we need to include the next character in the + // current token + if err := c.Advance(); err != nil { + return err + } + + if c.IsIdentifier(c.CurrentChar) { + c.TokenizeIdentifier(c.CurrentChar) + } + + // at this point, we have our current token, but we want to + // check whether it is a keyword of an identifier + for _, keyword := range c.Symbols.Keywords { + if c.CurrentToken.Value == keyword { + c.CurrentToken.Type = KEYWORD + } + } + + return nil +} + func (c *Compterpreter) TokenizeOperator(r rune) error { c.CurrentToken.Type = OPERATOR c.CurrentToken.Value = c.CurrentToken.Value + string(r) diff --git a/lexer_test.go b/lexer_test.go index 64b0d52..a925252 100644 --- a/lexer_test.go +++ b/lexer_test.go @@ -60,6 +60,19 @@ func (s *LexerSuite) TestIsOperator() { } } +func (s *LexerSuite) TestIsIdentifierFirstSymbol() { + conf := &Config{SrcFileName: "test/test.doc"} + compt := NewCompterpreter(conf) + for _, operator := range []rune{'a', 'A', 'z', 'Z', '_'} { + ok := compt.IsIdentifierFirstSymbol(operator) + s.True(ok) + } + for _, operator := range []rune{'❧', '0', ' '} { + ok := compt.IsIdentifierFirstSymbol(operator) + s.False(ok) + } +} + func (s 
*LexerSuite) TestIsPunctuation() { conf := &Config{SrcFileName: "test/test.doc"} compt := NewCompterpreter(conf) @@ -92,6 +105,25 @@ func (s *LexerSuite) TestTokenizeOperator() { } } +func (s *LexerSuite) TestTokenizeIdentifier() { + conf := &Config{SrcFileName: "test/test_identifiers.doc"} + compt := NewCompterpreter(conf) + + err := compt.LoadSourceCode() + s.NoError(err) + + compt.Advance() + // advance ptr to first character + for _, op := range []string{"myVariable"} { + compt.CurrentToken = Token{} + compt.TokenizeOperator(compt.CurrentChar) + if string(compt.CurrentChar) == "EOF" { + break + } + s.EqualValues(compt.CurrentToken.Value, op) + } +} + func (s *LexerSuite) TestLex() { conf := &Config{SrcFileName: "test/test_tokenize.doc"} compt := NewCompterpreter(conf) diff --git a/symbols.go b/symbols.go index 83cceeb..53363bd 100644 --- a/symbols.go +++ b/symbols.go @@ -1,5 +1,7 @@ package dockerlang +import "regexp" + const ( ADDITION_OPERATOR = "+" SUBTRACTION_OPERATOR = "†" @@ -13,6 +15,9 @@ const ( R_PAREN_PUNCTION = "(" L_PAREN_PUNCUTATION = ")" + + VARIABLE_IDENTIFIER = "VARIABLE_IDENTIFIER" + FUNCTION_IDENTIFIER = "FUNCTION_IDENTIFIER" ) var ( @@ -27,6 +32,9 @@ var ( EXIT_OPERATOR: 1, NOOP: 1, } + + VALID_IDENTIFIER_FIRST_SYMBOL = regexp.MustCompile("[a-zA-Z_]") + VALID_IDENTIFIER_SYMBOL = regexp.MustCompile("[a-zA-Z_\\d]") ) // all the language-defined tokens in dockerlang diff --git a/test/identifiers.doc b/test/identifiers.doc new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/test/identifiers.doc @@ -0,0 +1 @@ + diff --git a/test/test_identifiers.doc b/test/test_identifiers.doc new file mode 100644 index 0000000..c9d11f5 --- /dev/null +++ b/test/test_identifiers.doc @@ -0,0 +1 @@ +myVariable diff --git a/test/variable.doc b/test/variable.doc new file mode 100644 index 0000000..437c0e4 --- /dev/null +++ b/test/variable.doc @@ -0,0 +1,4 @@ +(≡ aVariable 0) + +if asdf +ifIamKewl diff --git a/token.go b/token.go index cd622c1..57f4828 
100644 --- a/token.go +++ b/token.go @@ -2,7 +2,8 @@ package dockerlang const ( OPERATOR = "OPERATOR" - VARIABLE = "VARIABLE" + IDENTIFIER = "IDENTIFIER" + KEYWORD = "KEYWORD" INT = "INTEGER" PUNCTUATION = "PUNCTUATION" // parens )