From d2dcdcf6572a48149666e8ed1b7acad90426e268 Mon Sep 17 00:00:00 2001
From: BlackBeans <adrien.mathieu.net@gmail.com>
Date: Thu, 5 Jul 2018 19:58:30 -0400
Subject: [PATCH] [gmrreader] Lexer generator | [lexer.gmr] full lexer grammar
 | [parser.gmr] first part of parser grammar

---
 beansast/gmrreader.py | 85 +++++++++++++++++++++++++++++++++++++++++++
 beansast/lexer.gmr    | 66 +++++++++++++++++++++++++++++++++
 beansast/parser.gmr   | 27 ++++++++++++++
 3 files changed, 178 insertions(+)
 create mode 100644 beansast/gmrreader.py
 create mode 100644 beansast/lexer.gmr
 create mode 100644 beansast/parser.gmr
diff --git a/beansast/gmrreader.py b/beansast/gmrreader.py
new file mode 100644
index 0000000..6e18f57
--- /dev/null
+++ b/beansast/gmrreader.py
@@ -0,0 +1,85 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+import re
+
+class Token:
+    """A lexed token: a kind ``name`` plus the keyword attributes given for it."""
+    def __init__(self, name, **attributes):
+        self.name, self.attributes = name, attributes
+
+class Tokenizer:
+    """A named, compiled lexer rule; call it to try the rule at a position."""
+    def __init__(self, name, rule, ignore=False):
+        self.name, self.rule, self.ignore = name, re.compile(rule), ignore
+    def __call__(self, flux, pos):
+        """Return (matched, next_pos, token); token is None if unmatched or ignored."""
+        result = self.rule.match(flux[pos:])
+        if not result:
+            return False, pos, None
+        # end() is an offset into the flux[pos:] slice, so rebase it onto flux.
+        end = pos + result.end()
+        return True, end, None if self.ignore else Token(self.name, **result.groupdict())
+    def __repr__(self):
+        return "<Tokenizer named %s with rule %s %s>" % (self.name, self.rule, "- ignored" * int(self.ignore))
+
+class LexerReader:
+    """Reads ``[ignore ]NAME ::= regex`` lines and builds one Tokenizer per rule."""
+    def __init__(self, inp):
+        self.inp = inp
+        self.pos = 0
+    def read(self):
+        """Parse all of ``self.inp`` and return a ``{name: Tokenizer}`` dict."""
+        tokenizers = {}
+        while True:
+            self.pos = self.ignore_lines(self.pos)
+            if self.pos >= len(self.inp):
+                break  # nothing but blank space left, so there is no rule to read
+            self.pos, ignore = self.read_ignore(self.pos)
+            self.pos, name = self.read_name(self.pos)
+            self.pos = self.ignore_assignment(self.ignore_spaces(self.pos))
+            self.pos, rule = self.read_rule(self.ignore_spaces(self.pos))
+            tokenizers[name] = Tokenizer(name, rule, ignore)
+        return tokenizers
+    def read_ignore(self, pos):
+        """Consume an optional leading ``ignore `` keyword; return (pos, found)."""
+        if self.inp.startswith("ignore ", pos):
+            return pos + len("ignore "), True
+        return pos, False
+    def read_name(self, pos):
+        """Return (pos, name); the name stops at a blank, a newline or end of input."""
+        start, maxsize = pos, len(self.inp)
+        while pos < maxsize and self.inp[pos] not in {" ", "\t", "\n"}:
+            pos += 1
+        return pos, self.inp[start:pos]
+    def ignore_assignment(self, pos):
+        """Step over the mandatory ``::=``; raise SyntaxError when it is absent."""
+        if self.inp.startswith("::=", pos):
+            return pos + len("::=")
+        raise SyntaxError("syntax is wrong at character %s of line %s" % pos2coords(pos, self.inp))
+    def read_rule(self, pos):
+        """Return (index just past the line, regex text up to the end of the line)."""
+        end = self.inp.find('\n', pos)
+        if end < 0:
+            end = len(self.inp)  # in case it doesn't end with \n
+        return end + 1, self.inp[pos:end]
+    def _skip(self, pos, chars):
+        """Return the first index at or after ``pos`` whose character is not in ``chars``."""
+        while pos < len(self.inp) and self.inp[pos] in chars:
+            pos += 1
+        return pos
+    def ignore_spaces(self, pos):
+        return self._skip(pos, {" ", "\t"})
+    def ignore_lines(self, pos):
+        return self._skip(pos, {" ", "\t", "\n"})
+
+def pos2coords(pos, flux):
+    """Convert a flat offset into 1-based (column, line) coordinates.
+
+    ``pos`` is an index into ``flux``; only the text before it is examined.
+    """
+    before = flux[:pos]
+    last_newline = before.rfind('\n')
+    # Column counts from the character after the previous newline; when there
+    # is none, rfind gives -1 and the column is simply len(before) + 1.
+    return len(before) - last_newline, before.count('\n') + 1
diff --git a/beansast/lexer.gmr b/beansast/lexer.gmr
new file mode 100644
index 0000000..81422ca
--- /dev/null
+++ b/beansast/lexer.gmr
@@ -0,0 +1,66 @@
+IF ::= if
+ELSE ::= else
+ELIF ::= elif
+WHILE ::= while
+FOR ::= for
+FUNCTION ::= function
+CLASS ::= class
+METHOD ::= method
+MODULE ::= module
+FROM ::= from
+IMPORT ::= import
+DATA ::= data
+SPACENAME ::= spacename
+PACKAGE ::= package
+PUBLIC ::= public
+PRIVATE ::= private
+
+EOF ::= \Z
+STRING ::= "(?P<value>((\\")|[^"])*)"
+CHAR ::= '(?P<value>((\\')|[^'])?)'
+INT ::= (?P<value>\d+\.?)
+FLOAT ::= (?P<value>\d*\.\d+)
+ignore SPACE ::= [\t ]
+ignore EOL ::= $
+ID ::= (?P<name>\w+)
+
+DOT ::= [.]
+
+LPAR ::= \(
+RPAR ::= \)
+LBRACE ::= {
+RBRACE ::= }
+LBRACKET ::= \[
+RBRACKET ::= \]
+
+EQ ::= ==
+GT ::= >
+LT ::= <
+GE ::= >=
+LE ::= <=
+NE ::= !=
+
+IN ::= in
+
+AND ::= and
+OR ::= or
+NOT ::= not
+
+EQUALS ::= =
+
+PLUS ::= \+
+MINUS ::= -
+ASTERISK ::= \*
+DASTERISK ::= \*\*
+DSLASH ::= //
+SLASH ::= /
+PERCENTAGE ::= %
+
+COLON ::= :
+GRAVE ::= `
+DGRAVE ::= ``
+SEMICOLON ::= ;
+COMMA ::= ,
+TILDE ::= ~
+EXCLAMATION ::= !
+QUESTION ::= \?
\ No newline at end of file
diff --git a/beansast/parser.gmr b/beansast/parser.gmr
new file mode 100644
index 0000000..af38ffc
--- /dev/null
+++ b/beansast/parser.gmr
@@ -0,0 +1,27 @@
+Expression ::=
+  NUMBER@value <.>
+ : ID@value <.>
+ : STRING@value <.>
+ : FLOAT@value <.>
+ : LPAR Expression@value RPAR <.>
+ : Expression@var LPAR ExpressionList?@arguments RPAR <. + {"op": "call"}>
+ : Expression@var LBRACKET Expression@argument RBRACKET <. + {"op": "subscription"}>
+ : Expression@father DOT@op Expression@child <.>
+ : (PLUS|MINUS)@op Expression@value <.>
+ : Expression@left DASTERISK@op Expression@right <.>
+ : Expression@left (ASTERISK|SLASH|DSLASH|PERCENTAGE)@op Expression@right <.>
+ : Expression@left (PLUS|MINUS)@op Expression@right <.>
+ : Expression@left (IN|EQ|GT|LT|GE|LE|NE)@op Expression@right <.>
+ : NOT@op Expression@value <.>
+ : AND@op Expression@value <.>
+ : OR@op Expression@value <.>;
+
+ExpressionList ::= Expression@value <{"value": [value]}>
+ : ExpressionList@values COMMA Expression@value <{"value": values["value"] + [value]}>;
+
+Assignment ::= ID@key EQUALS Expression@value <.>;
+FunctionCall ::= Expression@var LPAR ExpressionList?@arguments RPAR <.>;
+
+Statement ::=
+ Assignment
+ :FunctionCall;