Showing 1 changed file with 161 additions and 2 deletions.
@@ -1,3 +1,162 @@
-#[test]
-fn it_works() {
#[cfg(test)]
mod test {
    use tokens::{ Lexer, LexerOptions };

    #[test]
    fn lexer_usage() {
        // Build the lexer once for the project environment, with extensions.
        let lexer = Lexer::new(LexerOptions::default(), vec![]);

        // Use it many times.
        {
            let source: String = "{{ var }}".into();
            for token in lexer.tokens(&source) {
                println!("{:?}", token);
            }
        }
    }
}

// I don't know where to put this, keeping it in the root for now.
#[derive(Debug, Default, Clone)]
pub struct Position {
    pub line: usize,
    pub column: usize,
}

// Named this "tokens", in plural, to mean a place where you should expect
// to find your tokens.
// A similar convention would work for "nodes" (that's where the AST lives), and possibly
// for "instructions", if we decide to go that route.
pub mod tokens {

    // It is possible to delay all string manipulation until later,
    // and simply store slices into the original full source string.
    //
    // We can keep doing that even for Node<'a>, provided the strings remain untouched.
    // If something needs to be changed, we can create a special Node for that.
    #[derive(Debug)]
    pub enum TokenRef<'a> {
        Text(&'a str),
    }

    impl<'a> TokenRef<'a> {
        // Not named "into", because Gankro criticises using "into" for anything more than
        // moving data around. Not named "into_owned", because we don't implement the ToOwned trait.
        // So the only logical name left is `into_token`.
        pub fn into_token(self) -> Token {
            match self {
                TokenRef::Text(v) => Token::Text(v.into()),
            }
        }
    }

    // This will be used when we need to keep a token around longer than the original
    // source string, for example, in error messages.
    pub enum Token {
        Text(String),
    }
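
    // A small, illustrative helper (not part of the original commit; the name `describe`
    // is made up): converting a borrowed `TokenRef` into an owned `Token` lets, say, an
    // error message keep the text after the source string is gone.
    pub fn describe(token_ref: TokenRef) -> String {
        match token_ref.into_token() {
            Token::Text(text) => format!("text token {:?}", text),
        }
    }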

    // Not pub, to make the API more convenient.
    mod lexing {
        use Position;
        use tokens::TokenRef;

        /// TokenRef wrapper for `Lexer` that additionally carries a position.
        #[derive(Debug)]
        pub struct ItemRef<'t> {
            pub token: TokenRef<'t>,
            pub position: Position,
        }

        // TBD: simple lexer options (delimiters, whitespace, etc.).
        #[derive(Copy, Clone)]
        pub struct Options;

        impl Options {
            pub fn default() -> Options { Options }
        }
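
        // A hypothetical shape for `Options` once delimiters are decided on; the field
        // names below are assumptions for illustration, not part of the original commit:
        //
        //     pub struct Options {
        //         pub var_start: &'static str,   // e.g. "{{"
        //         pub var_end: &'static str,     // e.g. "}}"
        //         pub block_start: &'static str, // e.g. "{%"
        //         pub block_end: &'static str,   // e.g. "%}"
        //     }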

        // I will be referring to 't as the template lifetime, and to 'i as the iteration lifetime.
        // This lexer should be reusable between `tokenize` calls.
        // In addition to this I had a `LexingEnvironment`, but it turned out to be redundant.
        pub struct Lexer;

        impl Lexer {
            // It is the responsibility of someone else to take operators from extensions,
            // resolve any conflicts and compile the final "operators" list.
            //
            // It looks like the Lexer does not care whether they are unary or binary; that will
            // become important in the parser.
            //
            // Funny note: I found that "=" is considered neither unary nor binary ;)
            pub fn new(options: Options, operators: Vec<&'static str>) -> Lexer {
                // Here we will create the patterns (I called them matchers), and
                // store them in the Lexer.
                Lexer
            }
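
            // For illustration only (not part of the original commit): a caller that has
            // already merged operators from extensions might construct the lexer like
            //
            //     let lexer = Lexer::new(Options::default(), vec!["not", "+", "-", "=="]);
            //
            // The operator strings above are made-up examples.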

            // twig-rust: https://github.com/colin-kiegel/twig-rust/blob/master/src/lexer/mod.rs#L64
            // twig-rs: https://github.com/Nercury/twig-rs/blob/master/src/tokens/lexer/mod.rs#L40
            //
            // I think it is possible to avoid the Template::Raw in the lexer API.
            // We can probably deal with newlines in the patterns?
            // Also, maybe we won't need to fix line endings, but right now we both do that.
            //
            // The twig-rs result was "Iter"; the twig-rust one was "Job" :)
            //
            // I changed it to a concrete "Tokens" for now, which will implement Iterator.
            // No Result. Let's avoid lexing until the Parser requests the first token.
            pub fn tokens<'i, 't>(&'i self, code: &'t str) -> Tokens<'i, 't> {
                // Just take the whole lexer by reference ;)
                Tokens::new(self, code)
            }
        }

        // 'i is the iteration lifetime, or "one use of the lexer".
        // 't is the template lifetime. It will live longer than this iteration.
        pub struct Tokens<'i, 't> {
            env: &'i Lexer,
            code: &'t str,
        }

        impl<'i, 't> Tokens<'i, 't> {

            pub fn new<'ii, 'tt>(lexer: &'ii Lexer, code: &'tt str) -> Tokens<'ii, 'tt> {
                Tokens {
                    env: lexer,
                    code: code,
                }
            }
        }

        // I think we can avoid storing all tokens in a Vec, and instead keep only the next
        // chunk of lexed tokens in memory.
        impl<'i, 't> Iterator for Tokens<'i, 't> {
            // TODO: Use the proper Result once we merge error handling.
            type Item = Result<ItemRef<'t>, ()>;

            fn next(&mut self) -> Option<Result<ItemRef<'t>, ()>> {

                return None;
            }
        }
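
        // A minimal sketch (an assumption, not part of the original commit; the method name
        // `next_text_chunk` is made up) of the lazy behaviour `next` could have before real
        // matching lands: emit whatever is left of `code` as a single Text token, then stop.
        impl<'i, 't> Tokens<'i, 't> {
            fn next_text_chunk(&mut self) -> Option<Result<ItemRef<'t>, ()>> {
                if self.code.is_empty() {
                    return None;
                }
                // Hand out the remaining slice and leave nothing for the next call.
                let text = self.code;
                self.code = "";
                Some(Ok(ItemRef {
                    token: TokenRef::Text(text),
                    position: Position::default(),
                }))
            }
        }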

    }

    pub use self::lexing::{
        Lexer,
        Tokens,
        ItemRef as LexerItemRef,
        Options as LexerOptions,
    };
}