Lexer skeleton
Nercury committed Nov 23, 2015
1 parent 7ce79b1 commit 14daf30
Showing 1 changed file with 161 additions and 2 deletions: src/lib.rs
@@ -1,3 +1,162 @@
#[cfg(test)]
mod test {
    use tokens::{ Lexer, LexerOptions };

    #[test]
    fn lexer_usage() {
        // build the lexer once for the project environment, with extensions.
        let lexer = Lexer::new(LexerOptions::default(), vec![]);

        // use it many times.
        {
            let source: String = "{{ var }}".into();
            for token in lexer.tokens(&source) {
                println!("{:?}", token);
            }
        }
    }
}

// I don't know where to put this, keeping it in root for now.
#[derive(Debug, Default, Clone)]
pub struct Position {
    pub line: usize,
    pub column: usize,
}

// Named this "tokens", in plural, to mean a place where you should expect
// to find your tokens.
// A similar convention would work for "nodes" (that's where the AST lives), and
// possibly "instructions", if we decide to go that route.
pub mod tokens {

    // It is possible to delay all string manipulation until later:
    // we can simply store slices into the original, full source string.
    //
    // We can keep doing that even for Node<'a>, provided the strings remain untouched.
    // If something needs to be changed, we can create a special Node for that.
    #[derive(Debug)]
    pub enum TokenRef<'a> {
        Text(&'a str),
    }

    impl<'a> TokenRef<'a> {
        // Didn't name this "into", because Gankro criticises using `into` for anything
        // more than moving data around. Didn't name it `into_owned`, because we don't
        // implement the ToOwned trait. So the only logical name left is `into_token`.
        pub fn into_token(self) -> Token {
            match self {
                TokenRef::Text(v) => Token::Text(v.into()),
            }
        }
    }
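
    // A quick illustration of the intended round-trip (just a sketch, not part of
    // the API): a `TokenRef` borrows from the source string, and `into_token`
    // produces an owned `Token` that can outlive that source.
    //
    //     let source = String::from("{{ var }}");
    //     let borrowed = TokenRef::Text(&source[3..6]); // borrows "var"
    //     let owned = borrowed.into_token();            // owned copy, independent of `source`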

    // This will be used when we need to carry a token for longer than the original
    // source string lives, for example in error messages.
    pub enum Token {
        Text(String),
    }
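
    // For instance (hypothetical, since error handling is not merged yet), an error
    // type could hold an owned `Token` together with the root `Position`:
    //
    //     pub struct UnexpectedToken {
    //         pub found: Token,
    //         pub at: Position,
    //     }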

    // Not pub, to make the API more convenient.
    mod lexing {
        use Position;
        use tokens::TokenRef;



        /// TokenRef wrapper for `Lexer` that additionally carries a position.
        #[derive(Debug)]
        pub struct ItemRef<'t> {
            pub token: TokenRef<'t>,
            pub position: Position,
        }



        // TBD simple lexer options (delimiters, whitespace, etc.).
        #[derive(Copy, Clone)]
        pub struct Options;

        impl Options {
            pub fn default() -> Options { Options }
        }
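
        // A sketch of fields these options might eventually grow (the names are
        // hypothetical, loosely mirroring Twig's delimiter settings):
        //
        //     pub struct Options {
        //         pub tag_variable: (&'static str, &'static str), // ("{{", "}}")
        //         pub tag_block: (&'static str, &'static str),    // ("{%", "%}")
        //         pub tag_comment: (&'static str, &'static str),  // ("{#", "#}")
        //         pub whitespace_trim: &'static str,              // "-"
        //     }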



        // I will be referring to 't as the template lifetime and 'i as the iteration lifetime.
        // This lexer should be reusable between `tokenize` calls.
        // In addition to this I had a `LexingEnvironment`, but it turned out to be redundant.
        pub struct Lexer;

        impl Lexer {
            // It's someone else's responsibility to take operators from extensions,
            // resolve any conflicts, and compile the final "operators" list.
            //
            // It looks like the Lexer does not care whether they are unary or binary; that
            // will become important in the parser.
            //
            // Funny note: I found that "=" is considered neither unary nor binary ;)
            pub fn new(options: Options, operators: Vec<&'static str>) -> Lexer {
                // Here we will create patterns (I called them matchers), and
                // store them in the Lexer.
                Lexer
            }
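
            // For example (this operator list is purely illustrative):
            //
            //     let lexer = Lexer::new(Options::default(), vec!["+", "-", "*", "/", "not"]);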

            // twig-rust: https://github.com/colin-kiegel/twig-rust/blob/master/src/lexer/mod.rs#L64
            // twig-rs: https://github.com/Nercury/twig-rs/blob/master/src/tokens/lexer/mod.rs#L40
            //
            // I think it is possible to avoid Template::Raw in the lexer API.
            // We can probably deal with newlines in the patterns?
            // Also maybe we won't need to fix line endings, but right now we both do that.
            //
            // The twig-rs result was "Iter", the twig-rust one "Job" :)
            //
            // I changed it to a concrete "Tokens" for now, which will implement Iterator.
            // No Result. Let's avoid lexing until the Parser requests the first token.
            pub fn tokens<'i, 't>(&'i self, code: &'t str) -> Tokens<'i, 't> {
                // Just take the whole lexer by reference ;)
                Tokens::new(self, code)
            }
        }
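
        // Reuse between templates would then look like this (a sketch, using only
        // the API above):
        //
        //     let lexer = Lexer::new(Options::default(), vec![]);
        //     for token in lexer.tokens("{{ a }}") { /* ... */ }
        //     for token in lexer.tokens("{{ b }}") { /* ... */ }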



        // 'i is the iteration lifetime, or "one use of the lexer".
        // 't is the template lifetime. It will live longer than this iteration.
        pub struct Tokens<'i, 't> {
            env: &'i Lexer,
            code: &'t str,
        }

        impl<'i, 't> Tokens<'i, 't> {

            pub fn new<'ii, 'tt>(lexer: &'ii Lexer, code: &'tt str) -> Tokens<'ii, 'tt> {
                Tokens {
                    env: lexer,
                    code: code,
                }
            }
        }

        // I think we can avoid storing all tokens in a Vec, and instead just keep the
        // next chunk of lexed tokens in memory.
        impl<'i, 't> Iterator for Tokens<'i, 't> {
            // TODO: Use a proper Result once we merge error handling.
            type Item = Result<ItemRef<'t>, ()>;

            fn next(&mut self) -> Option<Result<ItemRef<'t>, ()>> {

                return None;
            }
        }
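
        // Roughly how `next` could proceed once lexing lands (a hypothetical outline,
        // in line with the "next chunk" idea above):
        //
        //     1. if `self.code` is exhausted, return `None`;
        //     2. run the matchers stored in `self.env` against the remaining input;
        //     3. slice the matched text out of `self.code`, advance the cursor and the
        //        `Position`, and yield `Some(Ok(ItemRef { token, position }))`.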

    }

    pub use self::lexing::{
        Lexer,
        Tokens,
        ItemRef as LexerItemRef,
        Options as LexerOptions,
    };
}
