Showing 1 changed file with 161 additions and 2 deletions.
@@ -1,3 +1,162 @@
-#[test]
-fn it_works() {
#[cfg(test)]
mod test {
    use tokens::{ Lexer, LexerOptions };

    #[test]
    fn lexer_usage() {
        // Build the lexer once for the project environment, with extensions.
        let lexer = Lexer::new(LexerOptions::default(), vec![]);

        // Use it many times.
        {
            let source: String = "{{ var }}".into();
            for token in lexer.tokens(&source) {
                println!("{:?}", token);
            }
        }
    }
}

// I don't know where to put this, keeping it in the root for now.
#[derive(Debug, Default, Clone)]
pub struct Position {
    pub line: usize,
    pub column: usize,
}

// Named this "tokens", in plural, to mean a place where you should expect
// to find your tokens.
// A similar convention would work for "nodes" (that's where the AST lives), and possibly
// for "instructions", if we decide to go that route.
pub mod tokens {

    // It is possible to delay all string manipulation until later,
    // and simply store slices into the original full source string.
    //
    // We can keep doing that even for Node<'a>, provided the strings remain untouched.
    // If something needs to be changed, we can create a special Node for that.
    #[derive(Debug)]
    pub enum TokenRef<'a> {
        Text(&'a str),
    }

    impl<'a> TokenRef<'a> {
        // Not named "into", because Gankro criticises using "into" for anything more than
        // moving data around. Not named "into_owned", because we don't implement the ToOwned trait.
        // So the only logical name left is `into_token`.
        pub fn into_token(self) -> Token {
            match self {
                TokenRef::Text(v) => Token::Text(v.into()),
            }
        }
    }

    // This will be used when we need to keep a token around longer than the original
    // source string, for example, in error messages.
    pub enum Token {
        Text(String),
    }
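
    // A small, illustrative helper (not part of the original commit; the name `describe`
    // is made up): converting a borrowed `TokenRef` into an owned `Token` lets, say, an
    // error message keep the text after the source string is gone.
    pub fn describe(token_ref: TokenRef) -> String {
        match token_ref.into_token() {
            Token::Text(text) => format!("text token {:?}", text),
        }
    }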

    // Not pub, to make the API more convenient.
    mod lexing {
        use Position;
        use tokens::TokenRef;

        /// TokenRef wrapper for `Lexer` that additionally carries a position.
        #[derive(Debug)]
        pub struct ItemRef<'t> {
            pub token: TokenRef<'t>,
            pub position: Position,
        }

        // TBD: simple lexer options (delimiters, whitespace, etc.).
        #[derive(Copy, Clone)]
        pub struct Options;

        impl Options {
            pub fn default() -> Options { Options }
        }
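
        // A hypothetical shape for `Options` once delimiters are decided on; the field
        // names below are assumptions for illustration, not part of the original commit:
        //
        //     pub struct Options {
        //         pub var_start: &'static str,   // e.g. "{{"
        //         pub var_end: &'static str,     // e.g. "}}"
        //         pub block_start: &'static str, // e.g. "{%"
        //         pub block_end: &'static str,   // e.g. "%}"
        //     }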

        // I will be referring to 't as the template lifetime, and to 'i as the iteration lifetime.
        // This lexer should be reusable between `tokenize` calls.
        // In addition to this I had a `LexingEnvironment`, but it turned out to be redundant.
        pub struct Lexer;

        impl Lexer {
            // It is the responsibility of someone else to take operators from extensions,
            // resolve any conflicts and compile the final "operators" list.
            //
            // It looks like the Lexer does not care whether they are unary or binary; that will
            // become important in the parser.
            //
            // Funny note: I found that "=" is considered neither unary nor binary ;)
            pub fn new(options: Options, operators: Vec<&'static str>) -> Lexer {
                // Here we will create the patterns (I called them matchers), and
                // store them in the Lexer.
                Lexer
            }
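
            // For illustration only (not part of the original commit): a caller that has
            // already merged operators from extensions might construct the lexer like
            //
            //     let lexer = Lexer::new(Options::default(), vec!["not", "+", "-", "=="]);
            //
            // The operator strings above are made-up examples.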

            // twig-rust: https://github.com/colin-kiegel/twig-rust/blob/master/src/lexer/mod.rs#L64
            // twig-rs: https://github.com/Nercury/twig-rs/blob/master/src/tokens/lexer/mod.rs#L40
            //
            // I think it is possible to avoid the Template::Raw in the lexer API.
            // We can probably deal with newlines in the patterns?
            // Also, maybe we won't need to fix line endings, but right now we both do that.
            //
            // The twig-rs result was "Iter"; the twig-rust one was "Job" :)
            //
            // I changed it to a concrete "Tokens" for now, which will implement Iterator.
            // No Result. Let's avoid lexing until the Parser requests the first token.
            pub fn tokens<'i, 't>(&'i self, code: &'t str) -> Tokens<'i, 't> {
                // Just take the whole lexer by reference ;)
                Tokens::new(self, code)
            }
        }

        // 'i is the iteration lifetime, or "one use of the lexer".
        // 't is the template lifetime. It will live longer than this iteration.
        pub struct Tokens<'i, 't> {
            env: &'i Lexer,
            code: &'t str,
        }

        impl<'i, 't> Tokens<'i, 't> {

            pub fn new<'ii, 'tt>(lexer: &'ii Lexer, code: &'tt str) -> Tokens<'ii, 'tt> {
                Tokens {
                    env: lexer,
                    code: code,
                }
            }
        }

        // I think we can avoid storing all tokens in a Vec, and instead keep only the next
        // chunk of lexed tokens in memory.
        impl<'i, 't> Iterator for Tokens<'i, 't> {
            // TODO: Use the proper Result once we merge error handling.
            type Item = Result<ItemRef<'t>, ()>;

            fn next(&mut self) -> Option<Result<ItemRef<'t>, ()>> {

                return None;
            }
        }
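
        // A minimal sketch (an assumption, not part of the original commit; the method name
        // `next_text_chunk` is made up) of the lazy behaviour `next` could have before real
        // matching lands: emit whatever is left of `code` as a single Text token, then stop.
        impl<'i, 't> Tokens<'i, 't> {
            fn next_text_chunk(&mut self) -> Option<Result<ItemRef<'t>, ()>> {
                if self.code.is_empty() {
                    return None;
                }
                // Hand out the remaining slice and leave nothing for the next call.
                let text = self.code;
                self.code = "";
                Some(Ok(ItemRef {
                    token: TokenRef::Text(text),
                    position: Position::default(),
                }))
            }
        }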

    }

    pub use self::lexing::{
        Lexer,
        Tokens,
        ItemRef as LexerItemRef,
        Options as LexerOptions,
    };
}