From 3e0cca29674301c8fb156d023a8e4e8d21f2f652 Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Sat, 13 Jul 2024 16:13:15 -0400 Subject: [PATCH] feat(yaml): lexer that can lex very simple examples --- Cargo.lock | 3 + crates/biome_yaml_parser/Cargo.toml | 3 + crates/biome_yaml_parser/src/lexer/mod.rs | 286 +++++++++++++++++++- crates/biome_yaml_parser/src/lexer/tests.rs | 233 ++++++++++++++++ 4 files changed, 511 insertions(+), 14 deletions(-) create mode 100644 crates/biome_yaml_parser/src/lexer/tests.rs diff --git a/Cargo.lock b/Cargo.lock index 14ae537309cb..437e9fd67b3d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1117,7 +1117,10 @@ dependencies = [ "biome_parser", "biome_rowan", "biome_unicode_table", + "biome_yaml_factory", "biome_yaml_syntax", + "quickcheck", + "quickcheck_macros", "tracing", ] diff --git a/crates/biome_yaml_parser/Cargo.toml b/crates/biome_yaml_parser/Cargo.toml index 94f5f3ed3eca..8e9c27bf9ca2 100644 --- a/crates/biome_yaml_parser/Cargo.toml +++ b/crates/biome_yaml_parser/Cargo.toml @@ -19,8 +19,11 @@ biome_diagnostics = { workspace = true } biome_parser = { workspace = true } biome_rowan = { workspace = true } biome_unicode_table = { workspace = true } +biome_yaml_factory = { workspace = true } biome_yaml_syntax = { workspace = true } tracing = { workspace = true } [dev-dependencies] +quickcheck = { workspace = true } +quickcheck_macros = { workspace = true } diff --git a/crates/biome_yaml_parser/src/lexer/mod.rs b/crates/biome_yaml_parser/src/lexer/mod.rs index 02d7d8cbef6b..6cea8c725fad 100644 --- a/crates/biome_yaml_parser/src/lexer/mod.rs +++ b/crates/biome_yaml_parser/src/lexer/mod.rs @@ -1,10 +1,31 @@ +use std::iter::FusedIterator; + use biome_parser::{ diagnostic::ParseDiagnostic, lexer::{LexContext, Lexer, TokenFlags}, }; use biome_rowan::{TextRange, TextSize}; -use biome_yaml_syntax::YamlSyntaxKind::*; -pub use biome_yaml_syntax::*; +use biome_yaml_syntax::YamlSyntaxKind; + +#[rustfmt::skip] +mod tests; + +pub struct Token { + 
kind: YamlSyntaxKind, + range: TextRange, +} + +impl Token { + #[allow(dead_code)] + pub fn kind(&self) -> YamlSyntaxKind { + self.kind + } + + #[allow(dead_code)] + pub fn range(&self) -> TextRange { + self.range + } +} pub(crate) struct YamlLexer<'src> { /// Source text @@ -27,20 +48,223 @@ pub(crate) struct YamlLexer<'src> { /// The kind of the current token current_kind: YamlSyntaxKind, - /// Byte offset of the current token from the start of the source - /// The range of the current token can be computed by - /// `self.position - self.current_start`. - /// Flags for the current token current_flags: TokenFlags, /// diagnostics emitted during the parsing phase diagnostics: Vec<ParseDiagnostic>, + + context: YamlLexContext, +} + +impl<'source> YamlLexer<'source> { + /// Creates a new lexer from the given string + #[allow(dead_code)] + pub fn from_str(source: &'source str) -> Self { + Self { + source, + position: 0, + after_newline: false, + unicode_bom_length: 0, + current_start: TextSize::from(0), + current_kind: YamlSyntaxKind::EOF, + current_flags: TokenFlags::empty(), + diagnostics: vec![], + context: YamlLexContext::Regular, + } + } + + fn current_char(&self) -> Option<u8> { + self.source.as_bytes().get(self.position).copied() + } + + fn peek_next_char(&self) -> Option<u8> { + self.source.as_bytes().get(self.position + 1).copied() + } + + /// Consumes and returns the next token, if any + fn consume_token(&mut self) -> Option<Token> { + let start = self.text_position(); + let char = self.current_char()?; + let kind = self.consume_token_in_context(char, self.context); + self.current_kind = kind; + let end = self.text_position(); + Some(Token { + kind, + range: TextRange::new(start, end), + }) + } + + /// Consume a byte in the given context + fn consume_token_in_context(&mut self, current: u8, context: YamlLexContext) -> YamlSyntaxKind { + if self.position >= self.source.len() { + return YamlSyntaxKind::EOF; + } + + let start = self.text_position(); + + let kind = match current { + b'#' => 
self.consume_comment(), + b'-' => { + if self.peek_next_char() == Some(b' ') { + self.context = YamlLexContext::AfterArray; + self.advance_char_unchecked(); + YamlSyntaxKind::DASH + } else { + self.consume_identifer_or_value() + } + } + b':' => { + if self.peek_next_char() == Some(b' ') { + self.context = YamlLexContext::AfterIdent; + self.advance_char_unchecked(); + YamlSyntaxKind::COLON + } else { + self.consume_identifer_or_value() + } + } + b'\'' | b'"' => self.consume_string_literal(current), + b' ' => self.consume_newline_or_whitespaces(), + b'\n' => self.consume_newline_or_whitespaces(), + b'[' => self.consume_array_inline_start(), + b',' => { + if self.context == YamlLexContext::AfterInlineArray { + self.advance_char_unchecked(); + YamlSyntaxKind::COMMA + } else { + self.consume_identifer_or_value() + } + } + b']' => { + if self.context == YamlLexContext::AfterInlineArray { + self.consume_array_inline_end() + } else { + self.consume_identifer_or_value() + } + } + _ => match context { + YamlLexContext::Regular => self.consume_identifer_or_value(), + YamlLexContext::AfterArray => self.consume_identifer_or_value(), + YamlLexContext::AfterIdent => self.consume_value(), + YamlLexContext::AfterInlineArray => self.consume_value(), + }, + }; + + debug_assert!(self.text_position() > start, "Lexer did not advance"); + kind + } + + fn consume_comment(&mut self) -> YamlSyntaxKind { + self.assert_byte(b'#'); + self.consume_until_newline(); + YamlSyntaxKind::COMMENT + } + + fn consume_until_newline(&mut self) { + while let Some(c) = self.current_char() { + if c == b'\n' { + break; + } + self.advance_char_unchecked(); + } + self.context = YamlLexContext::Regular; + } + + fn consume_string_literal(&mut self, quote: u8) -> YamlSyntaxKind { + self.assert_current_char_boundary(); + self.assert_byte(quote); + self.advance_char_unchecked(); + + let mut escape = false; + loop { + match self.current_char() { + Some(b'\\') => { + escape = true; + self.advance_char_unchecked(); + } 
+ Some(c) if c == quote && !escape => { + self.advance_char_unchecked(); + break; + } + Some(_) => { + escape = false; + self.advance_char_unchecked(); + } + None => { + break; + } + } + } + YamlSyntaxKind::YAML_STRING_VALUE + } + + /// Consume a line up to the colon or the end of the line + fn consume_identifer_or_value(&mut self) -> YamlSyntaxKind { + let start = self.position; + let mut is_ident = false; + while let Some(c) = self.current_char() { + if c == b'\n' { + break; + } + if c == b':' && self.peek_next_char() == Some(b' ') { + is_ident = true; + break; + } + self.advance_char_unchecked(); + } + if is_ident { + YamlSyntaxKind::YAML_IDENTIFIER + } else { + let value = &self.source[start..self.position]; + interpret_value(value) + } + } + + fn consume_value(&mut self) -> YamlSyntaxKind { + let start = self.position; + while let Some(c) = self.current_char() { + if c == b'\n' { + break; + } + if self.context == YamlLexContext::AfterInlineArray && (c == b',' || c == b']') { + break; + } + self.advance_char_unchecked(); + } + let value = &self.source[start..self.position]; + interpret_value(value) + } + + fn consume_array_inline_start(&mut self) -> YamlSyntaxKind { + self.assert_byte(b'['); + self.advance_char_unchecked(); + self.context = YamlLexContext::AfterInlineArray; + YamlSyntaxKind::L_BRACK + } + + fn consume_array_inline_end(&mut self) -> YamlSyntaxKind { + self.assert_byte(b']'); + self.advance_char_unchecked(); + self.context = YamlLexContext::Regular; + YamlSyntaxKind::R_BRACK + } +} + +fn interpret_value(value: &str) -> YamlSyntaxKind { + match value { + "true" | "false" => YamlSyntaxKind::YAML_BOOLEAN_VALUE, + "null" => YamlSyntaxKind::YAML_NULL_VALUE, + _ => value + .parse::<f64>() + .map_or(YamlSyntaxKind::YAML_STRING_VALUE, |_| { + YamlSyntaxKind::YAML_NUMBER_VALUE + }), + } } impl<'src> Lexer<'src> for YamlLexer<'src> { - const NEWLINE: Self::Kind = NEWLINE; - const WHITESPACE: Self::Kind = WHITESPACE; + const NEWLINE: Self::Kind = 
YamlSyntaxKind::NEWLINE; + const WHITESPACE: Self::Kind = YamlSyntaxKind::WHITESPACE; type Kind = YamlSyntaxKind; type LexContext = YamlLexContext; @@ -61,7 +285,7 @@ impl<'src> Lexer<'src> for YamlLexer<'src> { #[inline] fn advance_char_unchecked(&mut self) { let c = self.current_char_unchecked(); - self.position += c.len_utf8(); + self.advance(c.len_utf8()); } #[inline] @@ -69,8 +293,10 @@ self.current_start } - fn next_token(&mut self, _context: Self::LexContext) -> Self::Kind { - todo!() + fn next_token(&mut self, context: Self::LexContext) -> Self::Kind { + self.current_start = TextSize::from(self.position as u32); + self.current_flags = TokenFlags::empty(); + self.consume_token_in_context(self.current_char().unwrap_or(b'\0'), context) } fn has_preceding_line_break(&self) -> bool { @@ -112,19 +338,35 @@ fn consume_newline_or_whitespaces(&mut self) -> YamlSyntaxKind { if self.consume_newline() { self.after_newline = true; - NEWLINE + YamlSyntaxKind::NEWLINE } else { self.consume_whitespaces(); - WHITESPACE + YamlSyntaxKind::WHITESPACE } } } +impl Iterator for YamlLexer<'_> { + type Item = Token; + + fn next(&mut self) -> Option<Self::Item> { + self.consume_token() + } +} + +impl FusedIterator for YamlLexer<'_> {} + /// Context in which the lexer should lex the next token #[derive(Debug, Copy, Clone, Eq, PartialEq, Default)] pub enum YamlLexContext { #[default] Regular, + /// The lexer has just lexed an identifier and is expecting a value to come next. + AfterIdent, + /// The lexer has just lexed a dash and is expecting a value to come next. + AfterArray, + /// The lexer has lexed an inline array and is expecting a value or the end of the array to come next. + AfterInlineArray, } impl LexContext for YamlLexContext { @@ -134,6 +376,22 @@ } } -/// Context in which the [YamlLexContext]'s current yoken should be re-lexed. 
+/// Context in which the [YamlLexContext]'s current token should be re-lexed. #[derive(Debug, Copy, Clone, Eq, PartialEq)] pub enum YamlReLexContext {} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_interpret_value() { + assert_eq!(interpret_value("true"), YamlSyntaxKind::YAML_BOOLEAN_VALUE); + assert_eq!(interpret_value("false"), YamlSyntaxKind::YAML_BOOLEAN_VALUE); + assert_eq!(interpret_value("null"), YamlSyntaxKind::YAML_NULL_VALUE); + assert_eq!(interpret_value("foo"), YamlSyntaxKind::YAML_STRING_VALUE); + assert_eq!(interpret_value("1"), YamlSyntaxKind::YAML_NUMBER_VALUE); + assert_eq!(interpret_value("1.0"), YamlSyntaxKind::YAML_NUMBER_VALUE); + assert_eq!(interpret_value("1.0.0"), YamlSyntaxKind::YAML_STRING_VALUE); + } +} diff --git a/crates/biome_yaml_parser/src/lexer/tests.rs b/crates/biome_yaml_parser/src/lexer/tests.rs new file mode 100644 index 000000000000..e389a7193148 --- /dev/null +++ b/crates/biome_yaml_parser/src/lexer/tests.rs @@ -0,0 +1,233 @@ +#![cfg(test)] +#![allow(unused_mut, unused_variables, unused_assignments)] + +use crate::lexer::YamlLexer; + +use super::TextSize; +use quickcheck_macros::quickcheck; +use std::sync::mpsc::channel; +use std::thread; +use std::time::Duration; + +// Assert the result of lexing a piece of source code, +// and make sure the tokens yielded are fully lossless and the source can be reconstructed from only the tokens +macro_rules! 
assert_lex { + ($src:expr, $($kind:ident:$len:expr $(,)?)*) => {{ + let mut lexer = YamlLexer::from_str($src); + let mut idx = 0; + let mut tok_idx = TextSize::default(); + + let mut new_str = String::with_capacity($src.len()); + let tokens: Vec<_> = lexer.collect(); + + $( + assert_eq!( + tokens[idx].kind, + biome_yaml_syntax::YamlSyntaxKind::$kind, + "expected token kind {}, but found {:?}", + stringify!($kind), + tokens[idx].kind, + ); + + assert_eq!( + tokens[idx].range.len(), + TextSize::from($len), + "expected token length of {}, but found {:?} for token {:?}", + $len, + tokens[idx].range.len(), + tokens[idx].kind, + ); + + new_str.push_str(&$src[tokens[idx].range]); + tok_idx += tokens[idx].range.len(); + + idx += 1; + )* + + if idx < tokens.len() { + panic!( + "expected {} tokens but lexer returned {}, first unexpected token is '{:?}'", + idx, + tokens.len(), + tokens[idx].kind + ); + } else { + assert_eq!(idx, tokens.len()); + } + + assert_eq!($src, new_str, "Failed to reconstruct input"); + }}; +} + +// This is for testing if the lexer is truly lossless +// It parses random strings and puts them back together with the produced tokens and compares +#[quickcheck] +fn losslessness(string: String) -> bool { + // using an mpsc channel allows us to spawn a thread and spawn the lexer there, then if + // it takes more than 2 seconds we panic because it is 100% infinite recursion + let cloned = string.clone(); + let (sender, receiver) = channel(); + thread::spawn(move || { + let mut lexer = YamlLexer::from_str(&cloned); + let tokens: Vec<_> = lexer.map(|token| token.range).collect(); + + sender + .send(tokens) + .expect("Could not send tokens to receiver"); + }); + let token_ranges = receiver + .recv_timeout(Duration::from_secs(2)) + .unwrap_or_else(|_| panic!("Lexer is infinitely recursing with this code: ->{string}<-")); + + let mut new_str = String::with_capacity(string.len()); + let mut idx = TextSize::from(0); + + for range in token_ranges { + 
new_str.push_str(&string[range]); + idx += range.len(); + } + + string == new_str +} + +#[test] +fn lex_booleans() { + assert_lex!( + "true", + YAML_BOOLEAN_VALUE:4, + ); + + assert_lex!( + "false", + YAML_BOOLEAN_VALUE:5, + ); +} + +#[test] +fn lex_null() { + assert_lex!( + "null", + YAML_NULL_VALUE:4, + ); +} + +#[test] +fn lex_float() { + assert_lex!( + "123.456", + YAML_NUMBER_VALUE:7, + ); +} + +#[test] +fn lex_invalid_float_as_string() { + assert_lex!( + "123.456.789", + YAML_STRING_VALUE:11, + ); +} + +#[test] +fn lex_quoted_string() { + assert_lex!( + "\"hello world\"", + YAML_STRING_VALUE:13, + ); +} + +#[test] +fn lex_key_value_pair() { + assert_lex!( + "key: value", + YAML_IDENTIFIER:3, + COLON:1, + WHITESPACE:1, + YAML_STRING_VALUE:5, + ); +} + +#[test] +fn lex_invalid_key_value_pair() { + assert_lex!( + "key:value", + YAML_STRING_VALUE:9, + ); +} + +#[test] +fn lex_kinda_invalid_key_value_pair() { + assert_lex!( + "foo:bar: baz", + YAML_IDENTIFIER:7, + COLON:1, + WHITESPACE:1, + YAML_STRING_VALUE:3, + ); +} + +#[test] +fn lex_comment() { + assert_lex!( + "# this is a comment", + COMMENT:19, + ); +} + +#[test] +fn lex_list() { + assert_lex!( + "- foo", + DASH:1, + WHITESPACE:1, + YAML_STRING_VALUE:3, + ); +} + +#[test] +fn lex_list_invalid() { + assert_lex!( + "-foo", + YAML_STRING_VALUE:4, + ); +} + +#[test] +fn lex_nested_list() { + assert_lex!( + "- - bar", + DASH:1, + WHITESPACE:1, + DASH:1, + WHITESPACE:1, + YAML_STRING_VALUE:3, + ); +} + +#[test] +fn lex_array_inline() { + assert_lex!( + "[1]", + L_BRACK:1, + YAML_NUMBER_VALUE:1, + R_BRACK:1, + ); +} +#[test] +fn lex_array_inline_2() { + assert_lex!( + "[1,2]", + L_BRACK:1, + YAML_NUMBER_VALUE:1, + COMMA:1, + YAML_NUMBER_VALUE:1, + R_BRACK:1, + ); +} + +#[test] +fn lex_array_inline_invalid() { + assert_lex!( + "1]", + YAML_STRING_VALUE:2, + ); +}