From af4885addb315b7e18aab57e8ad49e1ebeea72a3 Mon Sep 17 00:00:00 2001
From: Matthew Healy
Date: Tue, 13 Dec 2022 14:36:42 +0100
Subject: [PATCH 1/3] Decouple string start & end delimiters in parser logic

Since both multiline and symbolic strings will end with `"%`, it will no
longer be true that the start & end delimiters of a string must match.
It will also no longer be the case that we can infer whether or not
indentation must be stripped from the starting delimiter.

This commit separates the opening and closing delimiters into two
different types, and provides a more explicit API for working with them,
which will be resilient to the addition of a new `StringStartDelimiter`
variant in a later commit.
---
 src/parser/grammar.lalrpop | 22 ++++++++++++----------
 src/parser/utils.rs        | 34 +++++++++++++++++++++++++++++++---
 2 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/src/parser/grammar.lalrpop b/src/parser/grammar.lalrpop
index 7fcf877ed5..4078bfbe13 100644
--- a/src/parser/grammar.lalrpop
+++ b/src/parser/grammar.lalrpop
@@ -477,7 +477,10 @@ Bool: bool = {
 // Strings that support interpolation.
 StrChunks: RichTerm = {
      => {
-        debug_assert_eq!(start, end);
+        debug_assert!(
+            start.is_closed_by(&end),
+            "Fatal parser error: a string starting with {start:?} should never be closed by {end:?}"
+        );

         let chunks: Vec<StrChunk<RichTerm>> = fst.into_iter()
             .map(StrChunk::Literal)
@@ -490,10 +493,9 @@ StrChunks: RichTerm = {
             .chain(lasts.into_iter())
             .collect();

-        let mut chunks = if start == StringKind::Multiline {
+        let mut chunks = if start.needs_strip_indent() {
             strip_indent(chunks)
-        }
-        else {
+        } else {
             chunks
         };
         chunks.reverse();
@@ -502,14 +504,14 @@ StrChunks: RichTerm = {
     },
 };

-StringStart : StringKind = {
-    "\"" => StringKind::Standard,
-    "m%\"" => StringKind::Multiline,
+StringStart : StringStartDelimiter = {
+    "\"" => StringStartDelimiter::Standard,
+    "m%\"" => StringStartDelimiter::Multiline,
 };

-StringEnd : StringKind = {
-    "\"" => StringKind::Standard,
-    "\"%" => StringKind::Multiline,
+StringEnd : StringEndDelimiter = {
+    "\"" => StringEndDelimiter::Standard,
+    "\"%" => StringEndDelimiter::Multiline,
 };

 ChunkLiteral : String =
diff --git a/src/parser/utils.rs b/src/parser/utils.rs
index 46fc628f9e..92f2dccf21 100644
--- a/src/parser/utils.rs
+++ b/src/parser/utils.rs
@@ -23,10 +23,38 @@ use crate::{
     types::{TypeF, Types},
 };

-/// Distinguish between the standard string separators `"`/`"` and the multi-line string separators
-/// `m%"`/`"%` in the parser.
+/// Distinguish between the standard string opening delimiter `"` and the multi-line string
+/// opening delimiter `m%"`.
 #[derive(Copy, Clone, Eq, PartialEq, Debug)]
-pub enum StringKind {
+pub enum StringStartDelimiter {
+    Standard,
+    Multiline,
+}
+
+impl StringStartDelimiter {
+    pub fn is_closed_by(&self, close: &StringEndDelimiter) -> bool {
+        matches!(
+            (self, close),
+            (StringStartDelimiter::Standard, StringEndDelimiter::Standard)
+                | (
+                    StringStartDelimiter::Multiline,
+                    StringEndDelimiter::Multiline
+                )
+        )
+    }
+
+    pub fn needs_strip_indent(&self) -> bool {
+        match self {
+            StringStartDelimiter::Standard => false,
+            StringStartDelimiter::Multiline => true,
+        }
+    }
+}
+
+/// Distinguish between the standard string closing delimiter `"` and the multi-line string
+/// closing delimiter `"%`.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub enum StringEndDelimiter {
     Standard,
     Multiline,
 }

From da165dbdf6d11c48d4a206f65ba9000414e858a9 Mon Sep 17 00:00:00 2001
From: Matthew Healy
Date: Wed, 14 Dec 2022 10:59:42 +0100
Subject: [PATCH 2/3] Desugar symbolic strings to arrays

This commit implements the symbolic string syntax `s%"..."%`. These
strings use the same parsing rules as multiline strings, but are then
desugared to `Term::Array`s rather than `Term::StrChunks`.
---
 src/parser/grammar.lalrpop                  |  21 ++-
 src/parser/lexer.rs                         |   7 +-
 src/parser/tests.rs                         | 157 ++++++++++++--------
 src/parser/utils.rs                         |  11 +-
 tests/integration/pass/symbolic-strings.ncl |  44 ++++++
 5 files changed, 169 insertions(+), 71 deletions(-)
 create mode 100644 tests/integration/pass/symbolic-strings.ncl

diff --git a/src/parser/grammar.lalrpop b/src/parser/grammar.lalrpop
index 4078bfbe13..9ee7d09640 100644
--- a/src/parser/grammar.lalrpop
+++ b/src/parser/grammar.lalrpop
@@ -474,7 +474,9 @@ Bool: bool = {
     "false" => false,
 };

-// Strings that support interpolation.
+// String-like syntax which supports interpolation.
+// Depending on the opening delimiter, these either parse as strings, or as "symbolic strings",
+// which get desugared here to an array of terms.
 StrChunks: RichTerm = {
      => {
         debug_assert!(
@@ -493,20 +495,30 @@ StrChunks: RichTerm = {
             .chain(lasts.into_iter())
             .collect();

-        let mut chunks = if start.needs_strip_indent() {
+        let chunks = if start.needs_strip_indent() {
             strip_indent(chunks)
         } else {
             chunks
         };
-        chunks.reverse();

-        RichTerm::from(Term::StrChunks(chunks))
+        if start == StringStartDelimiter::Symbolic {
+            let terms = chunks.into_iter().map(|chunk| match chunk {
+                StrChunk::Literal(l) => Term::Str(l).into(),
+                StrChunk::Expr(e, _) => e,
+            }).collect();
+            RichTerm::from(Term::Array(terms, Default::default()))
+        } else {
+            let mut chunks = chunks;
+            chunks.reverse();
+            RichTerm::from(Term::StrChunks(chunks))
+        }
     },
 };

 StringStart : StringStartDelimiter = {
     "\"" => StringStartDelimiter::Standard,
     "m%\"" => StringStartDelimiter::Multiline,
+    "s%\"" => StringStartDelimiter::Symbolic,
 };

 StringEnd : StringEndDelimiter = {
@@ -886,6 +898,7 @@
         "\"" => Token::Normal(NormalToken::DoubleQuote),
         "\"%" => Token::MultiStr(MultiStringToken::End),
         "m%\"" => Token::Normal(NormalToken::MultiStringStart(<usize>)),
+        "s%\"" => Token::Normal(NormalToken::SymbolicStringStart(<usize>)),

         "Num" => Token::Normal(NormalToken::Num),
         "Dyn" => Token::Normal(NormalToken::Dyn),
diff --git a/src/parser/lexer.rs b/src/parser/lexer.rs
index e0d021c49e..32fb85e82d 100644
--- a/src/parser/lexer.rs
+++ b/src/parser/lexer.rs
@@ -160,6 +160,8 @@ pub enum NormalToken<'input> {
     Underscore,
     #[regex("m(%+)\"", |lex| lex.slice().len())]
     MultiStringStart(usize),
+    #[regex("s(%+)\"", |lex| lex.slice().len())]
+    SymbolicStringStart(usize),

     #[token("%tag%")]
     Tag,
@@ -605,7 +607,10 @@ impl<'input> Iterator for Lexer<'input> {
             Some(Normal(NormalToken::DoubleQuote | NormalToken::StrEnumTagBegin)) => {
                 self.enter_str()
             }
-            Some(Normal(NormalToken::MultiStringStart(delim_size))) => {
+            Some(Normal(
+                NormalToken::MultiStringStart(delim_size)
+                | NormalToken::SymbolicStringStart(delim_size),
+            )) => {
                 // for interpolation & closing delimeters we only care about
                 // the number of `%`s (plus the opening `"` or `{`) so we
                 // drop the "kind marker" size here (i.e. the `m` character).
diff --git a/src/parser/tests.rs b/src/parser/tests.rs index 4590e330c2..f43acbc822 100644 --- a/src/parser/tests.rs +++ b/src/parser/tests.rs @@ -287,69 +287,100 @@ fn invalid_record_types() { #[test] fn string_lexing() { - assert_eq!( - lex_without_pos("\"Good\" \"strings\""), - Ok(vec![ - Token::Normal(NormalToken::DoubleQuote), - Token::Str(StringToken::Literal("Good")), - Token::Normal(NormalToken::DoubleQuote), - Token::Normal(NormalToken::DoubleQuote), - Token::Str(StringToken::Literal("strings")), - Token::Normal(NormalToken::DoubleQuote), - ]) - ); - - assert_eq!( - lex_without_pos("\"Good\\nEscape\\t\\\"\""), - Ok(vec![ - Token::Normal(NormalToken::DoubleQuote), - Token::Str(StringToken::Literal("Good")), - Token::Str(StringToken::EscapedChar('\n')), - Token::Str(StringToken::Literal("Escape")), - Token::Str(StringToken::EscapedChar('\t')), - Token::Str(StringToken::EscapedChar('\"')), - Token::Normal(NormalToken::DoubleQuote), - ]) - ); - - assert_eq!( - lex_without_pos("\"1 + %{ 1 } + 2\""), - Ok(vec![ - Token::Normal(NormalToken::DoubleQuote), - Token::Str(StringToken::Literal("1 + ")), - Token::Str(StringToken::Interpolation), - Token::Normal(NormalToken::NumLiteral(1.0)), - Token::Normal(NormalToken::RBrace), - Token::Str(StringToken::Literal(" + 2")), - Token::Normal(NormalToken::DoubleQuote), - ]) - ); - - assert_eq!( - lex_without_pos("\"1 + %{ \"%{ 1 }\" } + 2\""), - Ok(vec![ - Token::Normal(NormalToken::DoubleQuote), - Token::Str(StringToken::Literal("1 + ")), - Token::Str(StringToken::Interpolation), - Token::Normal(NormalToken::DoubleQuote), - Token::Str(StringToken::Interpolation), - Token::Normal(NormalToken::NumLiteral(1.0)), - Token::Normal(NormalToken::RBrace), - Token::Normal(NormalToken::DoubleQuote), - Token::Normal(NormalToken::RBrace), - Token::Str(StringToken::Literal(" + 2")), - Token::Normal(NormalToken::DoubleQuote), - ]) - ); - - assert_eq!( - lex_without_pos(r#"m%%""%"%%"#), - Ok(vec![ - Token::Normal(NormalToken::MultiStringStart(4)), - Token::MultiStr(MultiStringToken::Literal("\"%")), - Token::MultiStr(MultiStringToken::End), - ]) - ); + for (name, input, expected) in [ + ( + "simple strings", + r#""Good" "strings""#, + vec![ + Token::Normal(NormalToken::DoubleQuote), + Token::Str(StringToken::Literal("Good")), + Token::Normal(NormalToken::DoubleQuote), + Token::Normal(NormalToken::DoubleQuote), + Token::Str(StringToken::Literal("strings")), + Token::Normal(NormalToken::DoubleQuote), + ], + ), + ( + "valid escape sequence", + r#""Good\nEscape\t\"""#, + vec![ + Token::Normal(NormalToken::DoubleQuote), + Token::Str(StringToken::Literal("Good")), + Token::Str(StringToken::EscapedChar('\n')), + Token::Str(StringToken::Literal("Escape")), + Token::Str(StringToken::EscapedChar('\t')), + Token::Str(StringToken::EscapedChar('\"')), + Token::Normal(NormalToken::DoubleQuote), + ], + ), + ( + "simple interpolation", + r#""1 + %{ 1 } + 2""#, + vec![ + Token::Normal(NormalToken::DoubleQuote), + Token::Str(StringToken::Literal("1 + ")), + Token::Str(StringToken::Interpolation), + Token::Normal(NormalToken::NumLiteral(1.0)), + Token::Normal(NormalToken::RBrace), + Token::Str(StringToken::Literal(" + 2")), + Token::Normal(NormalToken::DoubleQuote), + ], + ), + ( + "nested interpolated strings", + r#""1 + %{ "%{ 1 }" } + 2""#, + vec![ + Token::Normal(NormalToken::DoubleQuote), + Token::Str(StringToken::Literal("1 + ")), + Token::Str(StringToken::Interpolation), + Token::Normal(NormalToken::DoubleQuote), + Token::Str(StringToken::Interpolation), + 
Token::Normal(NormalToken::NumLiteral(1.0)),
+                Token::Normal(NormalToken::RBrace),
+                Token::Normal(NormalToken::DoubleQuote),
+                Token::Normal(NormalToken::RBrace),
+                Token::Str(StringToken::Literal(" + 2")),
+                Token::Normal(NormalToken::DoubleQuote),
+            ],
+        ),
+        (
+            "multiline strings only close on delimiter with correct number of %s",
+            r#"m%%""%"%%"#,
+            vec![
+                Token::Normal(NormalToken::MultiStringStart(4)),
+                Token::MultiStr(MultiStringToken::Literal("\"%")),
+                Token::MultiStr(MultiStringToken::End),
+            ],
+        ),
+        (
+            "empty symbolic string lexes like multi-line str",
+            r#"s%""%"#,
+            vec![
+                Token::Normal(NormalToken::SymbolicStringStart(3)),
+                Token::MultiStr(MultiStringToken::End),
+            ],
+        ),
+        (
+            "symbolic string with interpolation",
+            r#"s%"text %{ 1 } etc."%"#,
+            vec![
+                Token::Normal(NormalToken::SymbolicStringStart(3)),
+                Token::MultiStr(MultiStringToken::Literal("text ")),
+                Token::MultiStr(MultiStringToken::Interpolation),
+                Token::Normal(NormalToken::NumLiteral(1.0)),
+                Token::Normal(NormalToken::RBrace),
+                Token::MultiStr(MultiStringToken::Literal(" etc.")),
+                Token::MultiStr(MultiStringToken::End),
+            ],
+        ),
+    ] {
+        assert_eq!(
+            lex_without_pos(input),
+            Ok(expected),
+            "Case failed: {}",
+            name
+        )
+    }
 }

 #[test]
diff --git a/src/parser/utils.rs b/src/parser/utils.rs
index 92f2dccf21..9ebe17af8b 100644
--- a/src/parser/utils.rs
+++ b/src/parser/utils.rs
@@ -23,12 +23,13 @@ use crate::{
     types::{TypeF, Types},
 };

-/// Distinguish between the standard string opening delimiter `"` and the multi-line string
-/// opening delimiter `m%"`.
+/// Distinguish between the standard string opening delimiter `"`, the multi-line string
+/// opening delimiter `m%"`, and the symbolic string opening delimiter `s%"`.
 #[derive(Copy, Clone, Eq, PartialEq, Debug)]
 pub enum StringStartDelimiter {
     Standard,
     Multiline,
+    Symbolic,
 }

 impl StringStartDelimiter {
@@ -40,13 +41,17 @@ impl StringStartDelimiter {
                     StringStartDelimiter::Multiline,
                     StringEndDelimiter::Multiline
                 )
+                | (
+                    StringStartDelimiter::Symbolic,
+                    StringEndDelimiter::Multiline
+                )
         )
     }

     pub fn needs_strip_indent(&self) -> bool {
         match self {
             StringStartDelimiter::Standard => false,
-            StringStartDelimiter::Multiline => true,
+            StringStartDelimiter::Multiline | StringStartDelimiter::Symbolic => true,
         }
     }
 }
diff --git a/tests/integration/pass/symbolic-strings.ncl b/tests/integration/pass/symbolic-strings.ncl
new file mode 100644
index 0000000000..9e4d7e7ab6
--- /dev/null
+++ b/tests/integration/pass/symbolic-strings.ncl
@@ -0,0 +1,44 @@
+let {check, ..} = import "lib/assert.ncl" in
+
+[
+  # Static symbolic string
+  s%"hello, world"% == ["hello, world"],
+  # Interpolating a string
+  let s = "test" in
+  s%"This is a %{s}"% == ["This is a ", "test"],
+  # Interpolating an interpolated string
+  let f = "f" in
+  s%"abc %{"de%{f}"}"% == ["abc ", "def"],
+  # Interpolating a number
+  s%"num: %{100}"% == ["num: ", 100],
+  # Interpolating a bool
+  s%"bool: %{true}"% == ["bool: ", true],
+  # Interpolating an array
+  s%"array: %{[true, 1, "yes"]}"% == ["array: ", [true, 1, "yes"]],
+  # Interpolating a record
+  let r = { a = 1, b = false } in
+  s%"record: %{r}"% == ["record: ", r],
+  # Interpolating multiple values
+  let str = "some string" in
+  let num = 999.999 in
+  let bool = false in
+  let array = ["an", "array", 100] in
+  let record = { a = 1, simple = "yes", record = true } in
+  let actual = s%"
+    1. %{str}
+    2. %{num}
+    3. %{bool}
+    4. %{array}
+    5. %{record}"%
+  in
+  let expected = [
+    "1. ", str,
+    "\n2. ", num,
+    "\n3. ", bool,
+    "\n4. ", array,
", array, + "\n5. ", record + ] + in + actual == expected, +] +|> check From e0623469fa0067b59fcc837959f0ccf8ba7cc96a Mon Sep 17 00:00:00 2001 From: Matthew Healy Date: Fri, 16 Dec 2022 11:15:50 +0100 Subject: [PATCH 3/3] `StringEndDelimiter::Multiline` -> `StringEndDelimiter::Special` --- src/parser/grammar.lalrpop | 2 +- src/parser/utils.rs | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/parser/grammar.lalrpop b/src/parser/grammar.lalrpop index 9ee7d09640..0631056627 100644 --- a/src/parser/grammar.lalrpop +++ b/src/parser/grammar.lalrpop @@ -523,7 +523,7 @@ StringStart : StringStartDelimiter = { StringEnd : StringEndDelimiter = { "\"" => StringEndDelimiter::Standard, - "\"%" => StringEndDelimiter::Multiline, + "\"%" => StringEndDelimiter::Special, }; ChunkLiteral : String = diff --git a/src/parser/utils.rs b/src/parser/utils.rs index 9ebe17af8b..ddbdd6fe4b 100644 --- a/src/parser/utils.rs +++ b/src/parser/utils.rs @@ -37,14 +37,8 @@ impl StringStartDelimiter { matches!( (self, close), (StringStartDelimiter::Standard, StringEndDelimiter::Standard) - | ( - StringStartDelimiter::Multiline, - StringEndDelimiter::Multiline - ) - | ( - StringStartDelimiter::Symbolic, - StringEndDelimiter::Multiline - ) + | (StringStartDelimiter::Multiline, StringEndDelimiter::Special) + | (StringStartDelimiter::Symbolic, StringEndDelimiter::Special) ) } @@ -56,12 +50,12 @@ impl StringStartDelimiter { } } -/// Distinguish between the standard string closing delimter `"` and the multi-line string +/// Distinguish between the standard string closing delimter `"` and the "special" string /// closing delimeter `"%`. #[derive(Copy, Clone, Debug, Eq, PartialEq)] pub enum StringEndDelimiter { Standard, - Multiline, + Special, } /// Distinguish between a normal case `id => exp` and a default case `_ => exp`.