Cleanup a few parser edgecases (#7702)
aljazerzen authored Sep 4, 2024
1 parent 61dad7d commit 99782b9
Showing 11 changed files with 160 additions and 98 deletions.
4 changes: 2 additions & 2 deletions edb/edgeql-parser/edgeql-parser-python/src/normalize.rs
@@ -220,7 +220,7 @@ fn is_operator(token: &Token) -> bool {
| OpenBrace | CloseBrace | Dot | Semicolon | Colon | Add | Sub | Mul | Div | Modulo
| Pow | Less | Greater | Eq | Ampersand | Pipe | At => true,
DecimalConst | FloatConst | IntConst | BigIntConst | BinStr | Parameter
| ParameterAndType | Str | BacktickName | Keyword(_) | Ident | Substitution | EOF | EOI
| ParameterAndType | Str | BacktickName | Keyword(_) | Ident | Substitution | EOI
| Epsilon | StartBlock | StartExtension | StartFragment | StartMigration
| StartSDLDocument => false,
}
@@ -232,7 +232,7 @@ fn serialize_tokens(tokens: &[Token]) -> String {
let mut buf = String::new();
let mut needs_space = false;
for token in tokens {
if matches!(token.kind, Kind::EOF | Kind::EOI) {
if matches!(token.kind, Kind::EOI) {
break;
}

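As a rough illustration of the behaviour after this change — the serializer now stops at the single EOI sentinel instead of checking for both EOF and EOI — here is a minimal standalone sketch. The token types and spacing rule are simplified assumptions, not the crate's actual definitions:

```rust
// Hedged sketch: simplified token types, not the crate's real ones.
enum Kind {
    Ident,
    Comma,
    EOI,
}

struct Token {
    kind: Kind,
    text: String,
}

// Punctuation-like tokens don't need a separating space in front of them.
fn is_operator(token: &Token) -> bool {
    matches!(token.kind, Kind::Comma)
}

fn serialize_tokens(tokens: &[Token]) -> String {
    let mut buf = String::new();
    let mut needs_space = false;
    for token in tokens {
        // The single trailing EOI sentinel carries no text; stop before it.
        if matches!(token.kind, Kind::EOI) {
            break;
        }
        if needs_space && !is_operator(token) {
            buf.push(' ');
        }
        buf.push_str(&token.text);
        needs_space = !is_operator(token);
    }
    buf
}

fn main() {
    let tokens = vec![
        Token { kind: Kind::Ident, text: "select".into() },
        Token { kind: Kind::Ident, text: "User".into() },
        Token { kind: Kind::Comma, text: ",".into() },
        Token { kind: Kind::Ident, text: "Group".into() },
        Token { kind: Kind::EOI, text: "".into() },
    ];
    println!("{}", serialize_tokens(&tokens)); // select User,Group
}
```
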
8 changes: 1 addition & 7 deletions edb/edgeql-parser/edgeql-parser-python/src/parser.rs
@@ -24,7 +24,7 @@ pub fn parse(
let context = parser::Context::new(spec);
let (cst, errors) = parser::parse(&tokens, &context);

let cst = cst.map(|c| to_py_cst(c, py)).transpose()?;
let cst = cst.map(|c| to_py_cst(&c, py)).transpose()?;

let errors = errors
.into_iter()
@@ -86,12 +86,6 @@ fn downcast_tokens(
buf.push(parser::Terminal::from_token(token));
}

// adjust the span of the starting token for nicer error message spans
if buf.len() >= 2 {
buf[0].span.start = buf[1].span.start;
buf[0].span.end = buf[1].span.start;
}

Ok(buf)
}

113 changes: 71 additions & 42 deletions edb/edgeql-parser/src/parser.rs
@@ -28,7 +28,7 @@ impl<'s> Context<'s> {
/// when changing.
const UNEXPECTED: &str = "Unexpected";

pub fn parse<'a>(input: &'a [Terminal], ctx: &'a Context) -> (Option<&'a CSTNode<'a>>, Vec<Error>) {
pub fn parse<'a>(input: &'a [Terminal], ctx: &'a Context) -> (Option<CSTNode<'a>>, Vec<Error>) {
let stack_top = ctx.arena.alloc(StackNode {
parent: None,
state: 0,
@@ -43,7 +43,15 @@ pub fn parse<'a>(input: &'a [Terminal], ctx: &'a Context) -> (Option<&'a CSTNode
has_custom_error: false,
};

// append EOI
// Append EOI token.
// We have a weird setup that requires two EOI tokens:
// - one is consumed by the grammar generator and does not contribute to
// span of the nodes.
// - second is consumed by explicit EOF tokens in EdgeQLGrammar NonTerm.
// Since these are children of productions, they do contribute to the
// spans of the top-level nodes.
// First EOI is produced by tokenizer (with correct offset) and second one
// is injected here.
let end = input.last().map(|t| t.span.end).unwrap_or_default();
let eoi = ctx.alloc_terminal(Terminal {
kind: Kind::EOI,
@@ -52,7 +60,7 @@ pub fn parse<'a>(input: &'a [Terminal], ctx: &'a Context) -> (Option<&'a CSTNode
value: None,
is_placeholder: false,
});
let input = input.iter().chain(Some(eoi));
let input = input.iter().chain([eoi]);

let mut parsers = vec![initial_track];
let mut prev_span: Option<Span> = None;
@@ -127,7 +135,7 @@ pub fn parse<'a>(input: &'a [Terminal], ctx: &'a Context) -> (Option<&'a CSTNode
let mut skip = parser;
let error = Error::new(format!("{UNEXPECTED} {token}")).with_span(token.span);
skip.push_error(error, ERROR_COST_SKIP);
if token.kind == Kind::EOF || token.kind == Kind::Semicolon {
if token.kind == Kind::EOI || token.kind == Kind::Semicolon {
// extra penalty
skip.error_cost += ERROR_COST_INJECT_MAX;
skip.can_recover = false;
@@ -158,16 +166,16 @@ pub fn parse<'a>(input: &'a [Terminal], ctx: &'a Context) -> (Option<&'a CSTNode
new_parsers.push(recovered);
}
}
}

// prune: pick only 1 best parsers that has cost > ERROR_COST_INJECT_MAX
if new_parsers[0].error_cost > ERROR_COST_INJECT_MAX {
new_parsers.drain(1..);
}
// prune: pick only 1 best parsers that has cost > ERROR_COST_INJECT_MAX
if new_parsers[0].error_cost > ERROR_COST_INJECT_MAX {
new_parsers.drain(1..);
}

// prune: pick only X best parsers
if new_parsers.len() > PARSER_COUNT_MAX {
new_parsers.drain(PARSER_COUNT_MAX..);
// prune: pick only X best parsers
if new_parsers.len() > PARSER_COUNT_MAX {
new_parsers.drain(PARSER_COUNT_MAX..);
}
}

assert!(parsers.is_empty());
@@ -177,18 +185,30 @@ pub fn parse<'a>(input: &'a [Terminal], ctx: &'a Context) -> (Option<&'a CSTNode

// there will always be a parser left,
// since we always allow a token to be skipped
let mut parser = parsers.into_iter().min_by_key(|p| p.error_cost).unwrap();
parser.finish();
let parser = parsers
.into_iter()
.min_by(|a, b| {
Ord::cmp(&a.error_cost, &b.error_cost).then_with(|| {
Ord::cmp(
&starts_with_unexpected_error(a),
&starts_with_unexpected_error(b),
)
.reverse()
})
})
.unwrap();

let node = if parser.can_recover {
Some(&parser.stack_top.value)
} else {
None
};
let node = parser.finish(ctx);
let errors = custom_errors::post_process(parser.errors);
(node, errors)
}

fn starts_with_unexpected_error(a: &Parser) -> bool {
a.errors
.first()
.map_or(true, |x| x.message.starts_with(UNEXPECTED))
}

impl<'s> Context<'s> {
fn alloc_terminal(&self, t: Terminal) -> &'_ Terminal {
let idx = self.terminal_arena.push(t);
@@ -397,29 +417,37 @@ impl<'s> Parser<'s> {
self.stack_top = ctx.arena.alloc(node);
}

pub fn finish(&mut self) {
// XXX: This assert was failing. Should it be fixed or removed?
// debug_assert!(matches!(
// &self.stack_top.value,
// CSTNode::Terminal(Terminal {
// kind: Kind::EOI,
// ..
// })
// ));
self.stack_top = self.stack_top.parent.unwrap();
pub fn finish(&self, _ctx: &'s Context) -> Option<CSTNode<'s>> {
if !self.can_recover || self.has_custom_error {
return None;
}

// self.print_stack();
// pop the EOI from the top of the stack
assert!(
matches!(
&self.stack_top.value,
CSTNode::Terminal(Terminal {
kind: Kind::EOI,
..
})
),
"expected EOI CST node, got {:?}",
self.stack_top.value
);

let final_node = self.stack_top.parent.unwrap();

// self.print_stack(_ctx);
// println!(" --> accept");

#[cfg(debug_assertions)]
{
let first = self.stack_top.parent.unwrap();
assert!(
matches!(&first.value, CSTNode::Empty),
"expected 'Empty' found {:?}",
first.value
);
}
let first = final_node.parent.unwrap();
assert!(
matches!(&first.value, CSTNode::Empty),
"expected empty CST node, found {:?}",
first.value
);

Some(final_node.value)
}

#[cfg(never)]
@@ -588,7 +616,9 @@ fn injection_cost(kind: &Kind) -> u16 {
impl std::fmt::Display for Terminal {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if (self.is_placeholder && self.kind == Kind::Ident) || self.text.is_empty() {
return write!(f, "{}", self.kind.user_friendly_text().unwrap_or_default());
if let Some(user_friendly) = self.kind.user_friendly_text() {
return write!(f, "{}", user_friendly);
}
}

match self.kind {
@@ -695,8 +725,7 @@ fn get_token_kind(token_name: &str) -> Kind {
">" => Greater,

"IDENT" => Ident,
"EOF" => EOF,
"<$>" => EOI,
"EOI" | "<$>" => EOI,
"<e>" => Epsilon,

"BCONST" => BinStr,
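
The parser-selection change above is the substantive one: error cost still dominates, but among equal-cost candidates the parser whose first error is a plain "Unexpected ..." message now wins over one that tripped a custom error. Below is a self-contained sketch of that ordering, using simplified stand-in types rather than the crate's real `Parser`:

```rust
// Simplified stand-ins, not the types from this crate.
const UNEXPECTED: &str = "Unexpected";

struct Error {
    message: String,
}

struct Parser {
    error_cost: u16,
    errors: Vec<Error>,
}

// `true` when the parser has no errors or its first error is a plain
// "Unexpected ..." message rather than a custom one.
fn starts_with_unexpected_error(p: &Parser) -> bool {
    p.errors
        .first()
        .map_or(true, |e| e.message.starts_with(UNEXPECTED))
}

fn pick_best(parsers: Vec<Parser>) -> Parser {
    parsers
        .into_iter()
        .min_by(|a, b| {
            // Lowest error cost wins; ties go to the parser with the generic
            // "Unexpected" error (`.reverse()` makes `true` sort first).
            Ord::cmp(&a.error_cost, &b.error_cost).then_with(|| {
                Ord::cmp(
                    &starts_with_unexpected_error(a),
                    &starts_with_unexpected_error(b),
                )
                .reverse()
            })
        })
        .expect("at least one parser is always kept")
}

fn main() {
    let best = pick_best(vec![
        Parser {
            error_cost: 3,
            errors: vec![Error { message: "missing comma in shape".into() }],
        },
        Parser {
            error_cost: 3,
            errors: vec![Error { message: "Unexpected ')'".into() }],
        },
    ]);
    println!("{}", best.errors[0].message); // Unexpected ')'
}
```

With `.reverse()`, `true` compares as the smaller value, so `min_by` prefers the generic-error parser whenever two candidates cost the same.
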
4 changes: 1 addition & 3 deletions edb/edgeql-parser/src/tokenizer.rs
@@ -123,8 +123,7 @@ pub enum Kind {
Keyword(Keyword),

Ident,
EOF,
EOI, // <$> (needed for LR parser)
EOI, // end of input
Epsilon, // <e> (needed for LR parser)

StartBlock,
@@ -1056,7 +1055,6 @@ impl Kind {
use Kind::*;
Some(match self {
Ident => "identifier",
EOF => "end of file",
EOI => "end of input",

BinStr => "binary constant",
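
A related nuance from the `Display` change in parser.rs: with `unwrap_or_default()` gone, a placeholder or empty token whose kind has no user-friendly name now falls through to the regular formatting instead of printing an empty string. A simplified sketch of that pattern — the types and the fallback formatting here are assumptions, not the real `Terminal`:

```rust
use std::fmt;

// Simplified stand-ins for the real `Kind` and `Terminal`.
enum Kind {
    Ident,
    Str,
    EOI,
}

impl Kind {
    fn user_friendly_text(&self) -> Option<&'static str> {
        match self {
            Kind::Ident => Some("identifier"),
            Kind::EOI => Some("end of input"),
            Kind::Str => None, // no friendly name: Display falls through
        }
    }
}

struct Terminal {
    kind: Kind,
    text: String,
    is_placeholder: bool,
}

impl fmt::Display for Terminal {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if (self.is_placeholder && matches!(self.kind, Kind::Ident)) || self.text.is_empty() {
            // Short-circuit only when a friendly name exists; otherwise fall
            // through so the token still gets a non-empty representation.
            if let Some(user_friendly) = self.kind.user_friendly_text() {
                return write!(f, "{user_friendly}");
            }
        }
        write!(f, "'{}'", self.text)
    }
}

fn main() {
    let eoi = Terminal { kind: Kind::EOI, text: String::new(), is_placeholder: false };
    let s = Terminal { kind: Kind::Str, text: String::new(), is_placeholder: false };
    println!("{eoi}"); // end of input
    println!("{s}");   // ''
}
```
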
2 changes: 1 addition & 1 deletion edb/edgeql-parser/src/validation.rs
@@ -239,7 +239,7 @@ impl<'a> Iterator for WithEof<'a> {
let pos = self.inner.current_pos().offset;

Some(Ok(Token {
kind: Kind::EOF,
kind: Kind::EOI,
text: "".into(),
value: None,
span: Span {
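
This is where the first of the two EOI tokens described in the parser.rs comment comes from: the iterator wrapper appends a single, zero-width end-of-input token carrying the end offset, and the parser injects the second one itself. A simplified sketch of such an adapter, with assumed types rather than the real `WithEof`:

```rust
// Simplified stand-ins; the real adapter wraps the tokenizer and reads the
// end offset from it instead of tracking text lengths.
#[derive(Debug)]
enum Kind {
    Ident,
    EOI,
}

struct Token {
    kind: Kind,
    text: String,
    offset: u64,
}

struct WithEoi<I: Iterator<Item = Token>> {
    inner: I,
    offset: u64,
    emitted_eoi: bool,
}

impl<I: Iterator<Item = Token>> Iterator for WithEoi<I> {
    type Item = Token;

    fn next(&mut self) -> Option<Token> {
        if let Some(tok) = self.inner.next() {
            // Remember how far into the source we are, so the sentinel can
            // point at the end of input.
            self.offset = tok.offset + tok.text.len() as u64;
            return Some(tok);
        }
        if self.emitted_eoi {
            return None;
        }
        self.emitted_eoi = true;
        // Exactly one zero-width EOI token after the last real token.
        Some(Token { kind: Kind::EOI, text: String::new(), offset: self.offset })
    }
}

fn main() {
    let tokens = vec![Token { kind: Kind::Ident, text: "User".into(), offset: 0 }];
    let with_eoi = WithEoi { inner: tokens.into_iter(), offset: 0, emitted_eoi: false };
    for t in with_eoi {
        println!("{:?} at {}", t.kind, t.offset);
    }
    // Ident at 0
    // EOI at 4
}
```
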
40 changes: 17 additions & 23 deletions edb/edgeql/parser/grammar/start.py
@@ -44,29 +44,23 @@
class EdgeQLGrammar(Nonterm):
"%start"

@parsing.inline(1)
def reduce_STARTBLOCK_EdgeQLBlock_EOF(self, *kids):
pass
def reduce_STARTBLOCK_EdgeQLBlock_EOI(self, *kids):
self.val = kids[1].val

@parsing.inline(1)
def reduce_STARTEXTENSION_CreateExtensionPackageCommandsBlock_EOF(self, *k):
pass
def reduce_STARTEXTENSION_CreateExtensionPackageCommandsBlock_EOI(self, *k):
self.val = k[1].val

@parsing.inline(1)
def reduce_STARTMIGRATION_CreateMigrationCommandsBlock_EOF(self, *kids):
pass
def reduce_STARTMIGRATION_CreateMigrationCommandsBlock_EOI(self, *kids):
self.val = kids[1].val

@parsing.inline(1)
def reduce_STARTFRAGMENT_ExprStmt_EOF(self, *kids):
pass
def reduce_STARTFRAGMENT_ExprStmt_EOI(self, *kids):
self.val = kids[1].val

@parsing.inline(1)
def reduce_STARTFRAGMENT_Expr_EOF(self, *kids):
pass
def reduce_STARTFRAGMENT_Expr_EOI(self, *kids):
self.val = kids[1].val

@parsing.inline(1)
def reduce_STARTSDLDOCUMENT_SDLDocument(self, *kids):
pass
def reduce_STARTSDLDOCUMENT_SDLDocument_EOI(self, *kids):
self.val = kids[1].val


class EdgeQLBlock(Nonterm):
@@ -110,12 +104,12 @@ class StatementBlock(


class SDLDocument(Nonterm):
def reduce_OptSemicolons_EOF(self, *kids):
def reduce_OptSemicolons(self, *kids):
self.val = qlast.Schema(declarations=[])

def reduce_statement_without_semicolons(self, *kids):
r"""%reduce \
OptSemicolons SDLShortStatement EOF
OptSemicolons SDLShortStatement
"""
declarations = [kids[1].val]
commondl._validate_declarations(declarations)
Expand All @@ -124,18 +118,18 @@ def reduce_statement_without_semicolons(self, *kids):
def reduce_statements_without_optional_trailing_semicolons(self, *kids):
r"""%reduce \
OptSemicolons SDLStatements \
OptSemicolons SDLShortStatement EOF
OptSemicolons SDLShortStatement
"""
declarations = kids[1].val + [kids[3].val]
commondl._validate_declarations(declarations)
self.val = qlast.Schema(declarations=declarations)

def reduce_OptSemicolons_SDLStatements_EOF(self, *kids):
def reduce_OptSemicolons_SDLStatements(self, *kids):
declarations = kids[1].val
commondl._validate_declarations(declarations)
self.val = qlast.Schema(declarations=declarations)

def reduce_OptSemicolons_SDLStatements_Semicolons_EOF(self, *kids):
def reduce_OptSemicolons_SDLStatements_Semicolons(self, *kids):
declarations = kids[1].val
commondl._validate_declarations(declarations)
self.val = qlast.Schema(declarations=declarations)
2 changes: 1 addition & 1 deletion edb/edgeql/parser/grammar/tokens.py
@@ -268,7 +268,7 @@ class T_IDENT(Token):
pass


class T_EOF(Token):
class T_EOI(Token):
pass

