Skip to content

Commit

Permalink
Improve parser errors on unexpected tokens (#698)
Browse files Browse the repository at this point in the history
Signed-off-by: John Kastner <[email protected]>
  • Loading branch information
john-h-kastner-aws authored Mar 7, 2024
1 parent 1dc9f9e commit 5ed3ff9
Show file tree
Hide file tree
Showing 6 changed files with 251 additions and 53 deletions.
66 changes: 66 additions & 0 deletions cedar-policy-core/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,7 @@ mod parse_tests {
use super::*;
use crate::test_utils::*;
use cool_asserts::assert_matches;
use itertools::Itertools;
use miette::Diagnostic;

#[test]
Expand Down Expand Up @@ -1059,4 +1060,69 @@ mod parse_tests {
}
})
}

#[test]
fn unexpected_token_errors() {
    /// Parse `policy`, expect failure, and check that the error carries
    /// exactly one diagnostic label whose text equals `label`.
    #[track_caller]
    fn assert_labeled_span(policy: &str, label: impl Into<String>) {
        assert_matches!(parse_policy(None, policy), Err(e) => {
            let actual_label = match e.labels() {
                None => None,
                Some(labels) => labels
                    .exactly_one()
                    .ok()
                    .expect("Assumed that there would be exactly one label if labels are present")
                    .label()
                    .map(String::from),
            };
            assert_eq!(Some(label.into()), actual_label, "Did not see expected labeled span.");
        });
    }

    // Special-case identifier tokens are collapsed into a plain "identifier".
    assert_labeled_span("@", "expected identifier");
    assert_labeled_span(
        "permit(principal, action, resource) when { principal.",
        "expected identifier",
    );

    // We specifically want `when` or `unless`, but we previously listed all
    // identifier tokens, so this is an improvement.
    assert_labeled_span(
        "permit(principal, action, resource)",
        "expected `;` or identifier",
    );
    // AST actually requires `permit` or `forbid`, but the grammar looks for
    // any identifier.
    assert_labeled_span("@if(\"a\")", "expected `@` or identifier");
    // AST actually requires `principal` (`action`, `resource`, resp.). In
    // the `principal` case we also claim to expect `)` because an empty scope
    // initially parses to a CST. The trailing comma rules this out in the others.
    assert_labeled_span("permit(", "expected `)` or identifier");
    assert_labeled_span("permit(,,);", "expected `)` or identifier");
    assert_labeled_span("permit(principal,", "expected identifier");
    assert_labeled_span("permit(principal,action,", "expected identifier");
    // Nothing will actually convert to an AST here.
    assert_labeled_span("permit(principal,action,resource,", "expected identifier");
    // We still list out `if` as an expected token because it doesn't get
    // parsed as an ident in this position.
    assert_labeled_span(
        "permit(principal, action, resource) when {",
        "expected `!`, `(`, `-`, `[`, `{`, `}`, `false`, identifier, `if`, number, `?principal`, `?resource`, string literal, or `true`",
    );
    // The right operand of an `is` gets parsed as any `Expr`, so we will
    // list out all the possible expression tokens even though _only_
    // `identifier` is accepted. This choice allows nicer error messages for
    // `principal is User::"alice"`, but it doesn't work in our favor here.
    assert_labeled_span(
        "permit(principal, action, resource) when { principal is",
        "expected `!`, `(`, `-`, `[`, `{`, `false`, identifier, `if`, number, `?principal`, `?resource`, string literal, or `true`",
    );

    // We expect binary operators, but don't claim to expect `=`, `%` or
    // `/`. We still expect `::` even though `true` is a reserved identifier
    // and so we can't have an entity reference `true::"eid"`
    assert_labeled_span(
        "permit(principal, action, resource) when { if true",
        "expected `!=`, `&&`, `(`, `*`, `+`, `-`, `.`, `::`, `<`, `<=`, `==`, `>`, `>=`, `[`, `||`, `has`, `in`, `is`, `like`, or `then`",
    );
}
}
140 changes: 105 additions & 35 deletions cedar-policy-core/src/parser/err.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

use std::collections::HashMap;
use std::collections::{BTreeSet, HashMap, HashSet};
use std::fmt::{self, Display, Write};
use std::iter;
use std::ops::{Deref, DerefMut};
Expand Down Expand Up @@ -556,11 +556,11 @@ impl Diagnostic for ToCSTError {
let labeled_span = match &self.err {
OwnedRawParseError::InvalidToken { .. } => LabeledSpan::underline(primary_source_span),
OwnedRawParseError::UnrecognizedEof { expected, .. } => LabeledSpan::new_with_span(
expected_to_string(expected, &FRIENDLY_TOKEN_NAMES),
expected_to_string(expected, &POLICY_TOKEN_CONFIG),
primary_source_span,
),
OwnedRawParseError::UnrecognizedToken { expected, .. } => LabeledSpan::new_with_span(
expected_to_string(expected, &FRIENDLY_TOKEN_NAMES),
expected_to_string(expected, &POLICY_TOKEN_CONFIG),
primary_source_span,
),
OwnedRawParseError::ExtraToken { .. } => LabeledSpan::underline(primary_source_span),
Expand All @@ -570,41 +570,111 @@ impl Diagnostic for ToCSTError {
}
}

/// Defines configurable rules for how tokens in an `UnrecognizedToken` or
/// `UnrecognizedEof` error should be displayed to users.
#[derive(Debug)]
pub struct ExpectedTokenConfig {
    /// Defines user-friendly names for tokens used by our parser. Keys are the
    /// names of tokens as defined in the `.lalrpop` grammar file. A token may
    /// be omitted from this map if the name is already friendly enough.
    pub friendly_token_names: HashMap<&'static str, &'static str>,

    /// Some tokens defined in our grammar always cause later processing to fail.
    /// Our policy grammar defines a token for the mod operator `%`, but we
    /// reject any CST that uses the operator. To reduce confusion we filter
    /// these from the list of expected tokens in an error message.
    pub impossible_tokens: HashSet<&'static str>,

    /// Both our policy and schema grammars have a generic identifier token
    /// and some more specific identifier tokens that we use to parse specific
    /// constructs. It is very often not useful to explicitly list out all of
    /// these special identifiers because the parser really just wants any
    /// generic identifier. That it would accept these does not give any
    /// useful information.
    pub special_identifier_tokens: HashSet<&'static str>,

    /// If this token is expected, then the parser expected a generic identifier, so
    /// we omit the specific identifiers in favor of saying we expect an "identifier".
    pub identifier_sentinel: &'static str,

    /// Special identifiers that may be worth displaying even if the parser
    /// wants a generic identifier. These tokens will be parsed as something
    /// other than an identifier when they occur as the first token in an
    /// expression (or a type, in the case of the schema grammar).
    pub first_set_identifier_tokens: HashSet<&'static str>,

    /// If this token is expected, then the parser was looking to start parsing
    /// an expression (or type, in the schema). We know that we should report the
    /// tokens that aren't parsed as identifiers at the start of an expression.
    pub first_set_sentinel: &'static str,
}

lazy_static! {
    /// Keys mirror the token names defined in the `match` block of
    /// `grammar.lalrpop`.
    // NOTE(review): this map appears to be superseded by the
    // `friendly_token_names` field of `POLICY_TOKEN_CONFIG` below, which
    // repeats the same entries (minus `OTHER_SLOT`) — confirm whether anything
    // still reads it and whether it can be removed.
    static ref FRIENDLY_TOKEN_NAMES: HashMap<&'static str, &'static str> = HashMap::from([
        ("TRUE", "`true`"),
        ("FALSE", "`false`"),
        ("IF", "`if`"),
        ("PERMIT", "`permit`"),
        ("FORBID", "`forbid`"),
        ("WHEN", "`when`"),
        ("UNLESS", "`unless`"),
        ("IN", "`in`"),
        ("HAS", "`has`"),
        ("LIKE", "`like`"),
        ("IS", "`is`"),
        ("THEN", "`then`"),
        ("ELSE", "`else`"),
        ("PRINCIPAL", "`principal`"),
        ("ACTION", "`action`"),
        ("RESOURCE", "`resource`"),
        ("CONTEXT", "`context`"),
        ("PRINCIPAL_SLOT", "`?principal`"),
        ("RESOURCE_SLOT", "`?resource`"),
        ("OTHER_SLOT", "template slot"),
        ("IDENTIFIER", "identifier"),
        ("NUMBER", "number"),
        ("STRINGLIT", "string literal"),
    ]);
    /// Display rules for the "expected token" lists shown in policy-parser
    /// errors. Keys mirror the token names defined in the `match` block of
    /// `grammar.lalrpop`.
    static ref POLICY_TOKEN_CONFIG: ExpectedTokenConfig = ExpectedTokenConfig {
        friendly_token_names: HashMap::from([
            ("TRUE", "`true`"),
            ("FALSE", "`false`"),
            ("IF", "`if`"),
            ("PERMIT", "`permit`"),
            ("FORBID", "`forbid`"),
            ("WHEN", "`when`"),
            ("UNLESS", "`unless`"),
            ("IN", "`in`"),
            ("HAS", "`has`"),
            ("LIKE", "`like`"),
            ("IS", "`is`"),
            ("THEN", "`then`"),
            ("ELSE", "`else`"),
            ("PRINCIPAL", "`principal`"),
            ("ACTION", "`action`"),
            ("RESOURCE", "`resource`"),
            ("CONTEXT", "`context`"),
            ("PRINCIPAL_SLOT", "`?principal`"),
            ("RESOURCE_SLOT", "`?resource`"),
            ("IDENTIFIER", "identifier"),
            ("NUMBER", "number"),
            ("STRINGLIT", "string literal"),
        ]),
        // Tokens the grammar lexes but later processing always rejects (see
        // `ExpectedTokenConfig::impossible_tokens`); filtered from messages.
        impossible_tokens: HashSet::from(["\"=\"", "\"%\"", "\"/\"", "OTHER_SLOT"]),
        // Keyword-like tokens that are accepted anywhere a generic identifier is.
        special_identifier_tokens: HashSet::from([
            "PERMIT",
            "FORBID",
            "WHEN",
            "UNLESS",
            "IN",
            "HAS",
            "LIKE",
            "IS",
            "THEN",
            "ELSE",
            "PRINCIPAL",
            "ACTION",
            "RESOURCE",
            "CONTEXT",
        ]),
        identifier_sentinel: "IDENTIFIER",
        // `true`/`false`/`if` are not parsed as identifiers when they begin an
        // expression, so they stay listed in that position.
        first_set_identifier_tokens: HashSet::from(["TRUE", "FALSE", "IF"]),
        // presumably `!` can only start an expression, so expecting it marks an
        // expression first-set position — TODO confirm against grammar.lalrpop
        first_set_sentinel: "\"!\"",
    };
}

/// Format lalrpop expected error messages
pub fn expected_to_string(
expected: &[String],
token_map: &HashMap<&'static str, &'static str>,
) -> Option<String> {
pub fn expected_to_string(expected: &[String], config: &ExpectedTokenConfig) -> Option<String> {
let mut expected = expected
.iter()
.filter(|e| !config.impossible_tokens.contains(e.as_str()))
.map(|e| e.as_str())
.collect::<BTreeSet<_>>();
if expected.contains(config.identifier_sentinel) {
for token in config.special_identifier_tokens.iter() {
expected.remove(*token);
}
if !expected.contains(config.first_set_sentinel) {
for token in config.first_set_identifier_tokens.iter() {
expected.remove(*token);
}
}
}
if expected.is_empty() {
return None;
}
Expand All @@ -616,7 +686,7 @@ pub fn expected_to_string(
&mut expected_string,
"or",
expected,
|f, token| match token_map.get(token.as_str()) {
|f, token| match config.friendly_token_names.get(token) {
Some(friendly_token_name) => write!(f, "{}", friendly_token_name),
None => write!(f, "{}", token.replace('"', "`")),
},
Expand Down
1 change: 1 addition & 0 deletions cedar-policy-validator/src/err.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ pub enum HumanSchemaError {
#[error("{0}")]
IO(#[from] std::io::Error),
#[error("{0}")]
#[diagnostic(transparent)]
Parsing(#[from] HumanSyntaxParseErrors),
}

Expand Down
64 changes: 46 additions & 18 deletions cedar-policy-validator/src/human_schema/err.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
use std::{collections::HashMap, fmt::Display};

use cedar_policy_core::parser::{err::expected_to_string, unescape::UnescapeError, Loc, Node};
use std::{
collections::{HashMap, HashSet},
fmt::Display,
};

use cedar_policy_core::parser::{
err::{expected_to_string, ExpectedTokenConfig},
unescape::UnescapeError,
Loc, Node,
};
use lalrpop_util as lalr;
use lazy_static::lazy_static;
use miette::{Diagnostic, LabeledSpan, SourceSpan};
Expand Down Expand Up @@ -30,19 +37,40 @@ pub(crate) type RawErrorRecovery<'a> = lalr::ErrorRecovery<RawLocation, RawToken
type OwnedRawParseError = lalr::ParseError<RawLocation, String, RawUserError>;

lazy_static! {
    /// Keys mirror the token names defined in the `match` block of
    /// `grammar.lalrpop`.
    // NOTE(review): this map appears to be superseded by the
    // `friendly_token_names` field of `SCHEMA_TOKEN_CONFIG` below — confirm
    // whether anything still reads it and whether it can be removed.
    static ref FRIENDLY_TOKEN_NAMES: HashMap<&'static str, &'static str> = HashMap::from([
        ("IN", "`in`"),
        ("PRINCIPAL", "`principal`"),
        ("ACTION", "`action`"),
        ("RESOURCE", "`resource`"),
        ("CONTEXT", "`context`"),
        ("STRINGLIT", "string literal"),
        ("ENTITY", "`entity`"),
        ("NAMESPACE", "`namespace`"),
        ("TYPE", "`type`"),
    ]);
    /// Display rules for the "expected token" lists shown in schema-parser
    /// errors. Keys mirror the token names defined in the `match` block of
    /// the schema's `grammar.lalrpop`.
    static ref SCHEMA_TOKEN_CONFIG: ExpectedTokenConfig = ExpectedTokenConfig {
        friendly_token_names: HashMap::from([
            ("IN", "`in`"),
            ("PRINCIPAL", "`principal`"),
            ("ACTION", "`action`"),
            ("RESOURCE", "`resource`"),
            ("CONTEXT", "`context`"),
            ("STRINGLIT", "string literal"),
            ("ENTITY", "`entity`"),
            ("NAMESPACE", "`namespace`"),
            ("TYPE", "`type`"),
            ("SET", "`Set`"),
            ("IDENTIFIER", "identifier"),
        ]),
        // No schema tokens are unconditionally rejected by later processing.
        impossible_tokens: HashSet::new(),
        // Keyword-like tokens that are accepted anywhere a generic identifier is.
        special_identifier_tokens: HashSet::from([
            "NAMESPACE",
            "ENTITY",
            "IN",
            "TYPE",
            "APPLIESTO",
            "PRINCIPAL",
            "ACTION",
            "RESOURCE",
            "CONTEXT",
            "ATTRIBUTES",
            "LONG",
            "STRING",
            "BOOL",
        ]),
        identifier_sentinel: "IDENTIFIER",
        // `Set` is not parsed as an identifier when it begins a type, so it
        // stays listed in that position.
        first_set_identifier_tokens: HashSet::from(["SET"]),
        // presumably `{` (a record type) can only appear where a type starts,
        // so expecting it marks a type first-set position — TODO confirm
        first_set_sentinel: "\"{\"",
    };
}

/// For errors during parsing
Expand Down Expand Up @@ -115,11 +143,11 @@ impl Diagnostic for ParseError {
let labeled_span = match err {
OwnedRawParseError::InvalidToken { .. } => LabeledSpan::underline(primary_source_span),
OwnedRawParseError::UnrecognizedEof { expected, .. } => LabeledSpan::new_with_span(
expected_to_string(expected, &FRIENDLY_TOKEN_NAMES),
expected_to_string(expected, &SCHEMA_TOKEN_CONFIG),
primary_source_span,
),
OwnedRawParseError::UnrecognizedToken { expected, .. } => LabeledSpan::new_with_span(
expected_to_string(expected, &FRIENDLY_TOKEN_NAMES),
expected_to_string(expected, &SCHEMA_TOKEN_CONFIG),
primary_source_span,
),
OwnedRawParseError::ExtraToken { .. } => LabeledSpan::underline(primary_source_span),
Expand Down
31 changes: 31 additions & 0 deletions cedar-policy-validator/src/human_schema/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ mod demo_tests {
NamespaceDefinition, SchemaFragment, SchemaTypeVariant, TypeOfAttribute,
};

use itertools::Itertools;
use miette::Diagnostic;

#[test]
fn no_applies_to() {
let src = r#"
Expand Down Expand Up @@ -756,6 +759,34 @@ mod demo_tests {
_ => panic!("Wrong type for shape"),
}
}

#[test]
fn expected_tokens() {
    /// Parse `src` as a human-readable schema, expect failure, and check that
    /// the error carries exactly one diagnostic label equal to `label`.
    #[track_caller]
    fn assert_labeled_span(src: &str, label: impl Into<String>) {
        assert_matches!(SchemaFragment::from_str_natural(src).map(|(s, _)| s), Err(e) => {
            let actual_label = match e.labels() {
                None => None,
                Some(labels) => labels
                    .exactly_one()
                    .ok()
                    .expect("Assumed that there would be exactly one label if labels are present")
                    .label()
                    .map(String::from),
            };
            assert_eq!(Some(label.into()), actual_label, "Did not see expected labeled span.");
        });
    }

    // Special-case identifier tokens are collapsed into a plain "identifier".
    assert_labeled_span("namespace", "expected identifier");
    assert_labeled_span("type", "expected identifier");
    assert_labeled_span("entity", "expected identifier");
    assert_labeled_span("action", "expected identifier or string literal");
    // `Set` is still listed because it is not an identifier at the start of a type.
    assert_labeled_span("type t =", "expected `{`, identifier, or `Set`");
    assert_labeled_span(
        "entity User {",
        "expected `}`, identifier, or string literal",
    );
    assert_labeled_span("entity User { name:", "expected `{`, identifier, or `Set`");
}
}

#[cfg(test)]
Expand Down
2 changes: 2 additions & 0 deletions cedar-policy/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
return a `RequestBuilder<&Schema>` so the `RequestBuilder<&Schema>::build`
method checks the request against the schema provided and the
`RequestBuilder<UnsetSchema>::build` method becomes infallible. (#559)
- Improved "unexpected token" parse errors when the schema or policy parsers
expect an identifier. (#698)

### Fixed

Expand Down

0 comments on commit 5ed3ff9

Please sign in to comment.