Skip to content

Commit

Permalink
Improve parser errors on unexpected tokens (#698)
Browse files Browse the repository at this point in the history
Signed-off-by: John Kastner <[email protected]>
  • Loading branch information
john-h-kastner-aws authored Mar 7, 2024
1 parent 1dc9f9e commit 5ed3ff9
Show file tree
Hide file tree
Showing 6 changed files with 251 additions and 53 deletions.
66 changes: 66 additions & 0 deletions cedar-policy-core/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,7 @@ mod parse_tests {
use super::*;
use crate::test_utils::*;
use cool_asserts::assert_matches;
use itertools::Itertools;
use miette::Diagnostic;

#[test]
Expand Down Expand Up @@ -1059,4 +1060,69 @@ mod parse_tests {
}
})
}

#[test]
fn unexpected_token_errors() {
    /// Parse `policy`, expect failure, and check that the error carries
    /// exactly one diagnostic label whose text equals `label`.
    #[track_caller]
    fn assert_labeled_span(policy: &str, label: impl Into<String>) {
        assert_matches!(parse_policy(None, policy), Err(e) => {
            let actual_label = match e.labels() {
                None => None,
                Some(labels) => labels
                    .exactly_one()
                    .ok()
                    .expect("Assumed that there would be exactly one label if labels are present")
                    .label()
                    .map(String::from),
            };
            assert_eq!(Some(label.into()), actual_label, "Did not see expected labeled span.");
        });
    }

    // Special-case identifier tokens are collapsed into a plain "identifier".
    assert_labeled_span("@", "expected identifier");
    assert_labeled_span(
        "permit(principal, action, resource) when { principal.",
        "expected identifier",
    );

    // We specifically want `when` or `unless`, but we previously listed all
    // identifier tokens, so this is an improvement.
    assert_labeled_span(
        "permit(principal, action, resource)",
        "expected `;` or identifier",
    );
    // AST actually requires `permit` or `forbid`, but the grammar looks for
    // any identifier.
    assert_labeled_span("@if(\"a\")", "expected `@` or identifier");
    // AST actually requires `principal` (`action`, `resource`, resp.). In
    // the `principal` case we also claim to expect `)` because an empty scope
    // initially parses to a CST. The trailing comma rules this out in the others.
    assert_labeled_span("permit(", "expected `)` or identifier");
    assert_labeled_span("permit(,,);", "expected `)` or identifier");
    assert_labeled_span("permit(principal,", "expected identifier");
    assert_labeled_span("permit(principal,action,", "expected identifier");
    // Nothing will actually convert to an AST here.
    assert_labeled_span("permit(principal,action,resource,", "expected identifier");
    // We still list out `if` as an expected token because it doesn't get
    // parsed as an ident in this position.
    assert_labeled_span(
        "permit(principal, action, resource) when {",
        "expected `!`, `(`, `-`, `[`, `{`, `}`, `false`, identifier, `if`, number, `?principal`, `?resource`, string literal, or `true`",
    );
    // The right operand of an `is` gets parsed as any `Expr`, so we will
    // list out all the possible expression tokens even though _only_
    // `identifier` is accepted. This choice allows nicer error messages for
    // `principal is User::"alice"`, but it doesn't work in our favor here.
    assert_labeled_span(
        "permit(principal, action, resource) when { principal is",
        "expected `!`, `(`, `-`, `[`, `{`, `false`, identifier, `if`, number, `?principal`, `?resource`, string literal, or `true`",
    );

    // We expect binary operators, but don't claim to expect `=`, `%` or
    // `/`. We still expect `::` even though `true` is a reserved identifier
    // and so we can't have an entity reference `true::"eid"`
    assert_labeled_span(
        "permit(principal, action, resource) when { if true",
        "expected `!=`, `&&`, `(`, `*`, `+`, `-`, `.`, `::`, `<`, `<=`, `==`, `>`, `>=`, `[`, `||`, `has`, `in`, `is`, `like`, or `then`",
    );
}
}
140 changes: 105 additions & 35 deletions cedar-policy-core/src/parser/err.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

use std::collections::HashMap;
use std::collections::{BTreeSet, HashMap, HashSet};
use std::fmt::{self, Display, Write};
use std::iter;
use std::ops::{Deref, DerefMut};
Expand Down Expand Up @@ -556,11 +556,11 @@ impl Diagnostic for ToCSTError {
let labeled_span = match &self.err {
OwnedRawParseError::InvalidToken { .. } => LabeledSpan::underline(primary_source_span),
OwnedRawParseError::UnrecognizedEof { expected, .. } => LabeledSpan::new_with_span(
expected_to_string(expected, &FRIENDLY_TOKEN_NAMES),
expected_to_string(expected, &POLICY_TOKEN_CONFIG),
primary_source_span,
),
OwnedRawParseError::UnrecognizedToken { expected, .. } => LabeledSpan::new_with_span(
expected_to_string(expected, &FRIENDLY_TOKEN_NAMES),
expected_to_string(expected, &POLICY_TOKEN_CONFIG),
primary_source_span,
),
OwnedRawParseError::ExtraToken { .. } => LabeledSpan::underline(primary_source_span),
Expand All @@ -570,41 +570,111 @@ impl Diagnostic for ToCSTError {
}
}

/// Defines configurable rules for how tokens in an `UnrecognizedToken` or
/// `UnrecognizedEof` error should be displayed to users.
#[derive(Debug)]
pub struct ExpectedTokenConfig {
    /// Defines user-friendly names for tokens used by our parser. Keys are the
    /// names of tokens as defined in the `.lalrpop` grammar file. A token may
    /// be omitted from this map if the name is already friendly enough.
    pub friendly_token_names: HashMap<&'static str, &'static str>,

    /// Some tokens defined in our grammar always cause later processing to fail.
    /// Our policy grammar defines a token for the mod operator `%`, but we
    /// reject any CST that uses the operator. To reduce confusion we filter
    /// these from the list of expected tokens in an error message.
    pub impossible_tokens: HashSet<&'static str>,

    /// Both our policy and schema grammars have a generic identifier token
    /// and some more specific identifier tokens that we use to parse specific
    /// constructs. It is very often not useful to explicitly list out all of
    /// these special identifiers because the parser really just wants any
    /// generic identifier. That it would accept these does not give any
    /// useful information.
    pub special_identifier_tokens: HashSet<&'static str>,

    /// If this token is expected, then the parser expected a generic identifier, so
    /// we omit the specific identifiers in favor of saying we expect an "identifier".
    pub identifier_sentinel: &'static str,

    /// Special identifiers that may be worth displaying even if the parser
    /// wants a generic identifier. These tokens will be parsed as something
    /// other than an identifier when they occur as the first token in an
    /// expression (or a type, in the case of the schema grammar).
    pub first_set_identifier_tokens: HashSet<&'static str>,

    /// If this token is expected, then the parser was looking to start parsing
    /// an expression (or type, in the schema). We know that we should report the
    /// tokens that aren't parsed as identifiers at the start of an expression.
    pub first_set_sentinel: &'static str,
}

lazy_static! {
    /// Keys mirror the token names defined in the `match` block of
    /// `grammar.lalrpop`.
    // NOTE(review): this map appears to be superseded by the
    // `friendly_token_names` field of `POLICY_TOKEN_CONFIG` below, which
    // repeats the same entries (minus `OTHER_SLOT`) — confirm whether anything
    // still reads it and whether it can be removed.
    static ref FRIENDLY_TOKEN_NAMES: HashMap<&'static str, &'static str> = HashMap::from([
        ("TRUE", "`true`"),
        ("FALSE", "`false`"),
        ("IF", "`if`"),
        ("PERMIT", "`permit`"),
        ("FORBID", "`forbid`"),
        ("WHEN", "`when`"),
        ("UNLESS", "`unless`"),
        ("IN", "`in`"),
        ("HAS", "`has`"),
        ("LIKE", "`like`"),
        ("IS", "`is`"),
        ("THEN", "`then`"),
        ("ELSE", "`else`"),
        ("PRINCIPAL", "`principal`"),
        ("ACTION", "`action`"),
        ("RESOURCE", "`resource`"),
        ("CONTEXT", "`context`"),
        ("PRINCIPAL_SLOT", "`?principal`"),
        ("RESOURCE_SLOT", "`?resource`"),
        ("OTHER_SLOT", "template slot"),
        ("IDENTIFIER", "identifier"),
        ("NUMBER", "number"),
        ("STRINGLIT", "string literal"),
    ]);
    /// Display rules for the "expected token" lists shown in policy-parser
    /// errors. Keys mirror the token names defined in the `match` block of
    /// `grammar.lalrpop`.
    static ref POLICY_TOKEN_CONFIG: ExpectedTokenConfig = ExpectedTokenConfig {
        friendly_token_names: HashMap::from([
            ("TRUE", "`true`"),
            ("FALSE", "`false`"),
            ("IF", "`if`"),
            ("PERMIT", "`permit`"),
            ("FORBID", "`forbid`"),
            ("WHEN", "`when`"),
            ("UNLESS", "`unless`"),
            ("IN", "`in`"),
            ("HAS", "`has`"),
            ("LIKE", "`like`"),
            ("IS", "`is`"),
            ("THEN", "`then`"),
            ("ELSE", "`else`"),
            ("PRINCIPAL", "`principal`"),
            ("ACTION", "`action`"),
            ("RESOURCE", "`resource`"),
            ("CONTEXT", "`context`"),
            ("PRINCIPAL_SLOT", "`?principal`"),
            ("RESOURCE_SLOT", "`?resource`"),
            ("IDENTIFIER", "identifier"),
            ("NUMBER", "number"),
            ("STRINGLIT", "string literal"),
        ]),
        // Tokens the grammar lexes but later processing always rejects (see
        // `ExpectedTokenConfig::impossible_tokens`); filtered from messages.
        impossible_tokens: HashSet::from(["\"=\"", "\"%\"", "\"/\"", "OTHER_SLOT"]),
        // Keyword-like tokens that are accepted anywhere a generic identifier is.
        special_identifier_tokens: HashSet::from([
            "PERMIT",
            "FORBID",
            "WHEN",
            "UNLESS",
            "IN",
            "HAS",
            "LIKE",
            "IS",
            "THEN",
            "ELSE",
            "PRINCIPAL",
            "ACTION",
            "RESOURCE",
            "CONTEXT",
        ]),
        identifier_sentinel: "IDENTIFIER",
        // `true`/`false`/`if` are not parsed as identifiers when they begin an
        // expression, so they stay listed in that position.
        first_set_identifier_tokens: HashSet::from(["TRUE", "FALSE", "IF"]),
        // presumably `!` can only start an expression, so expecting it marks an
        // expression first-set position — TODO confirm against grammar.lalrpop
        first_set_sentinel: "\"!\"",
    };
}

/// Format lalrpop expected error messages
pub fn expected_to_string(
expected: &[String],
token_map: &HashMap<&'static str, &'static str>,
) -> Option<String> {
pub fn expected_to_string(expected: &[String], config: &ExpectedTokenConfig) -> Option<String> {
let mut expected = expected
.iter()
.filter(|e| !config.impossible_tokens.contains(e.as_str()))
.map(|e| e.as_str())
.collect::<BTreeSet<_>>();
if expected.contains(config.identifier_sentinel) {
for token in config.special_identifier_tokens.iter() {
expected.remove(*token);
}
if !expected.contains(config.first_set_sentinel) {
for token in config.first_set_identifier_tokens.iter() {
expected.remove(*token);
}
}
}
if expected.is_empty() {
return None;
}
Expand All @@ -616,7 +686,7 @@ pub fn expected_to_string(
&mut expected_string,
"or",
expected,
|f, token| match token_map.get(token.as_str()) {
|f, token| match config.friendly_token_names.get(token) {
Some(friendly_token_name) => write!(f, "{}", friendly_token_name),
None => write!(f, "{}", token.replace('"', "`")),
},
Expand Down
1 change: 1 addition & 0 deletions cedar-policy-validator/src/err.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ pub enum HumanSchemaError {
#[error("{0}")]
IO(#[from] std::io::Error),
#[error("{0}")]
#[diagnostic(transparent)]
Parsing(#[from] HumanSyntaxParseErrors),
}

Expand Down
64 changes: 46 additions & 18 deletions cedar-policy-validator/src/human_schema/err.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
use std::{collections::HashMap, fmt::Display};

use cedar_policy_core::parser::{err::expected_to_string, unescape::UnescapeError, Loc, Node};
use std::{
collections::{HashMap, HashSet},
fmt::Display,
};

use cedar_policy_core::parser::{
err::{expected_to_string, ExpectedTokenConfig},
unescape::UnescapeError,
Loc, Node,
};
use lalrpop_util as lalr;
use lazy_static::lazy_static;
use miette::{Diagnostic, LabeledSpan, SourceSpan};
Expand Down Expand Up @@ -30,19 +37,40 @@ pub(crate) type RawErrorRecovery<'a> = lalr::ErrorRecovery<RawLocation, RawToken
type OwnedRawParseError = lalr::ParseError<RawLocation, String, RawUserError>;

lazy_static! {
    /// Keys mirror the token names defined in the `match` block of
    /// `grammar.lalrpop`.
    // NOTE(review): this map appears to be superseded by the
    // `friendly_token_names` field of `SCHEMA_TOKEN_CONFIG` below — confirm
    // whether anything still reads it and whether it can be removed.
    static ref FRIENDLY_TOKEN_NAMES: HashMap<&'static str, &'static str> = HashMap::from([
        ("IN", "`in`"),
        ("PRINCIPAL", "`principal`"),
        ("ACTION", "`action`"),
        ("RESOURCE", "`resource`"),
        ("CONTEXT", "`context`"),
        ("STRINGLIT", "string literal"),
        ("ENTITY", "`entity`"),
        ("NAMESPACE", "`namespace`"),
        ("TYPE", "`type`"),
    ]);
    /// Display rules for the "expected token" lists shown in schema-parser
    /// errors. Keys mirror the token names defined in the `match` block of
    /// the schema's `grammar.lalrpop`.
    static ref SCHEMA_TOKEN_CONFIG: ExpectedTokenConfig = ExpectedTokenConfig {
        friendly_token_names: HashMap::from([
            ("IN", "`in`"),
            ("PRINCIPAL", "`principal`"),
            ("ACTION", "`action`"),
            ("RESOURCE", "`resource`"),
            ("CONTEXT", "`context`"),
            ("STRINGLIT", "string literal"),
            ("ENTITY", "`entity`"),
            ("NAMESPACE", "`namespace`"),
            ("TYPE", "`type`"),
            ("SET", "`Set`"),
            ("IDENTIFIER", "identifier"),
        ]),
        // No schema tokens are unconditionally rejected by later processing.
        impossible_tokens: HashSet::new(),
        // Keyword-like tokens that are accepted anywhere a generic identifier is.
        special_identifier_tokens: HashSet::from([
            "NAMESPACE",
            "ENTITY",
            "IN",
            "TYPE",
            "APPLIESTO",
            "PRINCIPAL",
            "ACTION",
            "RESOURCE",
            "CONTEXT",
            "ATTRIBUTES",
            "LONG",
            "STRING",
            "BOOL",
        ]),
        identifier_sentinel: "IDENTIFIER",
        // `Set` is not parsed as an identifier when it begins a type, so it
        // stays listed in that position.
        first_set_identifier_tokens: HashSet::from(["SET"]),
        // presumably `{` (a record type) can only appear where a type starts,
        // so expecting it marks a type first-set position — TODO confirm
        first_set_sentinel: "\"{\"",
    };
}

/// For errors during parsing
Expand Down Expand Up @@ -115,11 +143,11 @@ impl Diagnostic for ParseError {
let labeled_span = match err {
OwnedRawParseError::InvalidToken { .. } => LabeledSpan::underline(primary_source_span),
OwnedRawParseError::UnrecognizedEof { expected, .. } => LabeledSpan::new_with_span(
expected_to_string(expected, &FRIENDLY_TOKEN_NAMES),
expected_to_string(expected, &SCHEMA_TOKEN_CONFIG),
primary_source_span,
),
OwnedRawParseError::UnrecognizedToken { expected, .. } => LabeledSpan::new_with_span(
expected_to_string(expected, &FRIENDLY_TOKEN_NAMES),
expected_to_string(expected, &SCHEMA_TOKEN_CONFIG),
primary_source_span,
),
OwnedRawParseError::ExtraToken { .. } => LabeledSpan::underline(primary_source_span),
Expand Down
31 changes: 31 additions & 0 deletions cedar-policy-validator/src/human_schema/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ mod demo_tests {
NamespaceDefinition, SchemaFragment, SchemaTypeVariant, TypeOfAttribute,
};

use itertools::Itertools;
use miette::Diagnostic;

#[test]
fn no_applies_to() {
let src = r#"
Expand Down Expand Up @@ -756,6 +759,34 @@ mod demo_tests {
_ => panic!("Wrong type for shape"),
}
}

#[test]
fn expected_tokens() {
    /// Parse `src` as a human-readable schema, expect failure, and check that
    /// the error carries exactly one diagnostic label equal to `label`.
    #[track_caller]
    fn assert_labeled_span(src: &str, label: impl Into<String>) {
        assert_matches!(SchemaFragment::from_str_natural(src).map(|(s, _)| s), Err(e) => {
            let actual_label = match e.labels() {
                None => None,
                Some(labels) => labels
                    .exactly_one()
                    .ok()
                    .expect("Assumed that there would be exactly one label if labels are present")
                    .label()
                    .map(String::from),
            };
            assert_eq!(Some(label.into()), actual_label, "Did not see expected labeled span.");
        });
    }

    // Special-case identifier tokens are collapsed into a plain "identifier".
    assert_labeled_span("namespace", "expected identifier");
    assert_labeled_span("type", "expected identifier");
    assert_labeled_span("entity", "expected identifier");
    assert_labeled_span("action", "expected identifier or string literal");
    // `Set` is still listed because it is not an identifier at the start of a type.
    assert_labeled_span("type t =", "expected `{`, identifier, or `Set`");
    assert_labeled_span(
        "entity User {",
        "expected `}`, identifier, or string literal",
    );
    assert_labeled_span("entity User { name:", "expected `{`, identifier, or `Set`");
}
}

#[cfg(test)]
Expand Down
2 changes: 2 additions & 0 deletions cedar-policy/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
return a `RequestBuilder<&Schema>` so the `RequestBuilder<&Schema>::build`
method checks the request against the schema provided and the
`RequestBuilder<UnsetSchema>::build` method becomes infallible. (#559)
- Improved "unexpected token" parse errors when the schema or policy parsers
expect an identifier. (#698)

### Fixed

Expand Down

0 comments on commit 5ed3ff9

Please sign in to comment.