Merge pull request #2035 from usethesource/error-disambiguation-in-java

Reimplemented error tree disambiguation in Java
usethesource · Oct 8, 2024 · b0e4514 · b0e4514
2 parents c039de5 + 9827f5a
commit b0e4514
Show file tree

Hide file tree

Showing 17 changed files with 432 additions and 123 deletions.
diff --git a/src/org/rascalmpl/library/ParseTree.rsc b/src/org/rascalmpl/library/ParseTree.rsc
@@ -144,7 +144,7 @@ extend Message;
 extend List;
 
 import String;
-import Set;
+import util::Maybe;
 
 @synopsis{The Tree data type as produced by the parser.}
 @description{
@@ -355,6 +355,16 @@ The latter option terminates much faster, i.e. always in cubic time, and always
 while constructing ambiguous parse forests may grow to O(n^p+1), where p is the length of the longest production rule and n 
 is the length of the input.
 
+The `allowRecovery` option can be set to `true` to enable error recovery. This is an experimental feature.
+When error recovery is enabled, the parser will attempt to recover from parse errors and continue parsing.
+If successful, a parse tree with error and skipped productions is returned (see the definition of `Production` above).
+A number of functions are provided to analyze trees with errors, for example `hasErrors`, `getSkipped`, and `getErrorText`.
+Note that the resulting parse forest can contain a lot of error nodes. `disambiguateErrors` can be used to prune the forest
+and leave a tree with a single error (or even none), based on simple heuristics.
+When `allowAmbiguity` is set to `false`, `allowRecovery` is set to `true`, and `filters` is empty, this disambiguation is done
+automatically, so you should end up with a tree with no error ambiguities. Regular ambiguities can still occur
+and will result in an error.
+
 The `filters` set contains functions which may be called optionally after the parse algorithm has finished and just before
 the Tree representation is built. The set contains alternative functions; only one of them is successfully applied
 to each node in a tree. If such a function fails to apply, the other ones are tried. There is no fixed-point computation, so
@@ -784,7 +794,15 @@ list[Tree] findAllErrors(Tree tree) =  [err | /err:appl(error(_, _, _), _) := tr
 Tree findFirstError(/err:appl(error(_, _, _), _)) = err;
 
 @synopsis{Find the best error in a tree containing errors. Returns `just(err)` for the first error left after error disambiguation, or `nothing()` if no error is left.}
-Tree findBestError(Tree tree) = findFirstError(defaultErrorDisambiguationFilter(tree));
+Maybe[Tree] findBestError(Tree tree) {
+  Tree disambiguated = disambiguateErrors(tree);
+  if (/err:appl(error(_, _, _), _) := disambiguated) {
+    return just(err);
+  }
+
+  // All errors have disappeared
+  return nothing();
+}
 
 @synopsis{Get the symbol (sort) of the failing production}
 Symbol getErrorSymbol(appl(error(Symbol sym, _, _), _)) = sym;
@@ -803,35 +821,9 @@ If you want the text of the whole error tree, you can just use string interpolat
 }
 str getErrorText(appl(error(_, _, _), [*_, appl(skipped(_), chars)])) = stringChars([c | char(c) <- chars]);
 
+@javaClass{org.rascalmpl.parser.gtd.recovery.ParseErrorDisambiguator}
 @synopsis{Error recovery often produces ambiguous trees where errors can be recovered in multiple ways.
 This filter removes error trees until no ambiguities caused by error recovery are left.
-Note that regular ambiguous trees remain in the parse forest.
-}
-Tree defaultErrorDisambiguationFilter(Tree t) {
-  return visit(t) {
-    case a:amb(_) => ambDisambiguation(a)
-  };
-}
-
-private Tree ambDisambiguation(amb(set[Tree] alternatives)) {
-  // Go depth-first
-  rel[int score, Tree alt] scoredErrorTrees = { <scoreErrors(alt), alt> | Tree alt <- alternatives };
-  set[Tree] nonErrorTrees = scoredErrorTrees[0];
-
-  if (nonErrorTrees == {}) {
-    return (getFirstFrom(scoredErrorTrees) | it.score > c.score ? c : it | c <- scoredErrorTrees).alt;
-  }
-  
-  if ({Tree single} := nonErrorTrees) {
-    // One ambiguity left, no ambiguity concerns here
-    return single;
-  }
-  
-  // Multiple non-error trees left, return an ambiguity node with just the non-error trees
-  return amb(nonErrorTrees);
+Note that regular ambiguous trees remain in the parse forest unless `allowAmbiguity` is set to false, in which case an error is thrown.
 }
-
-private int scoreErrors(Tree t) = (0 | it + getSkipped(e).src.length | /e:appl(error(_,_,_),_) := t);
-
-// Handle char and cycle nodes
-default Tree defaultErrorDisambiguationFilter(Tree t) = t;
+java Tree disambiguateErrors(Tree t, bool allowAmbiguity=true);
diff --git a/src/org/rascalmpl/library/lang/rascal/tests/concrete/recovery/BasicRecoveryTests.rsc b/src/org/rascalmpl/library/lang/rascal/tests/concrete/recovery/BasicRecoveryTests.rsc
@@ -15,6 +15,7 @@
 module lang::rascal::tests::concrete::recovery::BasicRecoveryTests
 
 import ParseTree;
+import util::Maybe;
 
 layout Layout = [\ ]* !>> [\ ];
 
@@ -33,15 +34,23 @@ test bool basicOk() {
 
+// Parses "a b x $" with parseS and checks that the text of the best error is "x ".
 test bool abx() {
     Tree t = parseS("a b x $");
-    return getErrorText(findBestError(t)) == "x ";
+    return getErrorText(findBestError(t).val) == "x ";
 }
 
+// Parses "a x c $" with parseS and checks that the text of the best error is "x c".
 test bool axc() {
     Tree t = parseS("a x c $");
-    return getErrorText(findBestError(t)) == "x c";
+    return getErrorText(findBestError(t).val) == "x c";
 }
 
+// Parses "a x $" twice and compares manual and automatic error disambiguation:
+// - the tree from parseS is expected to contain 3 errors, with "x " as the text of the best error;
+// - a parser created with allowRecovery=true and allowAmbiguity=false is expected to return a tree
+//   with exactly 1 error, whose text equals the best error of the first tree.
 test bool ax() {
-    Tree t = parseS("a x $");
-    return getErrorText(findBestError(t)) == "x ";
+    str input = "a x $";
+
+    Tree t = parseS(input);
+    assert size(findAllErrors(t)) == 3;
+    assert getErrorText(findBestError(t).val) == "x ";
+
+    Tree autoDisambiguated = parser(#S, allowRecovery=true, allowAmbiguity=false)(input, |unknown:///|);
+    assert size(findAllErrors(autoDisambiguated)) == 1;
+
+    return getErrorText(findFirstError(autoDisambiguated)) == getErrorText(findBestError(t).val);
 }
diff --git a/src/org/rascalmpl/library/lang/rascal/tests/concrete/recovery/ErrorRecoveryBenchmark.rsc b/src/org/rascalmpl/library/lang/rascal/tests/concrete/recovery/ErrorRecoveryBenchmark.rsc
@@ -33,24 +33,35 @@ void runLanguageTests() {
     testRecoveryRascal();
 }
 
-void runRascalBatchTest(int maxFiles=1000, int maxFileSize=4000) {
+// Runs `batchRecoveryTest` for the Rascal syntax ("Module" in Rascal.rsc) over the ".rsc" files under |std:///|,
+// forwarding the file-selection parameters and the stats file location, then prints the elapsed time and the stats.
+void runRascalBatchTest(int maxFiles=1000, int minFileSize=0, int maxFileSize=4000, int fromFile=0) {
     int startTime = realTime();
-    TestStats stats = batchRecoveryTest(|std:///lang/rascal/syntax/Rascal.rsc|, "Module", |std:///|, ".rsc", maxFiles, maxFileSize);
+    TestStats stats = batchRecoveryTest(|std:///lang/rascal/syntax/Rascal.rsc|, "Module", |std:///|, ".rsc", maxFiles, minFileSize, maxFileSize, fromFile, |cwd:///rascal-recovery-stats.csv|);
     int duration = realTime() - startTime;
     println();
-    println("========================im========================================");
+    println("================================================================");
     println("Rascal batch test done in <duration/1000> seconds, total result:");
     printStats(stats);
 }
 
+// Command line entry point. All positional arguments are optional and are read in this order:
+// max-files, min-file-size, max-file-size, from-file. Missing arguments keep the defaults below.
 int main(list[str] args) {
     int maxFiles = 1000;
-    int maxFileSize = 4000;
-    if (size(args) == 2) {
+    int maxFileSize = 1000000;
+    int minFileSize = 0;
+    int fromFile = 0;
+    if (size(args) > 0) {
         maxFiles = toInt(args[0]);
-        maxFileSize = toInt(args[1]);
-    } else if (size(args) != 0) {
-        println("Usage: ErrorRecoveryBenchmark <max-files> <max-file-size>");
     }
+    if (size(args) > 1) {
+        minFileSize = toInt(args[1]);
+    }
+    if (size(args) > 2) {
+        maxFileSize = toInt(args[2]);
+    }
+    if (size(args) > 3) {
+        fromFile = toInt(args[3]);
+    }
+    if (size(args) > 4) {
+        // Every argument is optional, so only surplus arguments indicate wrong usage.
+        // (Previously the usage text was printed for every call with fewer than four arguments.)
+        println("Usage: ErrorRecoveryBenchmark [\<max-files\> [\<min-file-size\> [\<max-file-size\> [\<from-file\>]]]]");
+    }
+
+    runRascalBatchTest(maxFiles=maxFiles, minFileSize=minFileSize, maxFileSize=maxFileSize, fromFile=fromFile);
 
-    runRascalBatchTest(maxFiles=maxFiles, maxFileSize=maxFileSize);
     return 0;
 }
diff --git a/src/org/rascalmpl/library/lang/rascal/tests/concrete/recovery/NestedRecoveryTests.rsc b/src/org/rascalmpl/library/lang/rascal/tests/concrete/recovery/NestedRecoveryTests.rsc
@@ -15,6 +15,7 @@
 module lang::rascal::tests::concrete::recovery::NestedRecoveryTests
 
 import ParseTree;
+import util::Maybe;
 
 layout Layout = [\ ]* !>> [\ ];
 
@@ -35,5 +36,5 @@ test bool nestedOk() {
 
+// Parses "a b x c" with parseS and checks that the text of the best error is "x ".
 test bool nestedTypo() {
     Tree t = parseS("a b x c");
-    return getErrorText(findFirstError(defaultErrorDisambiguationFilter(t))) == "x ";
+    return getErrorText(findBestError(t).val) == "x ";
 }
diff --git a/src/org/rascalmpl/library/lang/rascal/tests/concrete/recovery/PicoRecoveryTests.rsc b/src/org/rascalmpl/library/lang/rascal/tests/concrete/recovery/PicoRecoveryTests.rsc
@@ -20,6 +20,7 @@ import ParseTree;
 
 import IO;
 import String;
+import util::Maybe;
 
 Tree parsePico(str input, bool visualize=false) 
     = parser(#Program, allowRecovery=true, allowAmbiguity=true)(input, |unknown:///?visualize=<"<visualize>">|);

diff --git a/src/org/rascalmpl/library/lang/rascal/tests/concrete/recovery/RascalRecoveryTests.rsc b/src/org/rascalmpl/library/lang/rascal/tests/concrete/recovery/RascalRecoveryTests.rsc
@@ -18,6 +18,9 @@ import lang::rascal::\syntax::Rascal;
 
 import ParseTree;
 import IO;
+import util::Maybe;
+
+// Debug switch of this test module, declared exactly once.
+// NOTE(review): presumably guards the diagnostic printing in `parseRascal` - confirm.
 bool debugging = false;
 
@@ -31,8 +34,8 @@ Tree parseRascal(type[&T] t, str input, bool visualize=false) {
             println("- <getErrorText(error)>");
         }
 
-        Tree disambiguated = defaultErrorDisambiguationFilter(result);
-        println("Best error: <getErrorText(findFirstError(disambiguated))>");
+        println("Best error: <getErrorText(findBestError(result).val)>");
+    }
     }
     }
 
@@ -96,7 +99,7 @@ test bool rascalMissingCloseParen() {
     Tree t = parseRascal("module A void f({} void g(){}");
 
     assert getErrorText(findFirstError(t)) == "void g(";
-    assert getErrorText(findFirstError(defaultErrorDisambiguationFilter(t))) == "(";
+    assert getErrorText(findBestError(t).val) == "(";
 
     return true;
 }
@@ -106,7 +109,7 @@ test bool rascalFunctionDeclarationMissingCloseParen() {
 
     assert getErrorText(findFirstError(t)) == "void g(";
 
-    Tree error = findFirstError(defaultErrorDisambiguationFilter(t));
+    Tree error = findBestError(t).val;
     assert getErrorText(error) == "(";
     loc location = getSkipped(error).src;
     assert location.begin.column == 16 && location.length == 1;
@@ -116,14 +119,14 @@ test bool rascalFunctionDeclarationMissingCloseParen() {
 
+// Parses the function declaration "void f(){if(){1;}}" (an `if` without a condition)
+// and checks that the text of the best error is ")".
 test bool rascalIfMissingExpr() {
     Tree t = parseFunctionDeclaration("void f(){if(){1;}}", visualize=false);
-    return getErrorText(findFirstError(t)) == ")";
+    return getErrorText(findBestError(t).val) == ")";
 }
 
+// Parses a module in which function g contains `if(1){}`, prints the text of the first error found,
+// and asserts that the text of the best error is "} void h(){1".
 test bool rascalIfBodyEmpty() {
     Tree t = parseRascal("module A void f(){1;} void g(){if(1){}} void h(){1;}");
 
     println("error: <getErrorText(findFirstError(t))>");
-    assert getErrorText(findBestError(t)) == "} void h(){1";
+    assert getErrorText(findBestError(t).val) == "} void h(){1";
 
     return true;
 }