From 2725c65387ff7213585485f75e225d8a89a11476 Mon Sep 17 00:00:00 2001
From: "Jurgen J. Vinju" <Jurgen.Vinju@cwi.nl>
Date: Fri, 8 Nov 2024 17:23:56 +0100
Subject: [PATCH 1/6] added a function that can filter error trees which are in
 optional positions in a grammar (list elements and optionals)

---
 .../rascalmpl/library/util/ErrorRecovery.rsc  | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/org/rascalmpl/library/util/ErrorRecovery.rsc b/src/org/rascalmpl/library/util/ErrorRecovery.rsc
index 08673830a9..e85b25b569 100644
--- a/src/org/rascalmpl/library/util/ErrorRecovery.rsc
+++ b/src/org/rascalmpl/library/util/ErrorRecovery.rsc
@@ -51,3 +51,24 @@ This filter removes error trees until no ambiguities caused by error recovery ar
 Note that regular ambiguous trees remain in the parse forest unless `allowAmbiguity` is set to false in which case an error is thrown.
 }
 java Tree disambiguateErrors(Tree t, bool allowAmbiguity=true);
+
+@synopsis{Removes error trees if they are in optional positions.}
+@description{
+Removing grammatically optional error trees can reduce the number of case distinctions
+required to make algorithms that process parse trees robust against parse errors.
+}
+Tree filterOptionalErrorTrees(Tree t) = visit(t) {
+    case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[*pre, _sep, appl(error(_,_,_),_), *post])
+        => appl(p, [*pre, *post])[@\loc=t@\loc]
+    case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[appl(error(_,_,_),_), _sep, *post])
+        => appl(p, post)[@\loc=t@\loc]
+    case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_,_,_])),[*pre, _sep1, _sep2, _sep3, appl(error(_,_,_),_), *post])
+        => appl(p, [*pre, *post])[@\loc=t@\loc]
+    case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_,_,_])),[appl(error(_,_,_),_), _sep1, _sep2, _sep3, *post])
+        => appl(p, post)[@\loc=t@\loc]
+    case t:appl(p:regular(/iter|iter-star/(_)),[*pre, appl(error(_,_,_),_), *post])
+        => appl(p, [*pre, *post])[@\loc=t@\loc]
+    case t:appl(p:regular(opt(_)), appl(error(_,_,_), _)) 
+        => appl(p, [])[@\loc=t@\loc]
+    // TODO: some forms of recursion could be flattened in the presence of errors mid-way.
+};

From 66061173aad478e6e488c46cc478f7eb878aa1cd Mon Sep 17 00:00:00 2001
From: "Jurgen J. Vinju" <Jurgen.Vinju@cwi.nl>
Date: Mon, 11 Nov 2024 19:44:18 +0100
Subject: [PATCH 2/6] added more experimental features to error recovery

---
 .../rascalmpl/library/util/ErrorRecovery.rsc  | 53 ++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/src/org/rascalmpl/library/util/ErrorRecovery.rsc b/src/org/rascalmpl/library/util/ErrorRecovery.rsc
index e85b25b569..e502592c5b 100644
--- a/src/org/rascalmpl/library/util/ErrorRecovery.rsc
+++ b/src/org/rascalmpl/library/util/ErrorRecovery.rsc
@@ -57,7 +57,7 @@ java Tree disambiguateErrors(Tree t, bool allowAmbiguity=true);
 Removing grammatically optional error trees can reduce the number of case distinctions
 required to make algorithms that process parse trees robust against parse errors.
 }
-Tree filterOptionalErrorTrees(Tree t) = visit(t) {
+Tree filterOptionalErrorTrees(Tree x) = visit(x) {
     case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[*pre, _sep, appl(error(_,_,_),_), *post])
         => appl(p, [*pre, *post])[@\loc=t@\loc]
     case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[appl(error(_,_,_),_), _sep, *post])
@@ -72,3 +72,54 @@ Tree filterOptionalErrorTrees(Tree t) = visit(t) {
         => appl(p, [])[@\loc=t@\loc]
     // TODO: some forms of recursion could be flattened in the presence of errors mid-way.
 };
+
+@synopsis{Removes trees which contain error trees, if they are in optional positions.}
+@description{
+Removing grammatically optional error trees can reduce the number of case distinctions
+required to make algorithms that process parse trees robust against parse errors.
+}
+@benefits{
+* this algorithm is more aggressive and more successful in removing error trees
+then ((filterOptionalErrorTrees)) can be.
+}
+@pitfalls{
+* this algorithm may cut off entire branches which are otherwise fine to extract more information from.
+}
+Tree filterOptionalIndirectErrorTrees(Tree x) = visit(addErrorStats(x)) {
+    case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[*pre, _sep, appl(_,_, erroneous=true), *post])
+        => appl(p, [*pre, *post])[@\loc=t@\loc]
+    case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[appl(_,_, erroneous=true), _sep, *post])
+        => appl(p, post)[@\loc=t@\loc]
+    case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_,_,_])),[*pre, _sep1, _sep2, _sep3, appl(_,_, erroneous=true), *post])
+        => appl(p, [*pre, *post])[@\loc=t@\loc]
+    case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_,_,_])),[appl(_,_, erroneous=true), _sep1, _sep2, _sep3, *post])
+        => appl(p, post)[@\loc=t@\loc]
+    case t:appl(p:regular(/iter|iter-star/(_)),[*pre, appl(_,_, erroneous=true), *post])
+        => appl(p, [*pre, *post])[@\loc=t@\loc]
+    case t:appl(p:regular(opt(_)), appl(_, _, erroneous=true)) 
+        => appl(p, [])[@\loc=t@\loc]
+} 
+
+@synopsis{Fields for storing the result of ((addErrorStats))}
+data Tree(int skipped = 0, bool erroneous=false);
+
+@synopsis{Annotates all nodes of a parse tree with error recovery statistics}
+@description{
+After this algorithm all nodes contain this information:
+* `int skipped` for the total number of skipped characters in a tree
+* `bool erroneous` that marks sub-trees which do contain errors with `true` while others remain `false`
+}
+@benefits{
+* having aggregated information available locally at each node can be handy when filtering
+parse forests
+}
+@pitfalls{
+* statistics do not tell the whole truth about sub-trees. Filtering based on these numbers
+must be seen as a heuristic that sometimes pays-off, but sometimes hides crucial information.
+}
+Tree addErrorStats(Tree x) = bottom-up visit(x) {
+    case t:appl(skipped(_), args)   => t[skipped = size(args)][erroneous = true]
+    case t:appl(error(_,_,_), args) => t[skipped = (0 | it + a.skipped | a <- args)][erroneous = true]
+    case t:appl(prod(_,_,_), args)  => t[skipped = (0 | it + a.skipped | a <- args)][erroneous = (false | it || a.erroneous | a <- args)]
+    case t:amb(alts)                => t[skipped = (0 | min([it, a.skipped]) | a <- alts)][erroneous = (false | it && a.erroneous | a <- alts)]
+};
\ No newline at end of file

From 7a4d95909bf439af0f722394b0587086048fa928 Mon Sep 17 00:00:00 2001
From: "Jurgen J. Vinju" <Jurgen.Vinju@cwi.nl>
Date: Tue, 12 Nov 2024 09:41:17 +0100
Subject: [PATCH 3/6] fixes and doc additions

---
 .../rascalmpl/library/util/ErrorRecovery.rsc  | 74 +++++++++++++++----
 1 file changed, 59 insertions(+), 15 deletions(-)

diff --git a/src/org/rascalmpl/library/util/ErrorRecovery.rsc b/src/org/rascalmpl/library/util/ErrorRecovery.rsc
index e502592c5b..a4dffa2136 100644
--- a/src/org/rascalmpl/library/util/ErrorRecovery.rsc
+++ b/src/org/rascalmpl/library/util/ErrorRecovery.rsc
@@ -46,9 +46,23 @@ If you want the text of the whole error tree, you can just use string interpolat
 str getErrorText(appl(error(_, _, _), [*_, appl(skipped(_), chars)])) = stringChars([c | char(c) <- chars]);
 
 @javaClass{org.rascalmpl.library.util.ErrorRecovery}
-@synopsis{Error recovery often produces ambiguous trees where errors can be recovered in multiple ways.
-This filter removes error trees until no ambiguities caused by error recovery are left.
-Note that regular ambiguous trees remain in the parse forest unless `allowAmbiguity` is set to false in which case an error is thrown.
+@synopsis{This filter removes error trees until no ambiguities caused by error recovery are left.}
+@description{
+Error recovery often produces ambiguous trees where errors can be recovered in multiple ways. Ambiguity
+clusters (`amb`) represent the choices between all the valid prefixes. This filter removes choices until
+the last one is left. 
+
+Note that regular ambiguous trees remain in the parse forest unless `allowAmbiguity` is set to false in 
+which case an error is thrown.
+}
+@benefits{
+* after this algorithm only one error is left at every input position with an error. Downstream
+functionality does not have to deal with ambiguity anymore, making the code robust.
+}
+@pitfalls{
+* this algorithm removes valid prefixes based on heuristics like "shortest error", which may 
+remove interesting prefixes for downstream processing. In particular the accuracy of error repair and auto-complete
+may be damaged by this function. So it is best to use it for error recovery, and not for error repair.
 }
 java Tree disambiguateErrors(Tree t, bool allowAmbiguity=true);
 
@@ -85,17 +99,17 @@ then ((filterOptionalErrorTrees)) can be.
 @pitfalls{
 * this algorithm may cut off entire branches which are otherwise fine to extract more information from.
 }
-Tree filterOptionalIndirectErrorTrees(Tree x) = visit(addErrorStats(x)) {
+Tree filterOptionalIndirectErrorTrees(Tree x) = bottom-up visit(addErrorStats(x)) {
     case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[*pre, _sep, appl(_,_, erroneous=true), *post])
-        => appl(p, [*pre, *post])[@\loc=t@\loc]
+        => addStats(appl(p, [*pre, *post])[@\loc=t@\loc])
     case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[appl(_,_, erroneous=true), _sep, *post])
-        => appl(p, post)[@\loc=t@\loc]
+        => addStats(appl(p, post)[@\loc=t@\loc])
     case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_,_,_])),[*pre, _sep1, _sep2, _sep3, appl(_,_, erroneous=true), *post])
-        => appl(p, [*pre, *post])[@\loc=t@\loc]
+        => addStats(appl(p, [*pre, *post])[@\loc=t@\loc])
     case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_,_,_])),[appl(_,_, erroneous=true), _sep1, _sep2, _sep3, *post])
-        => appl(p, post)[@\loc=t@\loc]
+        => addStats(appl(p, post)[@\loc=t@\loc])
     case t:appl(p:regular(/iter|iter-star/(_)),[*pre, appl(_,_, erroneous=true), *post])
-        => appl(p, [*pre, *post])[@\loc=t@\loc]
+        => addStats(appl(p, [*pre, *post])[@\loc=t@\loc])
     case t:appl(p:regular(opt(_)), appl(_, _, erroneous=true)) 
         => appl(p, [])[@\loc=t@\loc]
 } 
@@ -115,11 +129,41 @@ parse forests
 }
 @pitfalls{
 * statistics do not tell the whole truth about sub-trees. Filtering based on these numbers
-must be seen as a heuristic that sometimes pays-off, but sometimes hides crucial information.
+must be seen as a heuristic that sometimes pays-off, and often hides crucial information.
 }
 Tree addErrorStats(Tree x) = bottom-up visit(x) {
-    case t:appl(skipped(_), args)   => t[skipped = size(args)][erroneous = true]
-    case t:appl(error(_,_,_), args) => t[skipped = (0 | it + a.skipped | a <- args)][erroneous = true]
-    case t:appl(prod(_,_,_), args)  => t[skipped = (0 | it + a.skipped | a <- args)][erroneous = (false | it || a.erroneous | a <- args)]
-    case t:amb(alts)                => t[skipped = (0 | min([it, a.skipped]) | a <- alts)][erroneous = (false | it && a.erroneous | a <- alts)]
-};
\ No newline at end of file
+    case Tree t => addStats(t)
+};
+
+@synopsis{Reusable utility for re-computing error statistics per Tree node.}
+private Tree addStats(t:appl(prod(_,_,_), args)) = t[skipped = (0 | it + a.skipped | a <- args)][erroneous = (false | it || a.erroneous | a <- args)];
+private Tree addStats(t:appl(skipped(_), args))  = t[skipped = size(args)][erroneous = true];
+private Tree addStats(t:appl(error(_,_,_), args))= t[skipped = (0 | it + a.skipped | a <- args)][erroneous = true];
+private Tree addStats(t:amb(alts))               = t[skipped = (0 | min([it, a.skipped]) | a <- alts)][erroneous = (false | it && a.erroneous | a <- alts)];
+default private Tree addStats(Tree t) = t;
+
+@synopsis{Disambiguates error ambiguity clusters by selecting the alternatives with the shortest amount of skipped characters}
+@benefits{
+* this is an aggressive filter that can greatly reduce the complexity of dealing with recovered parse trees.
+* chances are that after this filter all ambiguity has been removed, making downstream processing easier.
+}
+@pitfalls{
+* the trees with the shortest skips are not always the most relevant trees to consider for repair or recovery.
+}
+Tree selectShortestSkips(Tree x) = visit(addErrorStats(x)) {
+    case amb(alts) => amb({ a | a <- alts, a.skipped == minimum})
+        when int minimum := min([a.skipped | a <- alts])
+}
+
+@synopsis{Disambiguates error ambiguity clusters by selecting the alternatives with the largest amount of skipped characters}
+@benefits{
+* this is an aggressive filter that can greatly reduce the complexity of dealing with recovered parse trees.
+* chances are that after this filter all ambiguity has been removed, making downstream processing easier.
+}
+@pitfalls{
+* the trees with the longest skips are not always the most relevant trees to consider for repair or recovery.
+}
+Tree selectLongestSkips(Tree x) = visit(addErrorStats(x)) {
+    case amb(alts) => amb({ a | a <- alts, a.skipped == maximum})
+        when int maximum := max([a.skipped | a <- alts])
+}
\ No newline at end of file

From c447811a5b2b9bebf493f70d0f3f7bc4150913b1 Mon Sep 17 00:00:00 2001
From: "Jurgen J. Vinju" <Jurgen.Vinju@cwi.nl>
Date: Tue, 12 Nov 2024 22:04:58 +0100
Subject: [PATCH 4/6] improved docs

---
 .../rascalmpl/library/util/ErrorRecovery.rsc  | 30 ++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/org/rascalmpl/library/util/ErrorRecovery.rsc b/src/org/rascalmpl/library/util/ErrorRecovery.rsc
index a4dffa2136..c53019f068 100644
--- a/src/org/rascalmpl/library/util/ErrorRecovery.rsc
+++ b/src/org/rascalmpl/library/util/ErrorRecovery.rsc
@@ -70,6 +70,22 @@ java Tree disambiguateErrors(Tree t, bool allowAmbiguity=true);
 @description{
 Removing grammatically optional error trees can reduce the number of case distinctions
 required to make algorithms that process parse trees robust against parse errors.
+
+The algorithm works bottom-up such that only the smallest possible trees are removed.
+After every removal, new error statistics are prepared for passing up to the next level.
+If errors are completely removed by the filter, then the parents will remain unchanged.
+}
+@benefits{
+* Removing error trees increases the robustness of downstream processors
+* By removing error trees from lists, the relative `src` origins remain the same for
+downstream processing. 
+* Sets of trees in ambiguity clusters may be reduced to singletons, making the ambiguity cluster disappear.
+This also improves the robustness of downstream processors.
+}
+@pitfalls{
+* this algorithm may remove relatively large sub-trees and thus throw away valuable information.
+It is much better to use this as a recovery tool that increases the robustness of oblivious downstream processors,
+than to start an error repair or auto-complete algorithm.
 }
 Tree filterOptionalErrorTrees(Tree x) = visit(x) {
     case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[*pre, _sep, appl(error(_,_,_),_), *post])
@@ -91,13 +107,25 @@ Tree filterOptionalErrorTrees(Tree x) = visit(x) {
 @description{
 Removing grammatically optional error trees can reduce the number of case distinctions
 required to make algorithms that process parse trees robust against parse errors.
+
+The algorithm works bottom-up such that only the smallest possible trees are removed.
+After every removal, new error statistics are prepared for passing up to the next level.
+If errors are completely removed by the filter, then the parents will remain unchanged.
 }
 @benefits{
 * this algorithm is more aggressive and more successful in removing error trees
 then ((filterOptionalErrorTrees)) can be.
+* Removing error trees increases the robustness of downstream processors.
+* By removing error trees from lists, the relative `src` origins remain the same for
+downstream processing. 
+* Sets of trees in ambiguity clusters may be reduced to singletons, making the ambiguity cluster disappear.
+This also improves the robustness of downstream processors.
 }
 @pitfalls{
-* this algorithm may cut off entire branches which are otherwise fine to extract more information from.
+* this algorithm may remove (very) large sub-trees if they contain one error somewhere, and thus throw away valuable information.
+It is much better to use this as a recovery tool that increases the robustness of oblivious downstream processors,
+than to start an error repair or auto-complete algorithm.
+
 }
 Tree filterOptionalIndirectErrorTrees(Tree x) = bottom-up visit(addErrorStats(x)) {
     case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[*pre, _sep, appl(_,_, erroneous=true), *post])

From c32f3bd229ac1ba389bd2a0ac47d98ba9cdedf61 Mon Sep 17 00:00:00 2001
From: "Jurgen J. Vinju" <Jurgen.Vinju@cwi.nl>
Date: Wed, 13 Nov 2024 10:40:47 +0100
Subject: [PATCH 5/6] improved readability of  private addStats function

---
 .../rascalmpl/library/util/ErrorRecovery.rsc  | 24 +++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/org/rascalmpl/library/util/ErrorRecovery.rsc b/src/org/rascalmpl/library/util/ErrorRecovery.rsc
index c53019f068..f36f2577e7 100644
--- a/src/org/rascalmpl/library/util/ErrorRecovery.rsc
+++ b/src/org/rascalmpl/library/util/ErrorRecovery.rsc
@@ -164,10 +164,26 @@ Tree addErrorStats(Tree x) = bottom-up visit(x) {
 };
 
 @synopsis{Reusable utility for re-computing error statistics per Tree node.}
-private Tree addStats(t:appl(prod(_,_,_), args)) = t[skipped = (0 | it + a.skipped | a <- args)][erroneous = (false | it || a.erroneous | a <- args)];
-private Tree addStats(t:appl(skipped(_), args))  = t[skipped = size(args)][erroneous = true];
-private Tree addStats(t:appl(error(_,_,_), args))= t[skipped = (0 | it + a.skipped | a <- args)][erroneous = true];
-private Tree addStats(t:amb(alts))               = t[skipped = (0 | min([it, a.skipped]) | a <- alts)][erroneous = (false | it && a.erroneous | a <- alts)];
+private Tree addStats(t:appl(prod(_,_,_), args)) 
+    = t
+        [skipped = (0 | it + a.skipped | a <- args)]
+        [erroneous = (false | it || a.erroneous | a <- args)];
+
+private Tree addStats(t:appl(skipped(_), args))  
+    = t
+        [skipped = size(args)]
+        [erroneous = true];
+
+private Tree addStats(t:appl(error(_,_,_), args))
+    = t
+        [skipped = (0 | it + a.skipped | a <- args)]
+        [erroneous = true];
+
+private Tree addStats(t:amb(alts)) 
+    = t
+        [skipped = (0 | min([it, a.skipped]) | a <- alts)]
+        [erroneous = (false | it && a.erroneous | a <- alts)];
+        
 default private Tree addStats(Tree t) = t;
 
 @synopsis{Disambiguates error ambiguity clusters by selecting the alternatives with the shortest amount of skipped characters}

From 27b64f2a68ba4a678b1f35533de7ec3402f20d46 Mon Sep 17 00:00:00 2001
From: "Jurgen J. Vinju" <Jurgen.Vinju@cwi.nl>
Date: Wed, 13 Nov 2024 10:42:37 +0100
Subject: [PATCH 6/6] added comments

---
 src/org/rascalmpl/library/util/ErrorRecovery.rsc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/org/rascalmpl/library/util/ErrorRecovery.rsc b/src/org/rascalmpl/library/util/ErrorRecovery.rsc
index f36f2577e7..cd06eac071 100644
--- a/src/org/rascalmpl/library/util/ErrorRecovery.rsc
+++ b/src/org/rascalmpl/library/util/ErrorRecovery.rsc
@@ -164,6 +164,13 @@ Tree addErrorStats(Tree x) = bottom-up visit(x) {
 };
 
 @synopsis{Reusable utility for re-computing error statistics per Tree node.}
+@description{
+This function must be applied in a bottom-up manner to make sense.
+}
+@benefits{
+* different bottom-up algorithms can reuse this function to re-compute
+the statistics based on a new state of the tree.
+}
 private Tree addStats(t:appl(prod(_,_,_), args)) 
     = t
         [skipped = (0 | it + a.skipped | a <- args)]
@@ -183,7 +190,7 @@ private Tree addStats(t:amb(alts))
     = t
         [skipped = (0 | min([it, a.skipped]) | a <- alts)]
         [erroneous = (false | it && a.erroneous | a <- alts)];
-        
+
 default private Tree addStats(Tree t) = t;
 
 @synopsis{Disambiguates error ambiguity clusters by selecting the alternatives with the shortest amount of skipped characters}