From 2725c65387ff7213585485f75e225d8a89a11476 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Fri, 8 Nov 2024 17:23:56 +0100 Subject: [PATCH 1/6] added a function that can filter error trees which are in optional positions in a grammar (list elements and optionals) --- .../rascalmpl/library/util/ErrorRecovery.rsc | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/org/rascalmpl/library/util/ErrorRecovery.rsc b/src/org/rascalmpl/library/util/ErrorRecovery.rsc index 08673830a9..e85b25b569 100644 --- a/src/org/rascalmpl/library/util/ErrorRecovery.rsc +++ b/src/org/rascalmpl/library/util/ErrorRecovery.rsc @@ -51,3 +51,24 @@ This filter removes error trees until no ambiguities caused by error recovery ar Note that regular ambiguous trees remain in the parse forest unless `allowAmbiguity` is set to false in which case an error is thrown. } java Tree disambiguateErrors(Tree t, bool allowAmbiguity=true); + +@synopsis{Removes error trees if they are in optional positions.} +@description{ +Removing grammatically optional error trees can reduce the number of case distinctions +required to make algorithms that process parse trees robust against parse errors. 
+} +Tree filterOptionalErrorTrees(Tree t) = visit(t) { + case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[*pre, _sep, appl(error(_,_,_),_), *post]) + => appl(p, [*pre, *post])[@\loc=t@\loc] + case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[appl(error(_,_,_),_), _sep, *post]) + => appl(p, post)[@\loc=t@\loc] + case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_,_,_])),[*pre, _sep1, _sep2, _sep3, appl(error(_,_,_),_), *post]) + => appl(p, [*pre, *post])[@\loc=t@\loc] + case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_,_,_])),[appl(error(_,_,_),_), _sep1, _sep2, _sep3, *post]) + => appl(p, post)[@\loc=t@\loc] + case t:appl(p:regular(/iter|iter-star/(_)),[*pre, appl(error(_,_,_),_), *post]) + => appl(p, [*pre, *post])[@\loc=t@\loc] + case t:appl(p:regular(opt(_)), [appl(error(_,_,_), _)]) + => appl(p, [])[@\loc=t@\loc] + // TODO: some forms of recursion could be flattened in the presence of errors mid-way. +}; From 66061173aad478e6e488c46cc478f7eb878aa1cd Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Mon, 11 Nov 2024 19:44:18 +0100 Subject: [PATCH 2/6] added more experimental features to error recovery --- .../rascalmpl/library/util/ErrorRecovery.rsc | 53 ++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/src/org/rascalmpl/library/util/ErrorRecovery.rsc b/src/org/rascalmpl/library/util/ErrorRecovery.rsc index e85b25b569..e502592c5b 100644 --- a/src/org/rascalmpl/library/util/ErrorRecovery.rsc +++ b/src/org/rascalmpl/library/util/ErrorRecovery.rsc @@ -57,7 +57,7 @@ java Tree disambiguateErrors(Tree t, bool allowAmbiguity=true); Removing grammatically optional error trees can reduce the number of case distinctions required to make algorithms that process parse trees robust against parse errors. 
} -Tree filterOptionalErrorTrees(Tree t) = visit(t) { +Tree filterOptionalErrorTrees(Tree x) = visit(x) { case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[*pre, _sep, appl(error(_,_,_),_), *post]) => appl(p, [*pre, *post])[@\loc=t@\loc] case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[appl(error(_,_,_),_), _sep, *post]) @@ -72,3 +72,54 @@ Tree filterOptionalErrorTrees(Tree t) = visit(t) { => appl(p, [])[@\loc=t@\loc] // TODO: some forms of recursion could be flattened in the presence of errors mid-way. }; + +@synopsis{Removes trees which contain error trees, if they are in optional positions.} +@description{ +Removing grammatically optional error trees can reduce the number of case distinctions +required to make algorithms that process parse trees robust against parse errors. +} +@benefits{ +* this algorithm is more aggressive and more successful in removing error trees +then ((filterOptionalErrorTrees)) can be. +} +@pitfalls{ +* this algorithm may cut off entire branches which are otherwise fine to extract more information from. 
+} +Tree filterOptionalIndirectErrorTrees(Tree x) = visit(addErrorStats(x)) { + case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[*pre, _sep, appl(_,_, erroneous=true), *post]) + => appl(p, [*pre, *post])[@\loc=t@\loc] + case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[appl(_,_, erroneous=true), _sep, *post]) + => appl(p, post)[@\loc=t@\loc] + case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_,_,_])),[*pre, _sep1, _sep2, _sep3, appl(_,_, erroneous=true), *post]) + => appl(p, [*pre, *post])[@\loc=t@\loc] + case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_,_,_])),[appl(_,_, erroneous=true), _sep1, _sep2, _sep3, *post]) + => appl(p, post)[@\loc=t@\loc] + case t:appl(p:regular(/iter|iter-star/(_)),[*pre, appl(_,_, erroneous=true), *post]) + => appl(p, [*pre, *post])[@\loc=t@\loc] + case t:appl(p:regular(opt(_)), appl(_, _, erroneous=true)) + => appl(p, [])[@\loc=t@\loc] +} + +@synopsis{Fields for storing the result of ((addErrorStats))} +data Tree(int skipped = 0, bool erroneous=false); + +@synopsis{Annotates all nodes of a parse tree with error recovery statistics} +@description{ +After this algorithm all nodes contain this information: +* `int skipped` for the total number of skipped characters in a tree +* `bool erroneous` that marks sub-trees which do contain errors with `true` while others remain `false` +} +@benefits{ +* local information about aggegrated information can be handy when filtering +parse forests +} +@pitfalls{ +* statistics do not tell the whole truth about sub-trees. Filtering based on these numbers +must be seen as a heuristic that sometimes pays-off, but sometimes hides crucial information. 
+} +Tree addErrorStats(Tree x) = bottom-up visit(x) { + case t:appl(skipped(_), args) => t[skipped = size(args)][erroneous = true] + case t:appl(error(_,_,_), args) => t[skipped = (0 | it + a.skipped | a <- args)][erroneous = true] + case t:appl(prod(_,_,_), args) => t[skipped = (0 | it + a.skipped | a <- args)][erroneous = (false | it || a.erroneous | a <- args)] + case t:amb(alts) => t[skipped = (0 | min([it, a.skipped]) | a <- alts)][erroneous = (false | it && a.erroneous | a <- alts)] +}; \ No newline at end of file From 7a4d95909bf439af0f722394b0587086048fa928 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Tue, 12 Nov 2024 09:41:17 +0100 Subject: [PATCH 3/6] fixes and doc additions --- .../rascalmpl/library/util/ErrorRecovery.rsc | 74 +++++++++++++++---- 1 file changed, 59 insertions(+), 15 deletions(-) diff --git a/src/org/rascalmpl/library/util/ErrorRecovery.rsc b/src/org/rascalmpl/library/util/ErrorRecovery.rsc index e502592c5b..a4dffa2136 100644 --- a/src/org/rascalmpl/library/util/ErrorRecovery.rsc +++ b/src/org/rascalmpl/library/util/ErrorRecovery.rsc @@ -46,9 +46,23 @@ If you want the text of the whole error tree, you can just use string interpolat str getErrorText(appl(error(_, _, _), [*_, appl(skipped(_), chars)])) = stringChars([c | char(c) <- chars]); @javaClass{org.rascalmpl.library.util.ErrorRecovery} -@synopsis{Error recovery often produces ambiguous trees where errors can be recovered in multiple ways. -This filter removes error trees until no ambiguities caused by error recovery are left. -Note that regular ambiguous trees remain in the parse forest unless `allowAmbiguity` is set to false in which case an error is thrown. +@synopsis{This filter removes error trees until no ambiguities caused by error recovery are left.} +@description{ +Error recovery often produces ambiguous trees where errors can be recovered in multiple ways. Ambiguity +clusters (`amb`) represent the choices between all the valid prefixes. 
This filter removes choices until +the last one is left. + +Note that regular ambiguous trees remain in the parse forest unless `allowAmbiguity` is set to false in +which case an error is thrown. +} +@benefits{ +* after this algorithm only one error is left at every input position with an error. Downstream +functionality does not have to deal with ambiguity anymore, making the code robust. +} +@pitfalls{ +* this algorithm removes valid prefixes based on heuristics like "shortest error", which may +remove interesting prefixes for downstream processing. In particular the accuracy of error repair and auto-complete +may be damaged by this function. So it is best to use it for error recovery, and not for error repair. } java Tree disambiguateErrors(Tree t, bool allowAmbiguity=true); @@ -85,17 +99,17 @@ then ((filterOptionalErrorTrees)) can be. @pitfalls{ * this algorithm may cut off entire branches which are otherwise fine to extract more information from. } -Tree filterOptionalIndirectErrorTrees(Tree x) = visit(addErrorStats(x)) { +Tree filterOptionalIndirectErrorTrees(Tree x) = bottom-up visit(addErrorStats(x)) { case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[*pre, _sep, appl(_,_, erroneous=true), *post]) - => appl(p, [*pre, *post])[@\loc=t@\loc] + => addStats(appl(p, [*pre, *post])[@\loc=t@\loc]) case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[appl(_,_, erroneous=true), _sep, *post]) - => appl(p, post)[@\loc=t@\loc] + => addStats(appl(p, post)[@\loc=t@\loc]) case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_,_,_])),[*pre, _sep1, _sep2, _sep3, appl(_,_, erroneous=true), *post]) - => appl(p, [*pre, *post])[@\loc=t@\loc] + => addStats(appl(p, [*pre, *post])[@\loc=t@\loc]) case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_,_,_])),[appl(_,_, erroneous=true), _sep1, _sep2, _sep3, *post]) - => appl(p, post)[@\loc=t@\loc] + => addStats(appl(p, post)[@\loc=t@\loc]) case t:appl(p:regular(/iter|iter-star/(_)),[*pre, appl(_,_, erroneous=true), *post]) - => 
appl(p, [*pre, *post])[@\loc=t@\loc] + => addStats(appl(p, [*pre, *post])[@\loc=t@\loc]) case t:appl(p:regular(opt(_)), appl(_, _, erroneous=true)) => appl(p, [])[@\loc=t@\loc] } @@ -115,11 +129,41 @@ parse forests } @pitfalls{ * statistics do not tell the whole truth about sub-trees. Filtering based on these numbers -must be seen as a heuristic that sometimes pays-off, but sometimes hides crucial information. +must be seen as a heuristic that sometimes pays-off, and often hides crucial information. } Tree addErrorStats(Tree x) = bottom-up visit(x) { - case t:appl(skipped(_), args) => t[skipped = size(args)][erroneous = true] - case t:appl(error(_,_,_), args) => t[skipped = (0 | it + a.skipped | a <- args)][erroneous = true] - case t:appl(prod(_,_,_), args) => t[skipped = (0 | it + a.skipped | a <- args)][erroneous = (false | it || a.erroneous | a <- args)] - case t:amb(alts) => t[skipped = (0 | min([it, a.skipped]) | a <- alts)][erroneous = (false | it && a.erroneous | a <- alts)] -}; \ No newline at end of file + case Tree t => addStats(t) +}; + +@synopsis{Reusable utility for re-computing error statistics per Tree node.} +private Tree addStats(t:appl(prod(_,_,_), args)) = t[skipped = (0 | it + a.skipped | a <- args)][erroneous = (false | it || a.erroneous | a <- args)]; +private Tree addStats(t:appl(skipped(_), args)) = t[skipped = size(args)][erroneous = true]; +private Tree addStats(t:appl(error(_,_,_), args))= t[skipped = (0 | it + a.skipped | a <- args)][erroneous = true]; +private Tree addStats(t:amb(alts)) = t[skipped = (0 | min([it, a.skipped]) | a <- alts)][erroneous = (false | it && a.erroneous | a <- alts)]; +default private Tree addStats(Tree t) = t; + +@synopsis{Disambiguates error ambiguity clusters by selecting the alternatives with the shortest amount of skipped characters} +@benefits{ +* this is an aggressive filter that can greatly reduce the complexity of dealing with recovered parse trees. 
+* chances are that after this filter all ambiguity has been removed, making downstream processing easier. +} +@pitfalls{ +* the trees with the shortest skips are not always the most relevant trees to consider for repair or recovery. +} +Tree selectShortestSkips(Tree x) = visit(addErrorStats(x)) { + case amb(alts) => amb({ a | a <- alts, a.skipped == minimum}) + when int minimum := min([a.skipped | a <- alts]) +} + +@synopsis{Disambiguates error ambiguity clusters by selecting the alternatives with the largest amount of skipped characters} +@benefits{ +* this is an aggressive filter that can greatly reduce the complexity of dealing with recovered parse trees. +* chances are that after this filter all ambiguity has been removed, making downstream processing easier. +} +@pitfalls{ +* the trees with the longest skips are not always the most relevant trees to consider for repair or recovery. +} +Tree selectLongestSkips(Tree x) = visit(addErrorStats(x)) { + case amb(alts) => amb({ a | a <- alts, a.skipped == maximum}) + when int maximum := max([a.skipped | a <- alts]) +} \ No newline at end of file From c447811a5b2b9bebf493f70d0f3f7bc4150913b1 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Tue, 12 Nov 2024 22:04:58 +0100 Subject: [PATCH 4/6] improved docs --- .../rascalmpl/library/util/ErrorRecovery.rsc | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/org/rascalmpl/library/util/ErrorRecovery.rsc b/src/org/rascalmpl/library/util/ErrorRecovery.rsc index a4dffa2136..c53019f068 100644 --- a/src/org/rascalmpl/library/util/ErrorRecovery.rsc +++ b/src/org/rascalmpl/library/util/ErrorRecovery.rsc @@ -70,6 +70,22 @@ java Tree disambiguateErrors(Tree t, bool allowAmbiguity=true); @description{ Removing grammatically optional error trees can reduce the number of case distinctions required to make algorithms that process parse trees robust against parse errors. 
+ +The algorithm works bottom-up such that only the smallest possible trees are removed. +After every removal, new error statistics are prepared for passing up to the next level. +If errors are completely removed by the filter, then the parents will remain unchanged. +} +@benefits{ +* Removing error trees increases the robustness of downstream processors. +* By removing error trees from lists, the relative `src` origins remain the same for +downstream processing. +* Sets of trees in ambiguity clusters may be reduced to singletons, making the ambiguity cluster disappear. +This also improves the robustness of downstream processors. +} +@pitfalls{ +* this algorithm may remove relatively large sub-trees and thus throw away valuable information. +It is much better to use this as a recovery tool that increases the robustness of oblivious downstream processors, +than to start an error repair or auto-complete algorithm. } Tree filterOptionalErrorTrees(Tree x) = visit(x) { case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[*pre, _sep, appl(error(_,_,_),_), *post]) @@ -91,13 +107,25 @@ Tree filterOptionalErrorTrees(Tree x) = visit(x) { @description{ Removing grammatically optional error trees can reduce the number of case distinctions required to make algorithms that process parse trees robust against parse errors. + +The algorithm works bottom-up such that only the smallest possible trees are removed. +After every removal, new error statistics are prepared for passing up to the next level. +If errors are completely removed by the filter, then the parents will remain unchanged. } @benefits{ * this algorithm is more aggressive and more successful in removing error trees then ((filterOptionalErrorTrees)) can be. +* Removing error trees increases the robustness of downstream processors. +* By removing error trees from lists, the relative `src` origins remain the same for +downstream processing. 
+* Sets of trees in ambiguity clusters may be reduced to singletons, making the ambiguity cluster disappear. +This also improves the robustness of downstream processors. } @pitfalls{ -* this algorithm may cut off entire branches which are otherwise fine to extract more information from. +* this algorithm may remove (very) large sub-trees if they contain one error somewhere, and thus throw away valuable information. +It is much better to use this as a recovery tool that increases the robustness of oblivious downstream processors, +than to start an error repair or auto-complete algorithm. + } Tree filterOptionalIndirectErrorTrees(Tree x) = bottom-up visit(addErrorStats(x)) { case t:appl(p:regular(/iter-sep|iter-star-sep/(_,[_])),[*pre, _sep, appl(_,_, erroneous=true), *post]) From c32f3bd229ac1ba389bd2a0ac47d98ba9cdedf61 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Wed, 13 Nov 2024 10:40:47 +0100 Subject: [PATCH 5/6] improved readability of private addStats function --- .../rascalmpl/library/util/ErrorRecovery.rsc | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/org/rascalmpl/library/util/ErrorRecovery.rsc b/src/org/rascalmpl/library/util/ErrorRecovery.rsc index c53019f068..f36f2577e7 100644 --- a/src/org/rascalmpl/library/util/ErrorRecovery.rsc +++ b/src/org/rascalmpl/library/util/ErrorRecovery.rsc @@ -164,10 +164,26 @@ Tree addErrorStats(Tree x) = bottom-up visit(x) { }; @synopsis{Reusable utility for re-computing error statistics per Tree node.} -private Tree addStats(t:appl(prod(_,_,_), args)) = t[skipped = (0 | it + a.skipped | a <- args)][erroneous = (false | it || a.erroneous | a <- args)]; -private Tree addStats(t:appl(skipped(_), args)) = t[skipped = size(args)][erroneous = true]; -private Tree addStats(t:appl(error(_,_,_), args))= t[skipped = (0 | it + a.skipped | a <- args)][erroneous = true]; -private Tree addStats(t:amb(alts)) = t[skipped = (0 | min([it, a.skipped]) | a <- alts)][erroneous = (false | 
it && a.erroneous | a <- alts)]; +private Tree addStats(t:appl(prod(_,_,_), args)) + = t + [skipped = (0 | it + a.skipped | a <- args)] + [erroneous = (false | it || a.erroneous | a <- args)]; + +private Tree addStats(t:appl(skipped(_), args)) + = t + [skipped = size(args)] + [erroneous = true]; + +private Tree addStats(t:appl(error(_,_,_), args)) + = t + [skipped = (0 | it + a.skipped | a <- args)] + [erroneous = true]; + +private Tree addStats(t:amb(alts)) + = t + [skipped = (alts == {} ? 0 : min([a.skipped | a <- alts]))] + [erroneous = (alts != {} | it && a.erroneous | a <- alts)]; + default private Tree addStats(Tree t) = t; @synopsis{Disambiguates error ambiguity clusters by selecting the alternatives with the shortest amount of skipped characters} From 27b64f2a68ba4a678b1f35533de7ec3402f20d46 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Wed, 13 Nov 2024 10:42:37 +0100 Subject: [PATCH 6/6] added comments --- src/org/rascalmpl/library/util/ErrorRecovery.rsc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/org/rascalmpl/library/util/ErrorRecovery.rsc b/src/org/rascalmpl/library/util/ErrorRecovery.rsc index f36f2577e7..cd06eac071 100644 --- a/src/org/rascalmpl/library/util/ErrorRecovery.rsc +++ b/src/org/rascalmpl/library/util/ErrorRecovery.rsc @@ -164,6 +164,13 @@ Tree addErrorStats(Tree x) = bottom-up visit(x) { }; @synopsis{Reusable utility for re-computing error statistics per Tree node.} +@description{ +This function must be applied in a bottom-up manner to make sense. +} +@benefits{ +* different bottom-up algorithms can reuse this function to re-compute +the statistics based on a new state of the tree. 
+} private Tree addStats(t:appl(prod(_,_,_), args)) = t [skipped = (0 | it + a.skipped | a <- args)] @@ -183,7 +190,7 @@ private Tree addStats(t:amb(alts)) = t [skipped = (alts == {} ? 0 : min([a.skipped | a <- alts]))] [erroneous = (alts != {} | it && a.erroneous | a <- alts)]; - + default private Tree addStats(Tree t) = t; @synopsis{Disambiguates error ambiguity clusters by selecting the alternatives with the shortest amount of skipped characters}