From cde98511cd4649ae712305ad7d6e052966994e63 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Mon, 22 Jul 2024 10:11:40 +0200 Subject: [PATCH 1/8] added a lot of failing tests --- .../lang/rascal/tests/concrete/Character.rsc | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc b/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc index a6b7ae1fbba..66388d4f331 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc @@ -49,6 +49,41 @@ test bool charClassOrderedRanges() = (#[a-z A-Z]).symbol == \char-class([range(6 test bool charClassMergedRanges() = (#[A-Z F-G]).symbol == \char-class([range(65,90)]); test bool charClassExtendedRanges() = (#[A-M N-Z]).symbol == \char-class([range(65,90)]); +test bool asciiEscape() = \char-class([range(0,127)]) := #[\a00-aFF].symbol; +test bool utf16Escape() = \char-class([range(0,65535)]) := #[\u0000-\uFFFF].symbol; +test bool utf24Escape() = \char-class([range(0,1114111)]) := #[\U000000-\U10FFFF].symbol; +test bool highLowSurrogateRange1() = \char-class([range(9312,12991)]) := #[①-㊿].symbol; +test bool highLowSurrogateRange2() = \char-class([range(127829,127829)]) := #[🍕].symbol; +test bool differentEscapesSameResult1() = #[\a00-aFF] == #[\u0000-\u00FF]; +test bool differentEscapesSameResult2() = #[\a00-aFF] == #[\u0000-\u00FF]; + +/* to avoid a known ambiguity */ +alias NotAZ = ![A-Z]; + +test bool unicodeCharacterClassSubtype1() { + Tree t = char(charAt("⑭", 0)); + + if ([①-㊿] circled := t) { + assert [⑭] _ := circled; + assert NotAZ _ := circled; + return true; + } + + return false; +} + +test bool unicodeCharacterClassSubtype2() { + Tree t = char(charAt("🍕", 0)); + + if ([🍕] pizza := t) { + assert [\a00-🍕] _ := pizza; + assert NotAZ _ := pizza; + return true; + } + + return false; +} + // ambiguity in this syntax must be 
resolved first //test bool differenceCC() = (#[a-zA-Z] - [A-Z]).symbol == (#[a-z]).symbol; //test bool unionCC() = (#[a-z] || [A-Z]).symbol == (#[A-Za-z]).symbol; From 9b9285ae89da0862168fe96785904ac4de9a1300 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Mon, 22 Jul 2024 10:37:55 +0200 Subject: [PATCH 2/8] this fixes #2009 --- .../lang/rascal/tests/concrete/Character.rsc | 14 +++--- .../values/parsetrees/SymbolFactory.java | 43 +++++++++---------- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc b/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc index 66388d4f331..271f59dfd59 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc @@ -49,13 +49,13 @@ test bool charClassOrderedRanges() = (#[a-z A-Z]).symbol == \char-class([range(6 test bool charClassMergedRanges() = (#[A-Z F-G]).symbol == \char-class([range(65,90)]); test bool charClassExtendedRanges() = (#[A-M N-Z]).symbol == \char-class([range(65,90)]); -test bool asciiEscape() = \char-class([range(0,127)]) := #[\a00-aFF].symbol; -test bool utf16Escape() = \char-class([range(0,65535)]) := #[\u0000-\uFFFF].symbol; -test bool utf24Escape() = \char-class([range(0,1114111)]) := #[\U000000-\U10FFFF].symbol; -test bool highLowSurrogateRange1() = \char-class([range(9312,12991)]) := #[①-㊿].symbol; -test bool highLowSurrogateRange2() = \char-class([range(127829,127829)]) := #[🍕].symbol; -test bool differentEscapesSameResult1() = #[\a00-aFF] == #[\u0000-\u00FF]; -test bool differentEscapesSameResult2() = #[\a00-aFF] == #[\u0000-\u00FF]; +test bool asciiEscape() = \char-class([range(0,127)]) == #[\a00-\a7F].symbol; +test bool utf16Escape() = \char-class([range(0,65535)]) == #[\u0000-\uFFFF].symbol; +test bool utf24Escape() = \char-class([range(0,1114111)]) == #[\U000000-\U10FFFF].symbol; +test bool highLowSurrogateRange1() 
= \char-class([range(9312,12991)]) == #[①-㊿].symbol; +test bool highLowSurrogateRange2() = \char-class([range(127829,127829)]) == #[🍕].symbol; +test bool differentEscapesSameResult1() = #[\a00-\a7F] == #[\u0000-\u007F]; +test bool differentEscapesSameResult2() = #[\a00-\a7F] == #[\U000000-\U00007F]; /* to avoid a known ambiguity */ alias NotAZ = ![A-Z]; diff --git a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java index 22e96b5b231..bc28bb6d9d0 100644 --- a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java +++ b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java @@ -337,31 +337,30 @@ else if (range.isFromTo()) { private static IValue char2int(Char character) { String s = ((Char.Lexical) character).getString(); - if (s.startsWith("\\")) { - if (s.length() > 1 && java.lang.Character.isDigit(s.charAt(1))) { // octal escape - // TODO - throw new NotYetImplemented("octal escape sequence in character class types"); - } - if (s.length() > 1 && s.charAt(1) == 'u') { // octal escape - // TODO - throw new NotYetImplemented("unicode escape sequence in character class types"); - } - char cha = s.charAt(1); + if (s.matches("\\\\[auU][0-9A-F]+")) { + // ascii escape (a), utf16 escape (u) or utf24 escape (U) + return factory.integer(Integer.parseInt(s.substring(2), 16)); + } + else if (s.startsWith("\\")) { + // builtin escape + int cha = s.codePointAt(1); switch (cha) { - case 't': return factory.integer('\t'); - case 'n': return factory.integer('\n'); - case 'r': return factory.integer('\r'); - case '\"' : return factory.integer('\"'); - case '\'' : return factory.integer('\''); - case '-' : return factory.integer('-'); - case '<' : return factory.integer('<'); - case '>' : return factory.integer('>'); - case '\\' : return factory.integer('\\'); + case 't': return factory.integer('\t'); + case 'n': return factory.integer('\n'); + case 'r': return factory.integer('\r'); + case '\"' : return 
factory.integer('\"'); + case '\'' : return factory.integer('\''); + case '-' : return factory.integer('-'); + case '<' : return factory.integer('<'); + case '>' : return factory.integer('>'); + case '\\' : return factory.integer('\\'); + default: return factory.integer(s.codePointAt(1)); } - s = s.substring(1); } - char cha = s.charAt(0); - return factory.integer(cha); + else { + // just a single character (but possibly two char's) + return factory.integer(s.codePointAt(0)); + } } public static IConstructor charClass(int ch) { From 72af4969efcdf846c89a2f297274dfd48bab68c1 Mon Sep 17 00:00:00 2001 From: "Jurgen J. Vinju" Date: Mon, 22 Jul 2024 10:40:48 +0200 Subject: [PATCH 3/8] simplification --- src/org/rascalmpl/values/parsetrees/SymbolFactory.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java index bc28bb6d9d0..1bea3c95946 100644 --- a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java +++ b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java @@ -354,7 +354,7 @@ else if (s.startsWith("\\")) { case '<' : return factory.integer('<'); case '>' : return factory.integer('>'); case '\\' : return factory.integer('\\'); - default: return factory.integer(s.codePointAt(1)); + default: return factory.integer(cha); } } else { From e38ec4f5ed07be4e5e4dc6113222f34e74b1d29c Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Mon, 22 Jul 2024 11:17:31 +0200 Subject: [PATCH 4/8] added failing tests for literals with escaped characters in them --- .../library/lang/rascal/tests/concrete/Character.rsc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc b/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc index 271f59dfd59..fbfa3793686 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc @@ -84,6 +84,12 @@ test bool unicodeCharacterClassSubtype2() { return false; } +test bool literalAsciiEscape1() = lit("\n") == #"\a0A".symbol; +test bool literalAsciiEscape2() = lit("w") == #"\a77".symbol; +test bool literalUtf16Escape() = lit("\n") == #"\u000A".symbol; +test bool literalUtf24Escape1() = lit("\n") == #"\U00000A".symbol; +test bool literalUtf24Escape2() = lit("🍕") == #"\U01F355".symbol; + // ambiguity in this syntax must be resolved first //test bool differenceCC() = (#[a-zA-Z] - [A-Z]).symbol == (#[a-z]).symbol; //test bool unionCC() = (#[a-z] || [A-Z]).symbol == (#[A-Za-z]).symbol; From 6cf019e3a2fcf58f4375a807f6e2bde82ac510f0 Mon Sep 17 00:00:00 2001 From: "Jurgen J. 
Vinju" Date: Mon, 22 Jul 2024 11:23:54 +0200 Subject: [PATCH 5/8] easy fix for the literals --- .../values/parsetrees/SymbolFactory.java | 69 ++++--------------- 1 file changed, 13 insertions(+), 56 deletions(-) diff --git a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java index 1bea3c95946..f743422c331 100644 --- a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java +++ b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java @@ -13,6 +13,8 @@ *******************************************************************************/ package org.rascalmpl.values.parsetrees; +import java.io.IOException; +import java.io.StringReader; import java.util.List; import org.rascalmpl.ast.CaseInsensitiveStringConstant; @@ -34,6 +36,8 @@ import io.usethesource.vallang.IString; import io.usethesource.vallang.IValue; import io.usethesource.vallang.IValueFactory; +import io.usethesource.vallang.exceptions.FactTypeUseException; +import io.usethesource.vallang.io.StandardTextReader; import org.rascalmpl.values.RascalValueFactory; import org.rascalmpl.values.ValueFactoryFactory; @@ -197,63 +201,16 @@ private static IValue symbolAST2SymbolConstructor(Sym symbol, boolean lex, Strin } private static IValue literal2Symbol(StringConstant sep) { - String lit = ((StringConstant.Lexical) sep).getString(); - StringBuilder builder = new StringBuilder(lit.length()); - - // TODO: did we deal with all escapes here? probably not! 
- for (int i = 1; i < lit.length() - 1; i++) { - if (lit.charAt(i) == '\\') { - i++; - switch (lit.charAt(i)) { - case 'b': - builder.append('\b'); - break; - case 'f': - builder.append('\f'); - break; - case 'n': - builder.append('\n'); - break; - case 't': - builder.append('\t'); - break; - case 'r': - builder.append('\r'); - break; - case '\\': - builder.append('\\'); - break; - case '\"': - builder.append('\"'); - break; - case '>': - builder.append('>'); - break; - case '<': - builder.append('<'); - break; - case '\'': - builder.append('\''); - break; - case 'u': - while (lit.charAt(i++) == 'u'); - builder.append((char) Integer.decode("0x" + lit.substring(i, i+4)).intValue()); - i+=4; - break; - default: - // octal escape - int a = lit.charAt(i++); - int b = lit.charAt(i++); - int c = lit.charAt(i); - builder.append( (char) (100 * a + 10 * b + c)); - } - } - else { - builder.append(lit.charAt(i)); - } + try { + String lit = ((StringConstant.Lexical) sep).getString(); + // this should be the exact notation for string literals in vallang + IValue string = new StandardTextReader().read(factory, new StringReader(lit)); + + return factory.constructor(RascalValueFactory.Symbol_Lit, string); + } + catch (FactTypeUseException | IOException e) { + throw new RuntimeException("Internal error: parsed stringconstant notation does not coincide with vallang stringconstant notation"); } - - return factory.constructor(RascalValueFactory.Symbol_Lit, factory.string(builder.toString())); } private static IValue ciliteral2Symbol(CaseInsensitiveStringConstant constant) { From 0ace42f7d1e261bd715e7f4fc972795710d2382e Mon Sep 17 00:00:00 2001 From: Davy Landman Date: Tue, 27 Aug 2024 14:19:58 +0200 Subject: [PATCH 6/8] Added failing test case --- .../library/lang/rascal/tests/concrete/Character.rsc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc 
b/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc index fbfa3793686..2237cfc9063 100644 --- a/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc +++ b/src/org/rascalmpl/library/lang/rascal/tests/concrete/Character.rsc @@ -51,7 +51,7 @@ test bool charClassExtendedRanges() = (#[A-M N-Z]).symbol == \char-class([range( test bool asciiEscape() = \char-class([range(0,127)]) == #[\a00-\a7F].symbol; test bool utf16Escape() = \char-class([range(0,65535)]) == #[\u0000-\uFFFF].symbol; -test bool utf24Escape() = \char-class([range(0,1114111)]) == #[\U000000-\U10FFFF].symbol; +test bool utf32Escape() = \char-class([range(0,1114111)]) == #[\U000000-\U10FFFF].symbol; test bool highLowSurrogateRange1() = \char-class([range(9312,12991)]) == #[①-㊿].symbol; test bool highLowSurrogateRange2() = \char-class([range(127829,127829)]) == #[🍕].symbol; test bool differentEscapesSameResult1() = #[\a00-\a7F] == #[\u0000-\u007F]; @@ -86,9 +86,12 @@ test bool unicodeCharacterClassSubtype2() { test bool literalAsciiEscape1() = lit("\n") == #"\a0A".symbol; test bool literalAsciiEscape2() = lit("w") == #"\a77".symbol; +test bool literalAsciiEscape3() = lit("\f") == #"\a0C".symbol; +test bool literalAsciiEscape4() = lit("\n") == #"\n".symbol; +test bool literalAsciiEscape5() = lit("\f") == #"\f".symbol; test bool literalUtf16Escape() = lit("\n") == #"\u000A".symbol; -test bool literalUtf24Escape1() = lit("\n") == #"\U00000A".symbol; -test bool literalUtf24Escape2() = lit("🍕") == #"\U01F355".symbol; +test bool literalUtf32Escape1() = lit("\n") == #"\U00000A".symbol; +test bool literalUtf32Escape2() = lit("🍕") == #"\U01F355".symbol; // ambiguity in this syntax must be resolved first //test bool differenceCC() = (#[a-zA-Z] - [A-Z]).symbol == (#[a-z]).symbol; From 4c93b8fe9a09a6b920e86a44b3341c3f6541bc9a Mon Sep 17 00:00:00 2001 From: Davy Landman Date: Tue, 27 Aug 2024 14:43:21 +0200 Subject: [PATCH 7/8] Improved logic for char escapes --- 
.../values/parsetrees/SymbolFactory.java | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java index f743422c331..9e6a47f49bc 100644 --- a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java +++ b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java @@ -294,24 +294,22 @@ else if (range.isFromTo()) { private static IValue char2int(Char character) { String s = ((Char.Lexical) character).getString(); - if (s.matches("\\\\[auU][0-9A-F]+")) { - // ascii escape (a), utf16 escape (u) or utf24 escape (U) - return factory.integer(Integer.parseInt(s.substring(2), 16)); - } - else if (s.startsWith("\\")) { + if (s.startsWith("\\")) { // builtin escape int cha = s.codePointAt(1); + if (cha == 'a' | cha == 'u' | cha == 'U') { + if (s.matches("\\\\[auU][0-9A-Fa-f]+")) { + // ascii escape (a), utf16 escape (u) or utf32 escape (U) + return factory.integer(Integer.parseInt(s.substring(2), 16)); + } + } switch (cha) { case 't': return factory.integer('\t'); case 'n': return factory.integer('\n'); case 'r': return factory.integer('\r'); - case '\"' : return factory.integer('\"'); - case '\'' : return factory.integer('\''); - case '-' : return factory.integer('-'); - case '<' : return factory.integer('<'); - case '>' : return factory.integer('>'); - case '\\' : return factory.integer('\\'); - default: return factory.integer(cha); + case 'f': return factory.integer('\f'); + case 'b': return factory.integer('\b'); + default: return factory.integer(cha); //fallback is just the character that's escaped } } else { From 1d2d2dca95c8edbf841e7c93c599119ae509d126 Mon Sep 17 00:00:00 2001 From: Davy Landman Date: Tue, 27 Aug 2024 15:07:05 +0200 Subject: [PATCH 8/8] Improved logic for char escapes --- .../values/parsetrees/SymbolFactory.java | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git
a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java index 9e6a47f49bc..a77ab2de799 100644 --- a/src/org/rascalmpl/values/parsetrees/SymbolFactory.java +++ b/src/org/rascalmpl/values/parsetrees/SymbolFactory.java @@ -16,6 +16,7 @@ import java.io.IOException; import java.io.StringReader; import java.util.List; +import java.util.regex.Pattern; import org.rascalmpl.ast.CaseInsensitiveStringConstant; import org.rascalmpl.ast.Char; @@ -292,18 +293,24 @@ else if (range.isFromTo()) { return result.done(); } + private static final Pattern IS_UNICODE_ESCAPE = Pattern.compile("\\\\[auU][0-9A-Fa-f]+"); + private static IValue char2int(Char character) { String s = ((Char.Lexical) character).getString(); if (s.startsWith("\\")) { // builtin escape int cha = s.codePointAt(1); - if (cha == 'a' | cha == 'u' | cha == 'U') { - if (s.matches("\\\\[auU][0-9A-Fa-f]+")) { - // ascii escape (a), utf16 escape (u) or utf32 escape (U) - return factory.integer(Integer.parseInt(s.substring(2), 16)); - } - } switch (cha) { + case 'a': // fallthrough + case 'u': // fallthrough + case 'U': + if (IS_UNICODE_ESCAPE.matcher(s).matches()) { + // ascii escape (a), utf16 escape (u) or utf32 escape (U) + return factory.integer(Integer.parseInt(s.substring(2), 16)); + } + else { + return factory.integer(cha); + } case 't': return factory.integer('\t'); case 'n': return factory.integer('\n'); case 'r': return factory.integer('\r');