Skip to content

Commit 7dcb289

Browse files
authored
Add \w helper function to RegexCompiler / source generator (#62620)
1 parent 6f40e48 commit 7dcb289

File tree

6 files changed

+143
-43
lines changed

6 files changed

+143
-43
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

+69-17
Original file line numberDiff line numberDiff line change
@@ -227,16 +227,48 @@ private static ImmutableArray<Diagnostic> EmitRegexMethod(IndentedTextWriter wri
227227
writer.WriteLine($" protected override bool FindFirstChar()");
228228
writer.WriteLine($" {{");
229229
writer.Indent += 4;
230-
EmitFindFirstChar(writer, rm, id);
230+
RequiredHelperFunctions requiredHelpers = EmitFindFirstChar(writer, rm, id);
231231
writer.Indent -= 4;
232232
writer.WriteLine($" }}");
233233
writer.WriteLine();
234234
writer.WriteLine($" protected override void Go()");
235235
writer.WriteLine($" {{");
236236
writer.Indent += 4;
237-
EmitGo(writer, rm, id);
237+
requiredHelpers |= EmitGo(writer, rm, id);
238238
writer.Indent -= 4;
239239
writer.WriteLine($" }}");
240+
241+
if ((requiredHelpers & RequiredHelperFunctions.IsWordChar) != 0)
242+
{
243+
writer.WriteLine();
244+
writer.WriteLine($" /// <summary>Determines whether the character is part of the [\\w] set.</summary>");
245+
writer.WriteLine($" [global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]");
246+
writer.WriteLine($" private static bool IsWordChar(char ch)");
247+
writer.WriteLine($" {{");
248+
writer.WriteLine($" global::System.ReadOnlySpan<byte> ascii = new byte[]");
249+
writer.WriteLine($" {{");
250+
writer.WriteLine($" 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,");
251+
writer.WriteLine($" 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07");
252+
writer.WriteLine($" }};");
253+
writer.WriteLine();
254+
writer.WriteLine($" int chDiv8 = ch >> 3;");
255+
writer.WriteLine($" return (uint)chDiv8 < (uint)ascii.Length ?");
256+
writer.WriteLine($" (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :");
257+
writer.WriteLine($" global::System.Globalization.CharUnicodeInfo.GetUnicodeCategory(ch) switch");
258+
writer.WriteLine($" {{");
259+
writer.WriteLine($" global::System.Globalization.UnicodeCategory.UppercaseLetter or");
260+
writer.WriteLine($" global::System.Globalization.UnicodeCategory.LowercaseLetter or");
261+
writer.WriteLine($" global::System.Globalization.UnicodeCategory.TitlecaseLetter or");
262+
writer.WriteLine($" global::System.Globalization.UnicodeCategory.ModifierLetter or");
263+
writer.WriteLine($" global::System.Globalization.UnicodeCategory.OtherLetter or");
264+
writer.WriteLine($" global::System.Globalization.UnicodeCategory.NonSpacingMark or");
265+
writer.WriteLine($" global::System.Globalization.UnicodeCategory.DecimalDigitNumber or");
266+
writer.WriteLine($" global::System.Globalization.UnicodeCategory.ConnectorPunctuation => true,");
267+
writer.WriteLine($" _ => false,");
268+
writer.WriteLine($" }};");
269+
writer.WriteLine($" }}");
270+
}
271+
240272
writer.WriteLine($" }}");
241273
writer.WriteLine($" }}");
242274
writer.WriteLine("}");
@@ -266,11 +298,12 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht)
266298
}
267299

268300
/// <summary>Emits the body of the FindFirstChar override.</summary>
269-
private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, string id)
301+
private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, string id)
270302
{
271303
RegexOptions options = (RegexOptions)rm.Options;
272304
RegexCode code = rm.Code;
273305
bool hasTextInfo = false;
306+
RequiredHelperFunctions requiredHelpers = RequiredHelperFunctions.None;
274307

275308
// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
276309
// To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
@@ -344,7 +377,7 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
344377

345378
// We're done. Patch up any additional declarations.
346379
ReplaceAdditionalDeclarations(writer, additionalDeclarations, additionalDeclarationsPosition, additionalDeclarationsIndent);
347-
return;
380+
return requiredHelpers;
348381

349382
// Emits any anchors. Returns true if the anchor roots any match to a specific location and thus no further
350383
// searching is required; otherwise, false.
@@ -518,7 +551,7 @@ void EmitFixedSet()
518551
for (; setIndex < setsToUse; setIndex++)
519552
{
520553
string spanIndex = $"span[i{(sets[setIndex].Distance > 0 ? $" + {sets[setIndex].Distance}" : "")}]";
521-
string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive, additionalDeclarations);
554+
string charInClassExpr = MatchCharacterClass(hasTextInfo, options, spanIndex, sets[setIndex].Set, sets[setIndex].CaseInsensitive, additionalDeclarations, ref requiredHelpers);
522555

523556
if (setIndex == start)
524557
{
@@ -571,7 +604,7 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or
571604
}
572605

573606
/// <summary>Emits the body of the Go override.</summary>
574-
private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
607+
private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
575608
{
576609
// In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled
577610
// version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via
@@ -599,6 +632,7 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
599632

600633
RegexOptions options = (RegexOptions)rm.Options;
601634
RegexCode code = rm.Code;
635+
RequiredHelperFunctions requiredHelpers = RequiredHelperFunctions.None;
602636

603637
// Helper to define names. Names start unadorned, but as soon as there's repetition,
604638
// they begin to have a numbered suffix.
@@ -622,14 +656,14 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
622656
writer.WriteLine($"int end = start + {(node.Type == RegexNode.Multi ? node.Str!.Length : 1)};");
623657
writer.WriteLine("base.Capture(0, start, end);");
624658
writer.WriteLine("base.runtextpos = end;");
625-
return;
659+
return requiredHelpers;
626660

627661
case RegexNode.Empty:
628662
// This case isn't common in production, but it's very common when first getting started with the
629663
// source generator and seeing what happens as you add more to expressions. When approaching
630664
// it from a learning perspective, this is very common, as it's the empty string you start with.
631665
writer.WriteLine("base.Capture(0, base.runtextpos, base.runtextpos);");
632-
return;
666+
return requiredHelpers;
633667
}
634668

635669
// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
@@ -717,7 +751,7 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
717751
}
718752
}
719753

720-
return;
754+
return requiredHelpers;
721755

722756
// Helper to create a name guaranteed to be unique within the function.
723757
string ReserveName(string prefix)
@@ -1864,7 +1898,7 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset
18641898

18651899
if (node.IsSetFamily)
18661900
{
1867-
expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations)}";
1901+
expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers)}";
18681902
}
18691903
else
18701904
{
@@ -2662,7 +2696,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired =
26622696
string expr = $"{sliceSpan}[{iterationLocal}]";
26632697
if (node.IsSetFamily)
26642698
{
2665-
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations);
2699+
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers);
26662700
}
26672701
else
26682702
{
@@ -2716,7 +2750,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node)
27162750
string expr = $"{sliceSpan}[{sliceStaticPos}]";
27172751
if (node.IsSetFamily)
27182752
{
2719-
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations);
2753+
expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), additionalDeclarations, ref requiredHelpers);
27202754
}
27212755
else
27222756
{
@@ -3070,7 +3104,7 @@ private static bool EmitInitializeCultureForGoIfNecessary(IndentedTextWriter wri
30703104

30713105
private static string ToLowerIfNeeded(bool hasTextInfo, RegexOptions options, string expression, bool toLower) => toLower ? ToLower(hasTextInfo, options, expression) : expression;
30723106

3073-
private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive, HashSet<string>? additionalDeclarations)
3107+
private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options, string chExpr, string charClass, bool caseInsensitive, HashSet<string> additionalDeclarations, ref RequiredHelperFunctions requiredHelpers)
30743108
{
30753109
// We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass),
30763110
// but that call is relatively expensive. Before we fall back to it, we try to optimize
@@ -3097,6 +3131,14 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
30973131

30983132
case RegexCharClass.NotSpaceClass:
30993133
return $"!char.IsWhiteSpace({chExpr})";
3134+
3135+
case RegexCharClass.WordClass:
3136+
requiredHelpers |= RequiredHelperFunctions.IsWordChar;
3137+
return $"IsWordChar({chExpr})";
3138+
3139+
case RegexCharClass.NotWordClass:
3140+
requiredHelpers |= RequiredHelperFunctions.IsWordChar;
3141+
return $"!IsWordChar({chExpr})";
31003142
}
31013143

31023144
// If we're meant to be doing a case-insensitive lookup, and if we're not using the invariant culture,
@@ -3146,11 +3188,11 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
31463188
{
31473189
return $"(({chExpr} | 0x20) == {Literal(setChars[1])})";
31483190
}
3149-
additionalDeclarations?.Add("char ch;");
3191+
additionalDeclarations.Add("char ch;");
31503192
return $"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}))";
31513193

31523194
case 3:
3153-
additionalDeclarations?.Add("char ch;");
3195+
additionalDeclarations.Add("char ch;");
31543196
return (setChars[0] | 0x20) == setChars[1] ?
31553197
$"((((ch = {chExpr}) | 0x20) == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))" :
31563198
$"(((ch = {chExpr}) == {Literal(setChars[0])}) | (ch == {Literal(setChars[1])}) | (ch == {Literal(setChars[2])}))";
@@ -3159,15 +3201,15 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
31593201
if (((setChars[0] | 0x20) == setChars[1]) &&
31603202
((setChars[2] | 0x20) == setChars[3]))
31613203
{
3162-
additionalDeclarations?.Add("char ch;");
3204+
additionalDeclarations.Add("char ch;");
31633205
return $"(((ch = ({chExpr} | 0x20)) == {Literal(setChars[1])}) | (ch == {Literal(setChars[3])}))";
31643206
}
31653207
break;
31663208
}
31673209
}
31683210

31693211
// All options after this point require a ch local.
3170-
additionalDeclarations?.Add("char ch;");
3212+
additionalDeclarations.Add("char ch;");
31713213

31723214
// Analyze the character set more to determine what code to generate.
31733215
RegexCharClass.CharClassAnalysisResults analysis = RegexCharClass.Analyze(charClass);
@@ -3471,5 +3513,15 @@ public void Dispose()
34713513
}
34723514
}
34733515
}
3516+
3517+
/// <summary>Bit flags indicating which additional helpers should be emitted into the regex class.</summary>
3518+
[Flags]
3519+
private enum RequiredHelperFunctions
3520+
{
3521+
/// <summary>No additional functions are required.</summary>
3522+
None,
3523+
/// <summary>The IsWordChar helper is required.</summary>
3524+
IsWordChar
3525+
}
34743526
}
34753527
}

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

+45-13
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,6 @@ internal sealed partial class RegexCharClass
4141
private const short SpaceConst = 100;
4242
private const short NotSpaceConst = -100;
4343

44-
private const char ZeroWidthJoiner = '\u200D';
45-
private const char ZeroWidthNonJoiner = '\u200C';
46-
4744
private const string InternalRegexIgnoreCase = "__InternalRegexIgnoreCase__";
4845
private const string Space = "\x64";
4946
private const string NotSpace = "\uFF9C";
@@ -975,25 +972,59 @@ public static bool IsECMAWordChar(char ch) =>
975972
ch == '_' || // underscore
976973
ch == '\u0130'; // latin capital letter I with dot above
977974

975+
/// <summary>16 bytes, representing the chars 0 through 127, with a 1 for a bit where that char is a word char.</summary>
976+
private static ReadOnlySpan<byte> WordCharAsciiLookup => new byte[]
977+
{
978+
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
979+
0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
980+
};
981+
982+
/// <summary>Determines whether a character is considered a word character for the purposes of testing the \w set.</summary>
978983
public static bool IsWordChar(char ch)
984+
{
985+
// This is the same as IsBoundaryWordChar, except that IsBoundaryWordChar also
986+
// returns true for \u200c and \u200d.
987+
988+
// Fast lookup in our lookup table for ASCII characters. This is purely an optimization, and has the
989+
// same behavior as if we fell through to the switch below (which was actually used to produce the lookup table).
990+
ReadOnlySpan<byte> asciiLookup = WordCharAsciiLookup;
991+
int chDiv8 = ch >> 3;
992+
if ((uint)chDiv8 < (uint)asciiLookup.Length)
993+
{
994+
return (asciiLookup[chDiv8] & (1 << (ch & 0x7))) != 0;
995+
}
996+
997+
// For non-ASCII, fall back to checking the Unicode category.
998+
switch (CharUnicodeInfo.GetUnicodeCategory(ch))
999+
{
1000+
case UnicodeCategory.UppercaseLetter:
1001+
case UnicodeCategory.LowercaseLetter:
1002+
case UnicodeCategory.TitlecaseLetter:
1003+
case UnicodeCategory.ModifierLetter:
1004+
case UnicodeCategory.OtherLetter:
1005+
case UnicodeCategory.NonSpacingMark:
1006+
case UnicodeCategory.DecimalDigitNumber:
1007+
case UnicodeCategory.ConnectorPunctuation:
1008+
return true;
1009+
1010+
default:
1011+
return false;
1012+
}
1013+
}
1014+
1015+
/// <summary>Determines whether a character is considered a word character for the purposes of testing a word character boundary.</summary>
1016+
public static bool IsBoundaryWordChar(char ch)
9791017
{
9801018
// According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
9811019
// RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic
9821020
// values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
9831021
// ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
9841022

985-
// 16 bytes, representing the chars 0 through 127, with a 1 for a bit where that char is a word char
986-
static ReadOnlySpan<byte> AsciiLookup() => new byte[]
987-
{
988-
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
989-
0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
990-
};
991-
9921023
// Fast lookup in our lookup table for ASCII characters. This is purely an optimization, and has the
9931024
// same behavior as if we fell through to the switch below (which was actually used to produce the lookup table).
994-
ReadOnlySpan<byte> asciiLookup = AsciiLookup();
1025+
ReadOnlySpan<byte> asciiLookup = WordCharAsciiLookup;
9951026
int chDiv8 = ch >> 3;
996-
if ((uint)chDiv8 < asciiLookup.Length)
1027+
if ((uint)chDiv8 < (uint)asciiLookup.Length)
9971028
{
9981029
return (asciiLookup[chDiv8] & (1 << (ch & 0x7))) != 0;
9991030
}
@@ -1012,7 +1043,8 @@ public static bool IsWordChar(char ch)
10121043
return true;
10131044

10141045
default:
1015-
return ch == ZeroWidthJoiner || ch == ZeroWidthNonJoiner;
1046+
const char ZeroWidthNonJoiner = '\u200C', ZeroWidthJoiner = '\u200D';
1047+
return ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner;
10161048
}
10171049
}
10181050

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

+13
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ internal abstract class RegexCompiler
3030
private static readonly MethodInfo s_matchLengthMethod = RegexRunnerMethod("MatchLength");
3131
private static readonly MethodInfo s_matchIndexMethod = RegexRunnerMethod("MatchIndex");
3232
private static readonly MethodInfo s_isBoundaryMethod = RegexRunnerMethod("IsBoundary");
33+
private static readonly MethodInfo s_isWordCharMethod = RegexRunnerMethod("IsWordChar");
3334
private static readonly MethodInfo s_isECMABoundaryMethod = RegexRunnerMethod("IsECMABoundary");
3435
private static readonly MethodInfo s_crawlposMethod = RegexRunnerMethod("Crawlpos");
3536
private static readonly MethodInfo s_charInClassMethod = RegexRunnerMethod("CharInClass");
@@ -3529,6 +3530,18 @@ private void EmitMatchCharacterClass(string charClass, bool caseInsensitive)
35293530
Ldc(0);
35303531
Ceq();
35313532
return;
3533+
3534+
case RegexCharClass.WordClass:
3535+
// RegexRunner.IsWordChar(ch)
3536+
Call(s_isWordCharMethod);
3537+
return;
3538+
3539+
case RegexCharClass.NotWordClass:
3540+
// !RegexRunner.IsWordChar(ch)
3541+
Call(s_isWordCharMethod);
3542+
Ldc(0);
3543+
Ceq();
3544+
return;
35323545
}
35333546

35343547
// If we're meant to be doing a case-insensitive lookup, and if we're not using the invariant culture,

0 commit comments

Comments
 (0)