@@ -227,16 +227,48 @@ private static ImmutableArray<Diagnostic> EmitRegexMethod(IndentedTextWriter wri
227
227
writer . WriteLine ( $ " protected override bool FindFirstChar()") ;
228
228
writer . WriteLine ( $ " {{") ;
229
229
writer . Indent += 4 ;
230
- EmitFindFirstChar ( writer , rm , id ) ;
230
+ RequiredHelperFunctions requiredHelpers = EmitFindFirstChar ( writer , rm , id ) ;
231
231
writer . Indent -= 4 ;
232
232
writer . WriteLine ( $ " }}") ;
233
233
writer . WriteLine ( ) ;
234
234
writer . WriteLine ( $ " protected override void Go()") ;
235
235
writer . WriteLine ( $ " {{") ;
236
236
writer . Indent += 4 ;
237
- EmitGo ( writer , rm , id ) ;
237
+ requiredHelpers |= EmitGo ( writer , rm , id ) ;
238
238
writer . Indent -= 4 ;
239
239
writer . WriteLine ( $ " }}") ;
240
+
241
+ if ( ( requiredHelpers & RequiredHelperFunctions . IsWordChar ) != 0 )
242
+ {
243
+ writer . WriteLine ( ) ;
244
+ writer . WriteLine ( $ " /// <summary>Determines whether the character is part of the [\\ w] set.</summary>") ;
245
+ writer . WriteLine ( $ " [global::System.Runtime.CompilerServices.MethodImpl(global::System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]") ;
246
+ writer . WriteLine ( $ " private static bool IsWordChar(char ch)") ;
247
+ writer . WriteLine ( $ " {{") ;
248
+ writer . WriteLine ( $ " global::System.ReadOnlySpan<byte> ascii = new byte[]") ;
249
+ writer . WriteLine ( $ " {{") ;
250
+ writer . WriteLine ( $ " 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,") ;
251
+ writer . WriteLine ( $ " 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07") ;
252
+ writer . WriteLine ( $ " }};") ;
253
+ writer . WriteLine ( ) ;
254
+ writer . WriteLine ( $ " int chDiv8 = ch >> 3;") ;
255
+ writer . WriteLine ( $ " return (uint)chDiv8 < (uint)ascii.Length ?") ;
256
+ writer . WriteLine ( $ " (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :") ;
257
+ writer . WriteLine ( $ " global::System.Globalization.CharUnicodeInfo.GetUnicodeCategory(ch) switch") ;
258
+ writer . WriteLine ( $ " {{") ;
259
+ writer . WriteLine ( $ " global::System.Globalization.UnicodeCategory.UppercaseLetter or") ;
260
+ writer . WriteLine ( $ " global::System.Globalization.UnicodeCategory.LowercaseLetter or") ;
261
+ writer . WriteLine ( $ " global::System.Globalization.UnicodeCategory.TitlecaseLetter or") ;
262
+ writer . WriteLine ( $ " global::System.Globalization.UnicodeCategory.ModifierLetter or") ;
263
+ writer . WriteLine ( $ " global::System.Globalization.UnicodeCategory.OtherLetter or") ;
264
+ writer . WriteLine ( $ " global::System.Globalization.UnicodeCategory.NonSpacingMark or") ;
265
+ writer . WriteLine ( $ " global::System.Globalization.UnicodeCategory.DecimalDigitNumber or") ;
266
+ writer . WriteLine ( $ " global::System.Globalization.UnicodeCategory.ConnectorPunctuation => true,") ;
267
+ writer . WriteLine ( $ " _ => false,") ;
268
+ writer . WriteLine ( $ " }};") ;
269
+ writer . WriteLine ( $ " }}") ;
270
+ }
271
+
240
272
writer . WriteLine ( $ " }}") ;
241
273
writer . WriteLine ( $ " }}") ;
242
274
writer . WriteLine ( "}" ) ;
@@ -266,11 +298,12 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht)
266
298
}
267
299
268
300
/// <summary>Emits the body of the FindFirstChar override.</summary>
269
- private static void EmitFindFirstChar ( IndentedTextWriter writer , RegexMethod rm , string id )
301
+ private static RequiredHelperFunctions EmitFindFirstChar ( IndentedTextWriter writer , RegexMethod rm , string id )
270
302
{
271
303
RegexOptions options = ( RegexOptions ) rm . Options ;
272
304
RegexCode code = rm . Code ;
273
305
bool hasTextInfo = false ;
306
+ RequiredHelperFunctions requiredHelpers = RequiredHelperFunctions . None ;
274
307
275
308
// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
276
309
// To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
@@ -344,7 +377,7 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
344
377
345
378
// We're done. Patch up any additional declarations.
346
379
ReplaceAdditionalDeclarations ( writer , additionalDeclarations , additionalDeclarationsPosition , additionalDeclarationsIndent ) ;
347
- return ;
380
+ return requiredHelpers ;
348
381
349
382
// Emits any anchors. Returns true if the anchor roots any match to a specific location and thus no further
350
383
// searching is required; otherwise, false.
@@ -518,7 +551,7 @@ void EmitFixedSet()
518
551
for ( ; setIndex < setsToUse ; setIndex ++ )
519
552
{
520
553
string spanIndex = $ "span[i{ ( sets [ setIndex ] . Distance > 0 ? $ " + { sets [ setIndex ] . Distance } " : "" ) } ]";
521
- string charInClassExpr = MatchCharacterClass ( hasTextInfo , options , spanIndex , sets [ setIndex ] . Set , sets [ setIndex ] . CaseInsensitive , additionalDeclarations ) ;
554
+ string charInClassExpr = MatchCharacterClass ( hasTextInfo , options , spanIndex , sets [ setIndex ] . Set , sets [ setIndex ] . CaseInsensitive , additionalDeclarations , ref requiredHelpers ) ;
522
555
523
556
if ( setIndex == start )
524
557
{
@@ -571,7 +604,7 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or
571
604
}
572
605
573
606
/// <summary>Emits the body of the Go override.</summary>
574
- private static void EmitGo ( IndentedTextWriter writer , RegexMethod rm , string id )
607
+ private static RequiredHelperFunctions EmitGo ( IndentedTextWriter writer , RegexMethod rm , string id )
575
608
{
576
609
// In .NET Framework and up through .NET Core 3.1, the code generated for RegexOptions.Compiled was effectively an unrolled
577
610
// version of what RegexInterpreter would process. The RegexNode tree would be turned into a series of opcodes via
@@ -599,6 +632,7 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
599
632
600
633
RegexOptions options = ( RegexOptions ) rm . Options ;
601
634
RegexCode code = rm . Code ;
635
+ RequiredHelperFunctions requiredHelpers = RequiredHelperFunctions . None ;
602
636
603
637
// Helper to define names. Names start unadorned, but as soon as there's repetition,
604
638
// they begin to have a numbered suffix.
@@ -622,14 +656,14 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
622
656
writer . WriteLine ( $ "int end = start + { ( node . Type == RegexNode . Multi ? node . Str ! . Length : 1 ) } ;") ;
623
657
writer . WriteLine ( "base.Capture(0, start, end);" ) ;
624
658
writer . WriteLine ( "base.runtextpos = end;" ) ;
625
- return ;
659
+ return requiredHelpers ;
626
660
627
661
case RegexNode . Empty :
628
662
// This case isn't common in production, but it's very common when first getting started with the
629
663
// source generator and seeing what happens as you add more to expressions. When approaching
630
664
// it from a learning perspective, this is very common, as it's the empty string you start with.
631
665
writer . WriteLine ( "base.Capture(0, base.runtextpos, base.runtextpos);" ) ;
632
- return ;
666
+ return requiredHelpers ;
633
667
}
634
668
635
669
// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
@@ -717,7 +751,7 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
717
751
}
718
752
}
719
753
720
- return ;
754
+ return requiredHelpers ;
721
755
722
756
// Helper to create a name guaranteed to be unique within the function.
723
757
string ReserveName ( string prefix )
@@ -1864,7 +1898,7 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset
1864
1898
1865
1899
if ( node . IsSetFamily )
1866
1900
{
1867
- expr = $ "!{ MatchCharacterClass ( hasTextInfo , options , expr , node . Str ! , IsCaseInsensitive ( node ) , additionalDeclarations ) } ";
1901
+ expr = $ "!{ MatchCharacterClass ( hasTextInfo , options , expr , node . Str ! , IsCaseInsensitive ( node ) , additionalDeclarations , ref requiredHelpers ) } ";
1868
1902
}
1869
1903
else
1870
1904
{
@@ -2662,7 +2696,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired =
2662
2696
string expr = $ "{ sliceSpan } [{ iterationLocal } ]";
2663
2697
if ( node . IsSetFamily )
2664
2698
{
2665
- expr = MatchCharacterClass ( hasTextInfo , options , expr , node . Str ! , IsCaseInsensitive ( node ) , additionalDeclarations ) ;
2699
+ expr = MatchCharacterClass ( hasTextInfo , options , expr , node . Str ! , IsCaseInsensitive ( node ) , additionalDeclarations , ref requiredHelpers ) ;
2666
2700
}
2667
2701
else
2668
2702
{
@@ -2716,7 +2750,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node)
2716
2750
string expr = $ "{ sliceSpan } [{ sliceStaticPos } ]";
2717
2751
if ( node . IsSetFamily )
2718
2752
{
2719
- expr = MatchCharacterClass ( hasTextInfo , options , expr , node . Str ! , IsCaseInsensitive ( node ) , additionalDeclarations ) ;
2753
+ expr = MatchCharacterClass ( hasTextInfo , options , expr , node . Str ! , IsCaseInsensitive ( node ) , additionalDeclarations , ref requiredHelpers ) ;
2720
2754
}
2721
2755
else
2722
2756
{
@@ -3070,7 +3104,7 @@ private static bool EmitInitializeCultureForGoIfNecessary(IndentedTextWriter wri
3070
3104
3071
3105
private static string ToLowerIfNeeded ( bool hasTextInfo , RegexOptions options , string expression , bool toLower ) => toLower ? ToLower ( hasTextInfo , options , expression ) : expression ;
3072
3106
3073
- private static string MatchCharacterClass ( bool hasTextInfo , RegexOptions options , string chExpr , string charClass , bool caseInsensitive , HashSet < string > ? additionalDeclarations )
3107
+ private static string MatchCharacterClass ( bool hasTextInfo , RegexOptions options , string chExpr , string charClass , bool caseInsensitive , HashSet < string > additionalDeclarations , ref RequiredHelperFunctions requiredHelpers )
3074
3108
{
3075
3109
// We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass),
3076
3110
// but that call is relatively expensive. Before we fall back to it, we try to optimize
@@ -3097,6 +3131,14 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
3097
3131
3098
3132
case RegexCharClass . NotSpaceClass :
3099
3133
return $ "!char.IsWhiteSpace({ chExpr } )";
3134
+
3135
+ case RegexCharClass . WordClass :
3136
+ requiredHelpers |= RequiredHelperFunctions . IsWordChar ;
3137
+ return $ "IsWordChar({ chExpr } )";
3138
+
3139
+ case RegexCharClass . NotWordClass :
3140
+ requiredHelpers |= RequiredHelperFunctions . IsWordChar ;
3141
+ return $ "!IsWordChar({ chExpr } )";
3100
3142
}
3101
3143
3102
3144
// If we're meant to be doing a case-insensitive lookup, and if we're not using the invariant culture,
@@ -3146,11 +3188,11 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
3146
3188
{
3147
3189
return $ "(({ chExpr } | 0x20) == { Literal ( setChars [ 1 ] ) } )";
3148
3190
}
3149
- additionalDeclarations ? . Add ( "char ch;" ) ;
3191
+ additionalDeclarations . Add ( "char ch;" ) ;
3150
3192
return $ "(((ch = { chExpr } ) == { Literal ( setChars [ 0 ] ) } ) | (ch == { Literal ( setChars [ 1 ] ) } ))";
3151
3193
3152
3194
case 3 :
3153
- additionalDeclarations ? . Add ( "char ch;" ) ;
3195
+ additionalDeclarations . Add ( "char ch;" ) ;
3154
3196
return ( setChars [ 0 ] | 0x20 ) == setChars [ 1 ] ?
3155
3197
$ "((((ch = { chExpr } ) | 0x20) == { Literal ( setChars [ 1 ] ) } ) | (ch == { Literal ( setChars [ 2 ] ) } ))" :
3156
3198
$ "(((ch = { chExpr } ) == { Literal ( setChars [ 0 ] ) } ) | (ch == { Literal ( setChars [ 1 ] ) } ) | (ch == { Literal ( setChars [ 2 ] ) } ))";
@@ -3159,15 +3201,15 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options
3159
3201
if ( ( ( setChars [ 0 ] | 0x20 ) == setChars [ 1 ] ) &&
3160
3202
( ( setChars [ 2 ] | 0x20 ) == setChars [ 3 ] ) )
3161
3203
{
3162
- additionalDeclarations ? . Add ( "char ch;" ) ;
3204
+ additionalDeclarations . Add ( "char ch;" ) ;
3163
3205
return $ "(((ch = ({ chExpr } | 0x20)) == { Literal ( setChars [ 1 ] ) } ) | (ch == { Literal ( setChars [ 3 ] ) } ))";
3164
3206
}
3165
3207
break ;
3166
3208
}
3167
3209
}
3168
3210
3169
3211
// All options after this point require a ch local.
3170
- additionalDeclarations ? . Add ( "char ch;" ) ;
3212
+ additionalDeclarations . Add ( "char ch;" ) ;
3171
3213
3172
3214
// Analyze the character set more to determine what code to generate.
3173
3215
RegexCharClass . CharClassAnalysisResults analysis = RegexCharClass . Analyze ( charClass ) ;
@@ -3471,5 +3513,15 @@ public void Dispose()
3471
3513
}
3472
3514
}
3473
3515
}
3516
+
3517
/// <summary>Bit flags indicating which additional helpers should be emitted into the regex class.</summary>
/// <remarks>
/// Values are accumulated with <c>|=</c> while the method bodies are emitted and tested with <c>&amp;</c>
/// afterwards, so every member other than <see cref="None"/> must occupy its own distinct bit.
/// The values are spelled out explicitly because implicit enum numbering is sequential (0, 1, 2, 3, ...),
/// which would make a later member overlap earlier flags.
/// </remarks>
[Flags]
private enum RequiredHelperFunctions
{
    /// <summary>No additional functions are required.</summary>
    None = 0,

    /// <summary>The IsWordChar helper is required.</summary>
    IsWordChar = 1 << 0,

    // When adding a helper, use the next free bit: 1 << 1, 1 << 2, ...
}
3474
3526
}
3475
3527
}
0 commit comments