Skip to content

Commit d5ce000

Browse files
authored
Use RegexCaseEquivalence table for case-insensitive backreferences (#67977)
1 parent 9594b9c commit d5ce000

File tree

11 files changed

+190
-114
lines changed

11 files changed

+190
-114
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 6 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -181,27 +181,6 @@ private static void EmitRegexDerivedTypeRunnerFactory(IndentedTextWriter writer,
181181
writer.WriteLine($" /// <summary>Provides the runner that contains the custom logic implementing the specified regular expression.</summary>");
182182
writer.WriteLine($" private sealed class Runner : RegexRunner");
183183
writer.WriteLine($" {{");
184-
if (rm.Tree.Culture != null)
185-
{
186-
// If the RegexTree has Culture set, then we need to emit the _textInfo field that should be used at match time for casing operations.
187-
// Instead of just using the culture set on the tree, we use it to check which behavior corresponds to the culture, and based on that we
188-
// create a TextInfo based on a well-known culture that has the same behavior. This is done in order to ensure that the culture being used at
189-
// runtime will be supported no matter where the code ends up running.
190-
writer.WriteLine($" /// <summary>TextInfo that will be used for Backreference case comparisons.</summary>");
191-
switch (RegexCaseEquivalences.GetRegexBehavior(rm.Tree.Culture))
192-
{
193-
case RegexCaseBehavior.Invariant:
194-
writer.WriteLine($" private readonly TextInfo _textInfo = CultureInfo.InvariantCulture.TextInfo;");
195-
break;
196-
case RegexCaseBehavior.NonTurkish:
197-
writer.WriteLine($" private readonly TextInfo _textInfo = CultureInfo.GetCultureInfo(\"en-US\").TextInfo;");
198-
break;
199-
case RegexCaseBehavior.Turkish:
200-
writer.WriteLine($" private readonly TextInfo _textInfo = CultureInfo.GetCultureInfo(\"tr-TR\").TextInfo;");
201-
break;
202-
}
203-
writer.WriteLine();
204-
}
205184
if (rm.MatchTimeout is null)
206185
{
207186
// We need to emit timeout checks for everything other than the developer explicitly setting Timeout.Infinite.
@@ -1625,38 +1604,14 @@ void EmitWhenHasCapture()
16251604
additionalDeclarations.Add("int matchLength = 0;");
16261605
writer.WriteLine($"matchLength = base.MatchLength({capnum});");
16271606

1628-
bool caseInsensitive = (node.Options & RegexOptions.IgnoreCase) != 0;
1629-
16301607
if ((node.Options & RegexOptions.RightToLeft) == 0)
16311608
{
1632-
if (!caseInsensitive)
1609+
// Validate that the remaining length of the slice is sufficient
1610+
// to possibly match, and then do a SequenceEqual against the matched text.
1611+
writer.WriteLine($"if ({sliceSpan}.Length < matchLength || ");
1612+
using (EmitBlock(writer, $" !inputSpan.Slice(base.MatchIndex({capnum}), matchLength).SequenceEqual({sliceSpan}.Slice(0, matchLength)))"))
16331613
{
1634-
// If we're case-sensitive, we can simply validate that the remaining length of the slice is sufficient
1635-
// to possibly match, and then do a SequenceEqual against the matched text.
1636-
writer.WriteLine($"if ({sliceSpan}.Length < matchLength || ");
1637-
using (EmitBlock(writer, $" !inputSpan.Slice(base.MatchIndex({capnum}), matchLength).SequenceEqual({sliceSpan}.Slice(0, matchLength)))"))
1638-
{
1639-
Goto(doneLabel);
1640-
}
1641-
}
1642-
else
1643-
{
1644-
// For case-insensitive, we have to walk each character individually.
1645-
using (EmitBlock(writer, $"if ({sliceSpan}.Length < matchLength)"))
1646-
{
1647-
Goto(doneLabel);
1648-
}
1649-
writer.WriteLine();
1650-
1651-
additionalDeclarations.Add("int matchIndex = 0;");
1652-
writer.WriteLine($"matchIndex = base.MatchIndex({capnum});");
1653-
using (EmitBlock(writer, $"for (int i = 0; i < matchLength; i++)"))
1654-
{
1655-
using (EmitBlock(writer, $"if (_textInfo.ToLower(inputSpan[matchIndex + i]) != _textInfo.ToLower({sliceSpan}[i]))"))
1656-
{
1657-
Goto(doneLabel);
1658-
}
1659-
}
1614+
Goto(doneLabel);
16601615
}
16611616

16621617
writer.WriteLine();
@@ -1675,9 +1630,7 @@ void EmitWhenHasCapture()
16751630
writer.WriteLine($"matchIndex = base.MatchIndex({capnum});");
16761631
using (EmitBlock(writer, $"for (int i = 0; i < matchLength; i++)"))
16771632
{
1678-
using (EmitBlock(writer, caseInsensitive ?
1679-
$"if (_textInfo.ToLower(inputSpan[matchIndex + i]) != _textInfo.ToLower(inputSpan[pos - matchLength + i]))" :
1680-
$"if (inputSpan[matchIndex + i] != inputSpan[pos - matchLength + i])"))
1633+
using (EmitBlock(writer, $"if (inputSpan[matchIndex + i] != inputSpan[pos - matchLength + i])"))
16811634
{
16821635
Goto(doneLabel);
16831636
}

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ public void Initialize(IncrementalGeneratorInitializationContext context)
6565

6666
// If we're unable to generate a full implementation for this regex, report a diagnostic.
6767
// We'll still output a limited implementation that just caches a new Regex(...).
68-
if (!regexMethod.Tree.Root.SupportsCompilation(out string? reason))
68+
if (!SupportsCodeGeneration(regexMethod.Tree.Root, out string? reason))
6969
{
7070
return (regexMethod, reason, Diagnostic.Create(DiagnosticDescriptors.LimitedSourceGeneration, regexMethod.MethodSyntax.GetLocation()));
7171
}
@@ -275,6 +275,55 @@ public void Initialize(IncrementalGeneratorInitializationContext context)
275275
});
276276
}
277277

278+
// Determines whether the passed-in node supports the code generation strategy based on walking the tree.
279+
// Also returns a human-readable string to explain the reason (it will be emitted by the source generator, hence
280+
// there's no need to localize).
281+
private static bool SupportsCodeGeneration(RegexNode node, [NotNullWhen(false)] out string? reason)
282+
{
283+
if (!node.SupportsCompilation(out reason))
284+
{
285+
// If the pattern doesn't support Compilation, then code generation won't be supported either.
286+
return false;
287+
}
288+
289+
if (HasCaseInsensitiveBackReferences(node))
290+
{
291+
// For case-insensitive patterns, we use our internal Regex case equivalence table when doing character comparisons.
292+
// Most of the use of this table is done at Regex construction time by substituting all characters that are involved in
293+
// case conversions into sets that contain all possible characters that could match. That said, there is still one case
294+
// where you may need to do case-insensitive comparisons at match time which is the case for backreferences. For that reason,
295+
// and given the Regex case equivalence table is internal and can't be called by the source generated emitted type, if
296+
// the pattern contains case-insensitive backreferences, we won't try to create a source generated Regex-derived type.
297+
reason = "the expression contains case-insensitive backreferences which are not supported by the source generator";
298+
return false;
299+
}
300+
301+
// If Compilation is supported and pattern doesn't have case insensitive backreferences, then code generation is supported.
302+
reason = null;
303+
return true;
304+
305+
static bool HasCaseInsensitiveBackReferences(RegexNode node)
306+
{
307+
if (node.Kind is RegexNodeKind.Backreference && (node.Options & RegexOptions.IgnoreCase) != 0)
308+
{
309+
return true;
310+
}
311+
312+
int childCount = node.ChildCount();
313+
for (int i = 0; i < childCount; i++)
314+
{
315+
// This recursion shouldn't hit issues with stack depth since this gets checked after
316+
// SupportsCompilation has ensured that the max depth is not greater than 40.
317+
if (HasCaseInsensitiveBackReferences(node.Child(i)))
318+
{
319+
return true;
320+
}
321+
}
322+
323+
return false;
324+
}
325+
}
326+
278327
/// <summary>Computes a hash of the string.</summary>
279328
/// <remarks>
280329
/// Currently an FNV-1a hash function. The actual algorithm used doesn't matter; just something

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,20 @@ internal sealed class CompiledRegexRunner : RegexRunner
99
{
1010
private readonly ScanDelegate _scanMethod;
1111
/// <summary>This field will only be set if the pattern contains backreferences and has RegexOptions.IgnoreCase</summary>
12-
private readonly TextInfo? _textInfo;
12+
private readonly CultureInfo? _culture;
13+
14+
#pragma warning disable CA1823 // Avoid unused private fields. Justification: Used via reflection to cache the Case behavior if needed.
15+
#pragma warning disable CS0169
16+
private RegexCaseBehavior _caseBehavior;
17+
#pragma warning restore CS0169
18+
#pragma warning restore CA1823
1319

1420
internal delegate void ScanDelegate(RegexRunner runner, ReadOnlySpan<char> text);
1521

1622
public CompiledRegexRunner(ScanDelegate scan, CultureInfo? culture)
1723
{
1824
_scanMethod = scan;
19-
_textInfo = culture?.TextInfo;
25+
_culture = culture;
2026
}
2127

2228
protected internal override void Scan(ReadOnlySpan<char> text)

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCaseBehavior.cs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,16 @@ namespace System.Text.RegularExpressions
1212
/// same character. Note that we don't consider a mapping when the only relationship between 'A' and 'B' is that one is the ToUpper() representation of the other. This
1313
/// is for backwards compatibility since, in Regex, we have only considered ToLower() for case insensitive comparisons. Given the case mappings vary depending on the culture,
1414
/// Regex supports 3 main different behaviors or mappings: Invariant, NonTurkish, and Turkish. This is in order to match the behavior of all .NET supported cultures
15-
/// current behavior for ToLower(). As a side note, there should be no cases where 'A'.ToLower() == 'B' but 'A'.ToLower() != 'B'.ToLower(). This aspect is important since
16-
/// for backreferences we make use a.ToLower() == b.ToLower() for comparisons so if there was such a case then it would lead to inconsistencies between how we handle
17-
/// backreferences vs how we handle other case insensitive comparisons.
15+
/// current behavior for ToLower(). As a side note, there should be no cases where 'A'.ToLower() == 'B' but 'A'.ToLower() != 'B'.ToLower().
1816
/// </summary>
1917
internal enum RegexCaseBehavior
2018
{
19+
/// <summary>
20+
/// This means that the RegexCaseBehavior hasn't been calculated based on a passed in culture yet, so it will need to be calculated before the first
21+
/// equivalence check by calling <see cref="RegexCaseEquivalences.GetRegexBehavior(CultureInfo)"/>.
22+
/// </summary>
23+
NotSet,
24+
2125
/// <summary>
2226
/// Invariant case-mappings are used. This includes all of the common mappings across cultures. This behavior is used when either the user
2327
/// specified <see cref="RegexOptions.CultureInvariant"/> or when the CurrentCulture is <see cref="CultureInfo.InvariantCulture"/>.

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCaseEquivalences.cs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,20 @@ internal static partial class RegexCaseEquivalences
2828
/// culture and will also factor in the current culture in order to handle the special cases which are different between cultures.
2929
/// </summary>
3030
/// <param name="c">The character being analyzed</param>
31-
/// <param name="culture">The <see cref="CultureInfo"/> to be used to determine the equivalences.</param>
31+
/// <param name="culture">The <see cref="CultureInfo"/> to be used to calculate <paramref name="mappingBehavior"/> in case it hasn't been cached.</param>
32+
/// <param name="mappingBehavior">The behavior to be used for case comparisons. If the value hasn't been set yet, it will get initialized in the first lookup.</param>
3233
/// <param name="equivalences">If <paramref name="c"/> is involved in case conversion, then equivalences will contain the
3334
/// span of characters which should be considered equal to <paramref name="c"/> in a case-insensitive comparison.</param>
3435
/// <returns><see langword="true"/> if <paramref name="c"/> is involved in case conversion; otherwise, <see langword="false"/></returns>
35-
public static bool TryFindCaseEquivalencesForCharWithIBehavior(char c, CultureInfo culture, out ReadOnlySpan<char> equivalences)
36+
public static bool TryFindCaseEquivalencesForCharWithIBehavior(char c, CultureInfo culture, ref RegexCaseBehavior mappingBehavior, out ReadOnlySpan<char> equivalences)
3637
{
3738
if ((c | 0x20) == 'i' || (c | 0x01) == '\u0131')
3839
{
39-
RegexCaseBehavior mappingBehavior = GetRegexBehavior(culture);
40+
// If this is the first time that this method is being called then mappingBehavior will be set to default, so we calculate
41+
// the behavior to use and cache the value for future lookups.
42+
if (mappingBehavior == RegexCaseBehavior.NotSet)
43+
mappingBehavior = GetRegexBehavior(culture);
44+
4045
equivalences = c switch
4146
{
4247
// Invariant mappings

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ internal sealed partial class RegexCharClass
276276
private StringBuilder? _categories;
277277
private RegexCharClass? _subtractor;
278278
private bool _negate;
279+
private RegexCaseBehavior _caseBehavior;
279280

280281
#if DEBUG
281282
static RegexCharClass()
@@ -440,7 +441,7 @@ public void AddCaseEquivalences(CultureInfo culture)
440441
(char First, char Last) range = rangeList[i];
441442
if (range.First == range.Last)
442443
{
443-
if (RegexCaseEquivalences.TryFindCaseEquivalencesForCharWithIBehavior(range.First, culture, out ReadOnlySpan<char> equivalences))
444+
if (RegexCaseEquivalences.TryFindCaseEquivalencesForCharWithIBehavior(range.First, culture, ref _caseBehavior, out ReadOnlySpan<char> equivalences))
444445
{
445446
foreach (char equivalence in equivalences)
446447
{
@@ -464,7 +465,7 @@ private void AddCaseEquivalenceRange(char chMin, char chMax, CultureInfo culture
464465
{
465466
for (int i = chMin; i <= chMax; i++)
466467
{
467-
if (RegexCaseEquivalences.TryFindCaseEquivalencesForCharWithIBehavior((char)i, culture, out ReadOnlySpan<char> equivalences))
468+
if (RegexCaseEquivalences.TryFindCaseEquivalencesForCharWithIBehavior((char)i, culture, ref _caseBehavior, out ReadOnlySpan<char> equivalences))
468469
{
469470
foreach (char equivalence in equivalences)
470471
{

0 commit comments

Comments
 (0)