Skip to content

Commit 2cc926e

Browse files
authored
Improve Regex performance (mainly interpreted) (#449)
* Remove branches from tight inner interpreter loop in FindFirstChar * Tweak RegexBoyerMoore.IsMatch: reduce the checks needed and eliminate unnecessary layers of function calls. * Remove IsSingleton optimization: this doesn't show up in real regexes and is just adding unnecessary complication to the code. No one writes `[a]`... they just write `a`. SingletonInverse is more useful, as you can search for any character except for a specific one, e.g. find the first character that's not a dash. * Cache CharInClass results for ASCII lookups * Improve codegen in a few places (and a little cleanup) * Mark RegexInterpreter.SetOperator aggressive inlining: it's small but isn't getting inlined; it's only called in 4 places, but on hot paths, and inlining it nets roughly an 8% throughput win.
1 parent 457e691 commit 2cc926e

File tree

8 files changed

+269
-156
lines changed

8 files changed

+269
-156
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs

Lines changed: 37 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,10 @@ internal void Reset(Regex regex, string text, int textbeg, int textend, int text
8181
_textend = textend;
8282
_textstart = textstart;
8383

84-
for (int i = 0; i < _matchcount.Length; i++)
84+
int[] matchcount = _matchcount;
85+
for (int i = 0; i < matchcount.Length; i++)
8586
{
86-
_matchcount[i] = 0;
87+
matchcount[i] = 0;
8788
}
8889

8990
_balancing = false;
@@ -170,21 +171,23 @@ public static Match Synchronized(Match inner)
170171
internal void AddMatch(int cap, int start, int len)
171172
{
172173
_matches[cap] ??= new int[2];
174+
int[][] matches = _matches;
173175

174-
int capcount = _matchcount[cap];
176+
int[] matchcount = _matchcount;
177+
int capcount = matchcount[cap];
175178

176-
if (capcount * 2 + 2 > _matches[cap].Length)
179+
if (capcount * 2 + 2 > matches[cap].Length)
177180
{
178-
int[] oldmatches = _matches[cap];
181+
int[] oldmatches = matches[cap];
179182
int[] newmatches = new int[capcount * 8];
180183
for (int j = 0; j < capcount * 2; j++)
181184
newmatches[j] = oldmatches[j];
182-
_matches[cap] = newmatches;
185+
matches[cap] = newmatches;
183186
}
184187

185-
_matches[cap][capcount * 2] = start;
186-
_matches[cap][capcount * 2 + 1] = len;
187-
_matchcount[cap] = capcount + 1;
188+
matches[cap][capcount * 2] = start;
189+
matches[cap][capcount * 2 + 1] = len;
190+
matchcount[cap] = capcount + 1;
188191
}
189192

190193
/*
@@ -204,15 +207,16 @@ internal void BalanceMatch(int cap)
204207

205208
// first see if it is negative, and therefore is a reference to the next available
206209
// capture group for balancing. If it is, we'll reset target to point to that capture.
207-
if (_matches[cap][target] < 0)
208-
target = -3 - _matches[cap][target];
210+
int[][] matches = _matches;
211+
if (matches[cap][target] < 0)
212+
target = -3 - matches[cap][target];
209213

210214
// move back to the previous capture
211215
target -= 2;
212216

213217
// if the previous capture is a reference, just copy that reference to the end. Otherwise, point to it.
214-
if (target >= 0 && _matches[cap][target] < 0)
215-
AddMatch(cap, _matches[cap][target], _matches[cap][target + 1]);
218+
if (target >= 0 && matches[cap][target] < 0)
219+
AddMatch(cap, matches[cap][target], matches[cap][target + 1]);
216220
else
217221
AddMatch(cap, -3 - target, -4 - target /* == -3 - (target + 1) */ );
218222
}
@@ -230,43 +234,52 @@ internal void RemoveMatch(int cap)
230234
/// </summary>
231235
internal bool IsMatched(int cap)
232236
{
233-
return cap < _matchcount.Length && _matchcount[cap] > 0 && _matches[cap][_matchcount[cap] * 2 - 1] != (-3 + 1);
237+
int[] matchcount = _matchcount;
238+
return (uint)cap < (uint)matchcount.Length && matchcount[cap] > 0 && _matches[cap][matchcount[cap] * 2 - 1] != (-3 + 1);
234239
}
235240

236241
/// <summary>
237242
/// Returns the index of the last specified matched group by capnum
238243
/// </summary>
239244
internal int MatchIndex(int cap)
240245
{
241-
int i = _matches[cap][_matchcount[cap] * 2 - 2];
246+
int[][] matches = _matches;
247+
248+
int i = matches[cap][_matchcount[cap] * 2 - 2];
242249
if (i >= 0)
243250
return i;
244251

245-
return _matches[cap][-3 - i];
252+
return matches[cap][-3 - i];
246253
}
247254

248255
/// <summary>
249256
/// Returns the length of the last specified matched group by capnum
250257
/// </summary>
251258
internal int MatchLength(int cap)
252259
{
253-
int i = _matches[cap][_matchcount[cap] * 2 - 1];
260+
int[][] matches = _matches;
261+
262+
int i = matches[cap][_matchcount[cap] * 2 - 1];
254263
if (i >= 0)
255264
return i;
256265

257-
return _matches[cap][-3 - i];
266+
return matches[cap][-3 - i];
258267
}
259268

260269
/// <summary>
261270
/// Tidy the match so that it can be used as an immutable result
262271
/// </summary>
263272
internal void Tidy(int textpos)
264273
{
265-
int[] interval = _matches[0];
274+
int[][] matches = _matches;
275+
276+
int[] interval = matches[0];
266277
Index = interval[0];
267278
Length = interval[1];
268279
_textpos = textpos;
269-
_capcount = _matchcount[0];
280+
281+
int[] matchcount = _matchcount;
282+
_capcount = matchcount[0];
270283

271284
if (_balancing)
272285
{
@@ -276,13 +289,13 @@ internal void Tidy(int textpos)
276289
// until we find a balance capture. Then we check each subsequent entry. If it's a balance
277290
// capture (it's negative), we decrement j. If it's a real capture, we increment j and copy
278291
// it down to the last free position.
279-
for (int cap = 0; cap < _matchcount.Length; cap++)
292+
for (int cap = 0; cap < matchcount.Length; cap++)
280293
{
281294
int limit;
282295
int[] matcharray;
283296

284-
limit = _matchcount[cap] * 2;
285-
matcharray = _matches[cap];
297+
limit = matchcount[cap] * 2;
298+
matcharray = matches[cap];
286299

287300
int i = 0;
288301
int j;
@@ -310,7 +323,7 @@ internal void Tidy(int textpos)
310323
}
311324
}
312325

313-
_matchcount[cap] = j / 2;
326+
matchcount[cap] = j / 2;
314327
}
315328

316329
_balancing = false;

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -205,23 +205,6 @@ public RegexBoyerMoore(string pattern, bool caseInsensitive, bool rightToLeft, C
205205
}
206206
}
207207

208-
private bool MatchPattern(string text, int index)
209-
{
210-
if (CaseInsensitive)
211-
{
212-
if (text.Length - index < Pattern.Length)
213-
{
214-
return false;
215-
}
216-
217-
return (0 == string.Compare(Pattern, 0, text, index, Pattern.Length, CaseInsensitive, _culture));
218-
}
219-
else
220-
{
221-
return (0 == string.CompareOrdinal(Pattern, 0, text, index, Pattern.Length));
222-
}
223-
}
224-
225208
/// <summary>
226209
/// When a regex is anchored, we can do a quick IsMatch test instead of a Scan
227210
/// </summary>
@@ -231,16 +214,21 @@ public bool IsMatch(string text, int index, int beglimit, int endlimit)
231214
{
232215
if (index < beglimit || endlimit - index < Pattern.Length)
233216
return false;
234-
235-
return MatchPattern(text, index);
236217
}
237218
else
238219
{
239220
if (index > endlimit || index - beglimit < Pattern.Length)
240221
return false;
241222

242-
return MatchPattern(text, index - Pattern.Length);
223+
index -= Pattern.Length;
243224
}
225+
226+
if (CaseInsensitive)
227+
{
228+
return string.Compare(Pattern, 0, text, index, Pattern.Length, ignoreCase: true, _culture) == 0;
229+
}
230+
231+
return Pattern.AsSpan().SequenceEqual(text.AsSpan(index, Pattern.Length));
244232
}
245233

246234
/// <summary>

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

Lines changed: 64 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Collections.Generic;
66
using System.Diagnostics;
77
using System.Globalization;
8+
using System.Threading;
89

910
namespace System.Text.RegularExpressions
1011
{
@@ -732,7 +733,7 @@ public static string ConvertOldStringsToClass(string set, string category)
732733
/// </summary>
733734
public static char SingletonChar(string set)
734735
{
735-
Debug.Assert(IsSingleton(set) || IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");
736+
Debug.Assert(IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");
736737
return set[SetStartIndex];
737738
}
738739

@@ -747,14 +748,6 @@ public static bool IsEmpty(string charClass) =>
747748
!IsNegated(charClass) &&
748749
!IsSubtraction(charClass);
749750

750-
/// <summary><c>true</c> if the set contains a single character only</summary>
751-
public static bool IsSingleton(string set) =>
752-
set[CategoryLengthIndex] == 0 &&
753-
set[SetLengthIndex] == 2 &&
754-
!IsNegated(set) &&
755-
!IsSubtraction(set) &&
756-
(set[SetStartIndex] == LastChar || set[SetStartIndex] + 1 == set[SetStartIndex + 1]);
757-
758751
public static bool IsSingletonInverse(string set) =>
759752
set[CategoryLengthIndex] == 0 &&
760753
set[SetLengthIndex] == 2 &&
@@ -823,6 +816,68 @@ public static bool IsWordChar(char ch)
823816
}
824817
}
825818

819+
public static bool CharInClass(char ch, string set, ref int[]? asciiResultCache)
820+
{
821+
// The int[] contains 8 ints, or 256 bits. These are laid out as pairs, where the first bit ("known") in the pair
822+
// says whether the second bit ("value") in the pair has already been computed. Once a value is computed, it's never
823+
// changed, so since Int32s are written/read atomically, we can trust the value bit if we see that the known bit
824+
// has been set. If the known bit hasn't been set, then we proceed to look it up, and then swap in the result.
825+
const int CacheArrayLength = 8;
826+
Debug.Assert(asciiResultCache is null || asciiResultCache.Length == CacheArrayLength, "set lookup should be able to store two bits for each of the first 128 characters");
827+
828+
if (ch < 128)
829+
{
830+
// Lazily-initialize the cache for this set.
831+
if (asciiResultCache is null)
832+
{
833+
Interlocked.CompareExchange(ref asciiResultCache, new int[CacheArrayLength], null);
834+
}
835+
836+
// Determine which int in the lookup array contains the known and value bits for this character,
837+
// and compute their bit numbers.
838+
ref int slot = ref asciiResultCache[ch >> 4];
839+
int knownBit = 1 << ((ch & 0xF) << 1);
840+
int valueBit = knownBit << 1;
841+
842+
// If the value for this bit has already been computed, use it.
843+
int current = slot;
844+
if ((current & knownBit) != 0)
845+
{
846+
return (current & valueBit) != 0;
847+
}
848+
849+
// (After warm-up, we should find ourselves rarely getting here.)
850+
851+
// Otherwise, compute it normally.
852+
bool isInClass = CharInClass(ch, set);
853+
854+
// Determine which bits to write back to the array.
855+
int bitsToSet = knownBit;
856+
if (isInClass)
857+
{
858+
bitsToSet |= valueBit;
859+
}
860+
861+
// "or" the bits back in a thread-safe manner.
862+
while (true)
863+
{
864+
int oldValue = Interlocked.CompareExchange(ref slot, current | bitsToSet, current);
865+
if (oldValue == current)
866+
{
867+
break;
868+
}
869+
870+
current = oldValue;
871+
}
872+
873+
// Return the computed value.
874+
return isInClass;
875+
}
876+
877+
// Non-ASCII. Fall back to computing the answer.
878+
return CharInClassRecursive(ch, set, 0);
879+
}
880+
826881
public static bool CharInClass(char ch, string set) =>
827882
CharInClassRecursive(ch, set, 0);
828883

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
using System.Collections;
1919
using System.Collections.Generic;
2020
using System.Diagnostics;
21-
using System.Globalization;
2221

2322
namespace System.Text.RegularExpressions
2423
{
@@ -91,10 +90,12 @@ internal sealed class RegexCode
9190

9291
public readonly int[] Codes; // the code
9392
public readonly string[] Strings; // the string/set table
93+
public readonly int[]?[] StringsAsciiLookup; // the ASCII lookup table optimization for the sets in Strings
9494
public readonly int TrackCount; // how many instructions use backtracking
9595
public readonly Hashtable? Caps; // mapping of user group numbers -> impl group slots
9696
public readonly int CapSize; // number of impl group slots
9797
public readonly RegexPrefix? FCPrefix; // the set of candidate first characters (may be null)
98+
public int[]? FCPrefixAsciiLookup; // the ASCII lookup table optimization for the set of candidate first characters if there are any
9899
public readonly RegexBoyerMoore? BMPrefix; // the fixed prefix string as a Boyer-Moore machine (may be null)
99100
public readonly int Anchors; // the set of zero-length start anchors (RegexFCD.Bol, etc)
100101
public readonly bool RightToLeft; // true if right to left
@@ -109,6 +110,7 @@ public RegexCode(int[] codes, List<string> stringlist, int trackcount,
109110

110111
Codes = codes;
111112
Strings = stringlist.ToArray();
113+
StringsAsciiLookup = new int[Strings.Length][];
112114
TrackCount = trackcount;
113115
Caps = caps;
114116
CapSize = capsize;

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1163,29 +1163,14 @@ protected void GenerateFindFirstChar()
11631163
CallToLower();
11641164
}
11651165

1166-
if (!RegexCharClass.IsSingleton(_fcPrefix.GetValueOrDefault().Prefix))
1167-
{
1168-
EmitCallCharInClass(_fcPrefix.GetValueOrDefault().Prefix, charInClassV);
1169-
BrtrueFar(l2);
1170-
}
1171-
else
1172-
{
1173-
Ldc(RegexCharClass.SingletonChar(_fcPrefix.GetValueOrDefault().Prefix));
1174-
Beq(l2);
1175-
}
1166+
EmitCallCharInClass(_fcPrefix.GetValueOrDefault().Prefix, charInClassV);
1167+
BrtrueFar(l2);
11761168

11771169
MarkLabel(l5);
11781170

11791171
Ldloc(cV);
11801172
Ldc(0);
1181-
if (!RegexCharClass.IsSingleton(_fcPrefix.GetValueOrDefault().Prefix))
1182-
{
1183-
BgtFar(l1);
1184-
}
1185-
else
1186-
{
1187-
Bgt(l1);
1188-
}
1173+
BgtFar(l1);
11891174

11901175
Ldc(0);
11911176
BrFar(l3);

0 commit comments

Comments
 (0)