Skip to content

Commit b410984

Browse files
authored
Fix handling of atomic nodes in RegexCompiler / source generator (#66046)
* Fix handling of atomic nodes in RegexCompiler / source generator We weren't properly resetting the stack position, so if we had an atomic group that contained something that backtracked, any backtracking positions left on the stack by that nested construct would then be consumed by a previous backtracking construct and lead to it reading the wrong state. * Fix atomic handling for positive/negative lookaheads as well
1 parent fdbdb9a commit b410984

File tree

3 files changed

+98
-28
lines changed

3 files changed

+98
-28
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 44 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1840,26 +1840,30 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
18401840
Debug.Assert(node.Kind is RegexNodeKind.PositiveLookaround, $"Unexpected type: {node.Kind}");
18411841
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
18421842

1843-
// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
1844-
string originalDoneLabel = doneLabel;
1845-
18461843
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
18471844
string startingPos = ReserveName("positivelookahead_starting_pos");
18481845
writer.WriteLine($"int {startingPos} = pos;");
18491846
writer.WriteLine();
18501847
int startingSliceStaticPos = sliceStaticPos;
18511848

18521849
// Emit the child.
1853-
EmitNode(node.Child(0));
1850+
RegexNode child = node.Child(0);
1851+
if (analysis.MayBacktrack(child))
1852+
{
1853+
// Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
1854+
EmitAtomic(node, null);
1855+
}
1856+
else
1857+
{
1858+
EmitNode(child);
1859+
}
18541860

18551861
// After the child completes successfully, reset the text positions.
18561862
// Do not reset captures, which persist beyond the lookahead.
18571863
writer.WriteLine();
18581864
writer.WriteLine($"pos = {startingPos};");
18591865
SliceInputSpan(writer);
18601866
sliceStaticPos = startingSliceStaticPos;
1861-
1862-
doneLabel = originalDoneLabel;
18631867
}
18641868

18651869
// Emits the code to handle a negative lookahead assertion.
@@ -1868,7 +1872,6 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
18681872
Debug.Assert(node.Kind is RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
18691873
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
18701874

1871-
// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
18721875
string originalDoneLabel = doneLabel;
18731876

18741877
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
@@ -1880,7 +1883,16 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
18801883
doneLabel = negativeLookaheadDoneLabel;
18811884

18821885
// Emit the child.
1883-
EmitNode(node.Child(0));
1886+
RegexNode child = node.Child(0);
1887+
if (analysis.MayBacktrack(child))
1888+
{
1889+
// Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
1890+
EmitAtomic(node, null);
1891+
}
1892+
else
1893+
{
1894+
EmitNode(child);
1895+
}
18841896

18851897
// If the generated code ends up here, it matched the lookahead, which actually
18861898
// means failure for a _negative_ lookahead, so we need to jump to the original done.
@@ -1920,9 +1932,9 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
19201932
Goto(doneLabel);
19211933
return;
19221934

1923-
// Atomic is invisible in the generated source, other than its impact on the targets of jumps
1924-
case RegexNodeKind.Atomic:
1925-
EmitAtomic(node, subsequent);
1935+
// Skip atomic nodes that wrap non-backtracking children; in such a case there's nothing to be made atomic.
1936+
case RegexNodeKind.Atomic when !analysis.MayBacktrack(node.Child(0)):
1937+
EmitNode(node.Child(0));
19261938
return;
19271939

19281940
// Concatenate is a simplification in the node tree so that a series of children can be represented as one.
@@ -2006,6 +2018,10 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
20062018
EmitExpressionConditional(node);
20072019
break;
20082020

2021+
case RegexNodeKind.Atomic when analysis.MayBacktrack(node.Child(0)):
2022+
EmitAtomic(node, subsequent);
2023+
return;
2024+
20092025
case RegexNodeKind.Capture:
20102026
EmitCapture(node, subsequent);
20112027
break;
@@ -2032,14 +2048,27 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
20322048
// Emits the node for an atomic.
20332049
void EmitAtomic(RegexNode node, RegexNode? subsequent)
20342050
{
2035-
Debug.Assert(node.Kind is RegexNodeKind.Atomic, $"Unexpected type: {node.Kind}");
2051+
Debug.Assert(node.Kind is RegexNodeKind.Atomic or RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
20362052
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
2053+
Debug.Assert(analysis.MayBacktrack(node.Child(0)), "Expected child to potentially backtrack");
20372054

2038-
// Atomic simply outputs the code for the child, but it ensures that any done label left
2039-
// set by the child is reset to what it was prior to the node's processing. That way,
2040-
// anything later that tries to jump back won't see labels set inside the atomic.
2055+
// Grab the current done label and the current backtracking position. The purpose of the atomic node
2056+
// is to ensure that nodes after it that might backtrack skip over the atomic, which means after
2057+
// rendering the atomic's child, we need to reset the label so that subsequent backtracking doesn't
2058+
// see any label left set by the atomic's child. We also need to reset the backtracking stack position
2059+
// so that the state on the stack remains consistent.
20412060
string originalDoneLabel = doneLabel;
2061+
additionalDeclarations.Add("int stackpos = 0;");
2062+
string startingStackpos = ReserveName("atomic_stackpos");
2063+
writer.WriteLine($"int {startingStackpos} = stackpos;");
2064+
writer.WriteLine();
2065+
2066+
// Emit the child.
20422067
EmitNode(node.Child(0), subsequent);
2068+
writer.WriteLine();
2069+
2070+
// Reset the stack position and done label.
2071+
writer.WriteLine($"stackpos = {startingStackpos};");
20432072
doneLabel = originalDoneLabel;
20442073
}
20452074

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2015,9 +2015,6 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
20152015
Debug.Assert(node.Kind is RegexNodeKind.PositiveLookaround, $"Unexpected type: {node.Kind}");
20162016
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
20172017

2018-
// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
2019-
Label originalDoneLabel = doneLabel;
2020-
20212018
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
20222019
// startingPos = pos;
20232020
LocalBuilder startingPos = DeclareInt32();
@@ -2026,7 +2023,16 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
20262023
int startingTextSpanPos = sliceStaticPos;
20272024

20282025
// Emit the child.
2029-
EmitNode(node.Child(0));
2026+
RegexNode child = node.Child(0);
2027+
if (analysis.MayBacktrack(child))
2028+
{
2029+
// Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
2030+
EmitAtomic(node, null);
2031+
}
2032+
else
2033+
{
2034+
EmitNode(child);
2035+
}
20302036

20312037
// After the child completes successfully, reset the text positions.
20322038
// Do not reset captures, which persist beyond the lookahead.
@@ -2036,8 +2042,6 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
20362042
Stloc(pos);
20372043
SliceInputSpan();
20382044
sliceStaticPos = startingTextSpanPos;
2039-
2040-
doneLabel = originalDoneLabel;
20412045
}
20422046

20432047
// Emits the code to handle a negative lookahead assertion.
@@ -2046,7 +2050,6 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
20462050
Debug.Assert(node.Kind is RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
20472051
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
20482052

2049-
// Lookarounds are implicitly atomic. Store the original done label to reset at the end.
20502053
Label originalDoneLabel = doneLabel;
20512054

20522055
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
@@ -2060,7 +2063,16 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
20602063
doneLabel = negativeLookaheadDoneLabel;
20612064

20622065
// Emit the child.
2063-
EmitNode(node.Child(0));
2066+
RegexNode child = node.Child(0);
2067+
if (analysis.MayBacktrack(child))
2068+
{
2069+
// Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
2070+
EmitAtomic(node, null);
2071+
}
2072+
else
2073+
{
2074+
EmitNode(child);
2075+
}
20642076

20652077
// If the generated code ends up here, it matched the lookahead, which actually
20662078
// means failure for a _negative_ lookahead, so we need to jump to the original done.
@@ -2204,14 +2216,40 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
22042216
// Emits the node for an atomic.
22052217
void EmitAtomic(RegexNode node, RegexNode? subsequent)
22062218
{
2207-
Debug.Assert(node.Kind is RegexNodeKind.Atomic, $"Unexpected type: {node.Kind}");
2219+
Debug.Assert(node.Kind is RegexNodeKind.Atomic or RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}");
22082220
Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}");
22092221

2210-
// Atomic simply outputs the code for the child, but it ensures that any done label left
2211-
// set by the child is reset to what it was prior to the node's processing. That way,
2212-
// anything later that tries to jump back won't see labels set inside the atomic.
2222+
RegexNode child = node.Child(0);
2223+
2224+
if (!analysis.MayBacktrack(child))
2225+
{
2226+
// If the child has no backtracking, the atomic is a nop and we can just skip it.
2227+
// Note that the source generator equivalent for this is in the top-level EmitNode, in order to avoid
2228+
// outputting some extra comments and scopes. As such formatting isn't a concern for the compiler,
2229+
// the logic is instead here in EmitAtomic.
2230+
EmitNode(child, subsequent);
2231+
return;
2232+
}
2233+
2234+
// Grab the current done label and the current backtracking position. The purpose of the atomic node
2235+
// is to ensure that nodes after it that might backtrack skip over the atomic, which means after
2236+
// rendering the atomic's child, we need to reset the label so that subsequent backtracking doesn't
2237+
// see any label left set by the atomic's child. We also need to reset the backtracking stack position
2238+
// so that the state on the stack remains consistent.
22132239
Label originalDoneLabel = doneLabel;
2214-
EmitNode(node.Child(0), subsequent);
2240+
2241+
// int startingStackpos = stackpos;
2242+
using RentedLocalBuilder startingStackpos = RentInt32Local();
2243+
Ldloc(stackpos);
2244+
Stloc(startingStackpos);
2245+
2246+
// Emit the child.
2247+
EmitNode(child, subsequent);
2248+
2249+
// Reset the stack position and done label.
2250+
// stackpos = startingStackpos;
2251+
Ldloc(startingStackpos);
2252+
Stloc(stackpos);
22152253
doneLabel = originalDoneLabel;
22162254
}
22172255

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,9 @@ public static IEnumerable<object[]> Match_MemberData()
125125
yield return (Case("(?>a*)123"), "aa1234", options, 0, 5, true, "aa123");
126126
yield return (Case("(?>(?>a*))123"), "aa1234", options, 0, 5, true, "aa123");
127127
yield return (Case("(?>a{2,})b"), "aaab", options, 0, 4, true, "aaab");
128+
yield return (Case("[a-z]{0,4}(?>[x-z]*.)(?=xyz1)"), "abcdxyz1", options, 0, 8, true, "abcd");
129+
yield return (Case("[a-z]{0,4}(?=[x-z]*.)(?=cd)"), "abcdxyz1", options, 0, 8, true, "ab");
130+
yield return (Case("[a-z]{0,4}(?![x-z]*[wx])(?=cd)"), "abcdxyz1", options, 0, 8, true, "ab");
128131

129132
// Atomic lazy
130133
yield return (Case("(?>[0-9]+?)abc"), "abc12345abc", options, 3, 8, true, "5abc");

0 commit comments

Comments
 (0)