@@ -1840,26 +1840,30 @@ void EmitPositiveLookaheadAssertion(RegexNode node)
1840
1840
Debug . Assert ( node . Kind is RegexNodeKind . PositiveLookaround , $ "Unexpected type: { node . Kind } ") ;
1841
1841
Debug . Assert ( node . ChildCount ( ) == 1 , $ "Expected 1 child, found { node . ChildCount ( ) } ") ;
1842
1842
1843
- // Lookarounds are implicitly atomic. Store the original done label to reset at the end.
1844
- string originalDoneLabel = doneLabel ;
1845
-
1846
1843
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
1847
1844
string startingPos = ReserveName ( "positivelookahead_starting_pos" ) ;
1848
1845
writer . WriteLine ( $ "int { startingPos } = pos;") ;
1849
1846
writer . WriteLine ( ) ;
1850
1847
int startingSliceStaticPos = sliceStaticPos ;
1851
1848
1852
1849
// Emit the child.
1853
- EmitNode ( node . Child ( 0 ) ) ;
1850
+ RegexNode child = node . Child ( 0 ) ;
1851
+ if ( analysis . MayBacktrack ( child ) )
1852
+ {
1853
+ // Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
1854
+ EmitAtomic ( node , null ) ;
1855
+ }
1856
+ else
1857
+ {
1858
+ EmitNode ( child ) ;
1859
+ }
1854
1860
1855
1861
// After the child completes successfully, reset the text positions.
1856
1862
// Do not reset captures, which persist beyond the lookahead.
1857
1863
writer . WriteLine ( ) ;
1858
1864
writer . WriteLine ( $ "pos = { startingPos } ;") ;
1859
1865
SliceInputSpan ( writer ) ;
1860
1866
sliceStaticPos = startingSliceStaticPos ;
1861
-
1862
- doneLabel = originalDoneLabel ;
1863
1867
}
1864
1868
1865
1869
// Emits the code to handle a negative lookahead assertion.
@@ -1868,7 +1872,6 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
1868
1872
Debug . Assert ( node . Kind is RegexNodeKind . NegativeLookaround , $ "Unexpected type: { node . Kind } ") ;
1869
1873
Debug . Assert ( node . ChildCount ( ) == 1 , $ "Expected 1 child, found { node . ChildCount ( ) } ") ;
1870
1874
1871
- // Lookarounds are implicitly atomic. Store the original done label to reset at the end.
1872
1875
string originalDoneLabel = doneLabel ;
1873
1876
1874
1877
// Save off pos. We'll need to reset this upon successful completion of the lookahead.
@@ -1880,7 +1883,16 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
1880
1883
doneLabel = negativeLookaheadDoneLabel ;
1881
1884
1882
1885
// Emit the child.
1883
- EmitNode ( node . Child ( 0 ) ) ;
1886
+ RegexNode child = node . Child ( 0 ) ;
1887
+ if ( analysis . MayBacktrack ( child ) )
1888
+ {
1889
+ // Lookarounds are implicitly atomic, so we need to emit the node as atomic if it might backtrack.
1890
+ EmitAtomic ( node , null ) ;
1891
+ }
1892
+ else
1893
+ {
1894
+ EmitNode ( child ) ;
1895
+ }
1884
1896
1885
1897
// If the generated code ends up here, it matched the lookahead, which actually
1886
1898
// means failure for a _negative_ lookahead, so we need to jump to the original done.
@@ -1920,9 +1932,9 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
1920
1932
Goto ( doneLabel ) ;
1921
1933
return ;
1922
1934
1923
- // Atomic is invisible in the generated source, other than its impact on the targets of jumps
1924
- case RegexNodeKind . Atomic :
1925
- EmitAtomic ( node , subsequent ) ;
1935
+ // Skip atomic nodes that wrap non-backtracking children; in such a case there's nothing to be made atomic.
1936
+ case RegexNodeKind . Atomic when ! analysis . MayBacktrack ( node . Child ( 0 ) ) :
1937
+ EmitNode ( node . Child ( 0 ) ) ;
1926
1938
return ;
1927
1939
1928
1940
// Concatenate is a simplification in the node tree so that a series of children can be represented as one.
@@ -2006,6 +2018,10 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
2006
2018
EmitExpressionConditional ( node ) ;
2007
2019
break ;
2008
2020
2021
+ case RegexNodeKind . Atomic when analysis . MayBacktrack ( node . Child ( 0 ) ) :
2022
+ EmitAtomic ( node , subsequent ) ;
2023
+ return ;
2024
+
2009
2025
case RegexNodeKind . Capture :
2010
2026
EmitCapture ( node , subsequent ) ;
2011
2027
break ;
@@ -2032,14 +2048,27 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
2032
2048
// Emits the node for an atomic.
2033
2049
// EmitAtomic, shown here as a diff hunk: lines starting with "-" belong to the removed version, lines
// starting with "+" belong to the added version, and bare numbers are the diff view's old/new line numbers.
// In the added version the method captures the current doneLabel, writes generated code that snapshots
// stackpos, emits the node's single child via EmitNode, writes generated code that restores stackpos,
// and finally restores doneLabel.
//   node:       an Atomic, PositiveLookaround, or NegativeLookaround node with exactly one child.
//   subsequent: forwarded unchanged to EmitNode when emitting the child.
void EmitAtomic ( RegexNode node , RegexNode ? subsequent )
2034
2050
{
2035
- Debug . Assert ( node . Kind is RegexNodeKind . Atomic , $ "Unexpected type: { node . Kind } ") ;
// The kind check is widened because the lookahead emitters in this same change call EmitAtomic with
// their own lookaround node (not its child) when the child may backtrack.
2051
+ Debug . Assert ( node . Kind is RegexNodeKind . Atomic or RegexNodeKind . PositiveLookaround or RegexNodeKind . NegativeLookaround , $ "Unexpected type: { node . Kind } ") ;
2036
2052
Debug . Assert ( node . ChildCount ( ) == 1 , $ "Expected 1 child, found { node . ChildCount ( ) } ") ;
// New precondition: callers are expected to route here only when analysis.MayBacktrack reports that
// the child might backtrack; the visible call sites all check this before calling.
2053
+ Debug . Assert ( analysis . MayBacktrack ( node . Child ( 0 ) ) , "Expected child to potentially backtrack" ) ;
2037
2054
2038
- // Atomic simply outputs the code for the child, but it ensures that any done label left
2039
- // set by the child is reset to what it was prior to the node's processing. That way,
2040
- // anything later that tries to jump back won't see labels set inside the atomic.
2055
+ // Grab the current done label and the current backtracking position. The purpose of the atomic node
2056
+ // is to ensure that nodes after it that might backtrack skip over the atomic, which means after
2057
+ // rendering the atomic's child, we need to reset the label so that subsequent backtracking doesn't
2058
+ // see any label left set by the atomic's child. We also need to reset the backtracking stack position
2059
+ // so that the state on the stack remains consistent.
2041
2060
string originalDoneLabel = doneLabel ;
// Records the "int stackpos = 0;" declaration text in additionalDeclarations; how that collection is
// consumed (and whether repeated adds are de-duplicated) is not visible in this hunk — confirm.
2061
+ additionalDeclarations . Add ( "int stackpos = 0;" ) ;
// Generated code: snapshot stackpos into a local whose name is reserved from the base "atomic_stackpos",
// before any of the child's code runs.
2062
+ string startingStackpos = ReserveName ( "atomic_stackpos" ) ;
2063
+ writer . WriteLine ( $ "int { startingStackpos } = stackpos;") ;
2064
+ writer . WriteLine ( ) ;
2065
+
2066
+ // Emit the child.
2042
2067
EmitNode ( node . Child ( 0 ) , subsequent ) ;
2068
+ writer . WriteLine ( ) ;
2069
+
2070
+ // Reset the stack position and done label.
// Generated code: restore stackpos from the snapshot written before the child.
2071
+ writer . WriteLine ( $ "stackpos = { startingStackpos } ;") ;
// Emitter state (nothing is written to the output here): put doneLabel back to the value captured above.
2043
2072
doneLabel = originalDoneLabel ;
2044
2073
}
2045
2074
0 commit comments