Skip to content

Commit

Permalink
regex: Fix negative sets capturing newlines
Browse files Browse the repository at this point in the history
  • Loading branch information
AntonLydike committed Jul 12, 2024
1 parent 1661886 commit d748e7c
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 2 deletions.
6 changes: 4 additions & 2 deletions filecheck/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,5 +75,7 @@ def compile_uops(
elif isinstance(uop, NumSubst):
# we don't do numerical substitutions yet
raise NotImplementedError("Numerical substitutions not supported!")

return re.compile("".join(expr)), captures
try:
return re.compile("".join(expr)), captures
except re.error:
raise CheckError(f"Malformed regex expression: '{''.join(expr)}'", check)
20 changes: 20 additions & 0 deletions filecheck/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,28 @@
"word": r"\w+",
}

# Finds the opening `[^` of a negated bracket expression.
#   group 1: the single non-backslash character in front of the `[`, or the
#            empty string when the `[` sits at the very start of the pattern
#            (so an escaped `\[` is not treated as a set opener).
#   group 2: always empty; it only wraps a negative lookahead that rejects
#            sets whose first item is already a literal backslash + `n`.
NEGATED_SET_WITHOUT_NEWLINES = re.compile(r"([^\\]|^)\[\^((?!\\n))")


def posix_to_python_regex(expr: str) -> str:
"""
We need to translate things like `[:alpha:]` to `[A-Za-z]`, etc.
This also takes care of a little-known fact about the llvm::Regex implementation:
```
enum llvm::Regex::RegexFlags::Newline = 2U
Compile for newline-sensitive matching. With this flag '[^' bracket
expressions and '.' never match newline. A ^ anchor matches the
null string after any newline in the string in addition to its normal
function, and the $ anchor matches the null string before any
newline in the string in addition to its normal function.
```
This bad boy is enabled in all FileCheck cases, meaning we need to also add `\n` to all
negative bracket expressions, otherwise we'll eat *so* many newlines.
LLVM supports this flag, but Python's regex doesn't.
"""
while (match := POSIX_REGEXP_PATTERN.search(expr)) is not None:
Expand All @@ -29,6 +46,9 @@ def posix_to_python_regex(expr: str) -> str:
f"Can't translate posix regex, unknown character set: {match.group(1)}"
)
expr = expr.replace(match.group(0), POSIX_REGEXP_REPLACEMENTS[match.group(1)])

expr = NEGATED_SET_WITHOUT_NEWLINES.sub(r"\1[^\\n\2", expr)

return expr


Expand Down

0 comments on commit d748e7c

Please sign in to comment.