Skip to content

Commit 6bcde03

Browse files
committed
syntax: reject '(?-u)\W' when UTF-8 mode is enabled
When Unicode mode is disabled (i.e., (?-u)), the Perl character classes (\w, \d and \s) revert to their ASCII definitions. The negated forms of these classes are also derived from their ASCII definitions, which means that they may match bytes outside of ASCII and thus possibly invalid UTF-8. For this reason, when the translator is configured to only produce HIR that matches valid UTF-8, '(?-u)\W' should be rejected.

Previously, it was not being rejected, which could lead to matches whose offsets split codepoints, and thus to panics when those match offsets are used to slice a string. For example, this code

    fn main() {
        let re = regex::Regex::new(r"(?-u)\W").unwrap();
        let haystack = "☃";
        if let Some(m) = re.find(haystack) {
            println!("{:?}", &haystack[m.range()]);
        }
    }

panics with

    byte index 1 is not a char boundary; it is inside '☃' (bytes 0..3) of `☃`

That is, it reports a match at 0..1, which is technically correct, but the regex itself should have been rejected in the first place since the top-level Regex API always has UTF-8 mode enabled.

Also, many of the replacement tests were using '(?-u)\W' (or similar) for some reason. I'm not sure why, so I just removed the '(?-u)' to make those tests pass. Whether Unicode is enabled or not doesn't seem to be an interesting detail for those tests. (All haystacks and replacements appear to be ASCII.)

Fixes #895. Partially addresses #738.
1 parent ae5065b commit 6bcde03

File tree

2 files changed

+92
-40
lines changed

2 files changed

+92
-40
lines changed

regex-syntax/src/hir/translate.rs

+84-11
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
305305
let hcls = hir::Class::Unicode(cls);
306306
self.push(HirFrame::Expr(Hir::class(hcls)));
307307
} else {
308-
let cls = self.hir_perl_byte_class(x);
308+
let cls = self.hir_perl_byte_class(x)?;
309309
let hcls = hir::Class::Bytes(cls);
310310
self.push(HirFrame::Expr(Hir::class(hcls)));
311311
}
@@ -445,7 +445,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
445445
cls.union(&xcls);
446446
self.push(HirFrame::ClassUnicode(cls));
447447
} else {
448-
let xcls = self.hir_perl_byte_class(x);
448+
let xcls = self.hir_perl_byte_class(x)?;
449449
let mut cls = self.pop().unwrap().unwrap_class_bytes();
450450
cls.union(&xcls);
451451
self.push(HirFrame::ClassBytes(cls));
@@ -879,7 +879,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
879879
fn hir_perl_byte_class(
880880
&self,
881881
ast_class: &ast::ClassPerl,
882-
) -> hir::ClassBytes {
882+
) -> Result<hir::ClassBytes> {
883883
use crate::ast::ClassPerlKind::*;
884884

885885
assert!(!self.flags().unicode());
@@ -893,7 +893,13 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
893893
if ast_class.negated {
894894
class.negate();
895895
}
896-
class
896+
// Negating a Perl byte class is likely to cause it to match invalid
897+
// UTF-8. That's only OK if the translator is configured to allow such
898+
// things.
899+
if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
900+
return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
901+
}
902+
Ok(class)
897903
}
898904

899905
/// Converts the given Unicode specific error to an HIR translation error.
@@ -1971,7 +1977,7 @@ mod tests {
19711977

19721978
#[test]
19731979
#[cfg(feature = "unicode-perl")]
1974-
fn class_perl() {
1980+
fn class_perl_unicode() {
19751981
// Unicode
19761982
assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
19771983
assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
@@ -2011,7 +2017,10 @@ mod tests {
20112017
);
20122018
#[cfg(feature = "unicode-case")]
20132019
assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
2020+
}
20142021

2022+
#[test]
2023+
fn class_perl_ascii() {
20152024
// ASCII only
20162025
assert_eq!(
20172026
t(r"(?-u)\d"),
@@ -2040,29 +2049,93 @@ mod tests {
20402049

20412050
// ASCII only, negated
20422051
assert_eq!(
2043-
t(r"(?-u)\D"),
2052+
t_bytes(r"(?-u)\D"),
20442053
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
20452054
);
20462055
assert_eq!(
2047-
t(r"(?-u)\S"),
2056+
t_bytes(r"(?-u)\S"),
20482057
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
20492058
);
20502059
assert_eq!(
2051-
t(r"(?-u)\W"),
2060+
t_bytes(r"(?-u)\W"),
20522061
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
20532062
);
20542063
assert_eq!(
2055-
t(r"(?i-u)\D"),
2064+
t_bytes(r"(?i-u)\D"),
20562065
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
20572066
);
20582067
assert_eq!(
2059-
t(r"(?i-u)\S"),
2068+
t_bytes(r"(?i-u)\S"),
20602069
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
20612070
);
20622071
assert_eq!(
2063-
t(r"(?i-u)\W"),
2072+
t_bytes(r"(?i-u)\W"),
20642073
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
20652074
);
2075+
2076+
// ASCII only, negated, with UTF-8 mode enabled.
2077+
// In this case, negating any Perl class results in an error because
2078+
// all such classes can match invalid UTF-8.
2079+
assert_eq!(
2080+
t_err(r"(?-u)\D"),
2081+
TestError {
2082+
kind: hir::ErrorKind::InvalidUtf8,
2083+
span: Span::new(
2084+
Position::new(5, 1, 6),
2085+
Position::new(7, 1, 8),
2086+
),
2087+
},
2088+
);
2089+
assert_eq!(
2090+
t_err(r"(?-u)\S"),
2091+
TestError {
2092+
kind: hir::ErrorKind::InvalidUtf8,
2093+
span: Span::new(
2094+
Position::new(5, 1, 6),
2095+
Position::new(7, 1, 8),
2096+
),
2097+
},
2098+
);
2099+
assert_eq!(
2100+
t_err(r"(?-u)\W"),
2101+
TestError {
2102+
kind: hir::ErrorKind::InvalidUtf8,
2103+
span: Span::new(
2104+
Position::new(5, 1, 6),
2105+
Position::new(7, 1, 8),
2106+
),
2107+
},
2108+
);
2109+
assert_eq!(
2110+
t_err(r"(?i-u)\D"),
2111+
TestError {
2112+
kind: hir::ErrorKind::InvalidUtf8,
2113+
span: Span::new(
2114+
Position::new(6, 1, 7),
2115+
Position::new(8, 1, 9),
2116+
),
2117+
},
2118+
);
2119+
assert_eq!(
2120+
t_err(r"(?i-u)\S"),
2121+
TestError {
2122+
kind: hir::ErrorKind::InvalidUtf8,
2123+
span: Span::new(
2124+
Position::new(6, 1, 7),
2125+
Position::new(8, 1, 9),
2126+
),
2127+
},
2128+
);
2129+
assert_eq!(
2130+
t_err(r"(?i-u)\W"),
2131+
TestError {
2132+
kind: hir::ErrorKind::InvalidUtf8,
2133+
span: Span::new(
2134+
Position::new(6, 1, 7),
2135+
Position::new(8, 1, 9),
2136+
),
2137+
},
2138+
);
20662139
}
20672140

20682141
#[test]

tests/replace.rs

+8-29
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,11 @@ macro_rules! replace(
1212
replace!(first, replace, r"[0-9]", "age: 26", t!("Z"), "age: Z6");
1313
replace!(plus, replace, r"[0-9]+", "age: 26", t!("Z"), "age: Z");
1414
replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ");
15-
replace!(
16-
groups,
17-
replace,
18-
r"(?-u)(\S+)\s+(\S+)",
19-
"w1 w2",
20-
t!("$2 $1"),
21-
"w2 w1"
22-
);
15+
replace!(groups, replace, r"(\S+)\s+(\S+)", "w1 w2", t!("$2 $1"), "w2 w1");
2316
replace!(
2417
double_dollar,
2518
replace,
26-
r"(?-u)(\S+)\s+(\S+)",
19+
r"(\S+)\s+(\S+)",
2720
"w1 w2",
2821
t!("$2 $$1"),
2922
"w2 $1"
@@ -33,7 +26,7 @@ replace!(
3326
replace!(
3427
named,
3528
replace_all,
36-
r"(?-u)(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
29+
r"(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
3730
"w1 w2 w3 w4",
3831
t!("$last $first$space"),
3932
"w2 w1 w4 w3"
@@ -48,42 +41,28 @@ replace!(
4841
);
4942
replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b");
5043
// replace!(number_underscore, replace, r"(.)(.)", "ab", t!("$1_$2"), "a_b");
51-
replace!(
52-
simple_expand,
53-
replace_all,
54-
r"(?-u)(\w) (\w)",
55-
"a b",
56-
t!("$2 $1"),
57-
"b a"
58-
);
59-
replace!(
60-
literal_dollar1,
61-
replace_all,
62-
r"(?-u)(\w+) (\w+)",
63-
"a b",
64-
t!("$$1"),
65-
"$1"
66-
);
44+
replace!(simple_expand, replace_all, r"(\w) (\w)", "a b", t!("$2 $1"), "b a");
45+
replace!(literal_dollar1, replace_all, r"(\w+) (\w+)", "a b", t!("$$1"), "$1");
6746
replace!(
6847
literal_dollar2,
6948
replace_all,
70-
r"(?-u)(\w+) (\w+)",
49+
r"(\w+) (\w+)",
7150
"a b",
7251
t!("$2 $$c $1"),
7352
"b $c a"
7453
);
7554
replace!(
7655
no_expand1,
7756
replace,
78-
r"(?-u)(\S+)\s+(\S+)",
57+
r"(\S+)\s+(\S+)",
7958
"w1 w2",
8059
no_expand!("$2 $1"),
8160
"$2 $1"
8261
);
8362
replace!(
8463
no_expand2,
8564
replace,
86-
r"(?-u)(\S+)\s+(\S+)",
65+
r"(\S+)\s+(\S+)",
8766
"w1 w2",
8867
no_expand!("$$1"),
8968
"$$1"

0 commit comments

Comments
 (0)