-
Notifications
You must be signed in to change notification settings - Fork 325
Audit the use of unsafe in uri/path.rs #413
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 4 commits
50f5d4f
375b95c
707d130
2a8fe92
a85a03c
d4718c9
27ff0cb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,74 +19,56 @@ const NONE: u16 = ::std::u16::MAX; | |
impl PathAndQuery { | ||
// Not public while `bytes` is unstable. | ||
pub(super) fn from_shared(mut src: Bytes) -> Result<Self, InvalidUri> { | ||
let mut query = NONE; | ||
let mut fragment = None; | ||
|
||
// block for iterator borrow | ||
{ | ||
let mut iter = src.as_ref().iter().enumerate(); | ||
|
||
// path ... | ||
for (i, &b) in &mut iter { | ||
// See https://url.spec.whatwg.org/#path-state | ||
match b { | ||
b'?' => { | ||
debug_assert_eq!(query, NONE); | ||
query = i as u16; | ||
break; | ||
} | ||
b'#' => { | ||
fragment = Some(i); | ||
break; | ||
} | ||
let (query, len) = src.as_ref().iter() | ||
// stop at the fragment specifier | ||
.take_while(|&&c| c != b'#') | ||
.try_fold((None, 0u16), |(query, i), &c| { | ||
match c { | ||
// match the query specifier if this is the first occurance | ||
b'?' if query.is_none() => Ok((Some(i), i+1)), | ||
|
||
// This is the range of bytes that don't need to be | ||
// percent-encoded in the path. If it should have been | ||
// percent-encoded, then error. | ||
// | ||
// See https://url.spec.whatwg.org/#path-state | ||
0x21 | | ||
0x24..=0x3B | | ||
0x3D | | ||
0x40..=0x5F | | ||
0x61..=0x7A | | ||
0x7C | | ||
0x7E => {}, | ||
|
||
_ => return Err(ErrorKind::InvalidUriChar.into()), | ||
0x7E => Ok((query, i+1)), | ||
|
||
// While queries *should* be percent-encoded, most | ||
// bytes are actually allowed... | ||
// See https://url.spec.whatwg.org/#query-state | ||
// | ||
// Allowed: 0x21 / 0x24 - 0x3B / 0x3D / 0x3F - 0x7E | ||
// | ||
// The list below is the bytes that are allowed in the | ||
// query but not in the path. The bytes allowed in both | ||
// are matched in the previous arm. | ||
0x3F | | ||
0x60 | | ||
0x7B | | ||
0x7D if query.is_some() => Ok((query, i+1)), | ||
|
||
// all bytes 0x80 and above match here (among others) | ||
// so all Ok() returns in the other match arms identify | ||
// a single byte UTF-8 code point | ||
_ => Err(InvalidUri(ErrorKind::InvalidUriChar)), | ||
} | ||
} | ||
})?; | ||
|
||
// query ... | ||
if query != NONE { | ||
for (i, &b) in iter { | ||
match b { | ||
// While queries *should* be percent-encoded, most | ||
// bytes are actually allowed... | ||
// See https://url.spec.whatwg.org/#query-state | ||
// | ||
// Allowed: 0x21 / 0x24 - 0x3B / 0x3D / 0x3F - 0x7E | ||
0x21 | | ||
0x24..=0x3B | | ||
0x3D | | ||
0x3F..=0x7E => {}, | ||
|
||
b'#' => { | ||
fragment = Some(i); | ||
break; | ||
} | ||
|
||
_ => return Err(ErrorKind::InvalidUriChar.into()), | ||
} | ||
} | ||
} | ||
} | ||
|
||
if let Some(i) = fragment { | ||
src.truncate(i); | ||
} | ||
src.truncate(len as usize); | ||
|
||
Ok(PathAndQuery { | ||
// Safety: The try_fold() checks that each byte in the now truncated | ||
// src is a single byte UTF-8 code point so src as a whole is valid | ||
// UTF-8. | ||
sbosnick marked this conversation as resolved.
Show resolved
Hide resolved
|
||
data: unsafe { ByteStr::from_utf8_unchecked(src) }, | ||
query: query, | ||
query: query.unwrap_or(NONE), | ||
}) | ||
} | ||
|
||
|
@@ -511,7 +493,40 @@ mod tests { | |
assert_eq!("qr=%3", pq("/a/b?qr=%3").query().unwrap()); | ||
} | ||
|
||
#[test] | ||
fn from_u8_slice_equates_with_string() { | ||
assert_eq!("/", pq_bytes(&[b'/'])); | ||
assert_eq!("/ab", pq_bytes(&[b'/', b'a', b'b'])); | ||
assert_eq!("a", pq_bytes(&[b'a'])); | ||
assert_eq!("a?", pq_bytes(&[b'a', b'?'])); | ||
assert_eq!("a?b", pq_bytes(&[b'a', b'?', b'b'])); | ||
assert_eq!("a?{b}", pq_bytes(&[b'a', b'?', b'{', b'b', b'}'])); | ||
assert_eq!("a", pq_bytes(&[b'a', b'#', b'b'])); | ||
assert_eq!("a?b", pq_bytes(&[b'a', b'?', b'b', b'#', b'c'])); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. May eventually be worth having property tests in addition, but definitely not a blocker. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would be happy to add property tests but my thoughts are that that should be a separate pull request. If this is a good approach I could add an issue for this as a marker and come back to it later. |
||
|
||
// This example has invalid UTF-8 in the fragment but the path | ||
// and query are valid UTF-8 | ||
assert_eq!("a?b", pq_bytes(&[b'a', b'?', b'b', b'#', 0xc0])); | ||
} | ||
|
||
#[test] | ||
fn from_invalid_u8_slice_is_error() { | ||
assert!(is_invalid_pq_bytes(&[0xc0])); // invalid UTF-8 | ||
assert!(is_invalid_pq_bytes(&[b' '])); // need percent encoding | ||
assert!(is_invalid_pq_bytes(&[b'{'])); // need percent encoding | ||
assert!(is_invalid_pq_bytes(&[b'a', b'?', 0xc0])); // invalid UTF-8 | ||
assert!(is_invalid_pq_bytes(&[b'a', b'?', b' '])); // needs percent encoding | ||
} | ||
|
||
fn pq(s: &str) -> PathAndQuery { | ||
s.parse().expect(&format!("parsing {}", s)) | ||
} | ||
|
||
fn pq_bytes(s: &[u8]) -> PathAndQuery { | ||
PathAndQuery::try_from(s).expect(&format!("converting {:?}", s)) | ||
} | ||
|
||
fn is_invalid_pq_bytes(s: &[u8]) -> bool { | ||
PathAndQuery::try_from(s).is_err() | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does passing these values along due to
fold
optimize differently? Do we even have benchmarks parsing URIs?Also, this isn't a super strong feeling, but I kind of feel that the previous 2 loops is a little clearer that we're looking for two things....
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On the second point, the two loop version emphasizes that we are looking for two things, the one loop version emphasizes that we are looking at every byte in
src
until the first'#'
. The latter is what we need to confirm the soundness of the call theByteStr::from_utf8_unchecked()
later in the function. (This is just a shift in emphasis; the two loop version checks the same things.) It is possible to describe the 2 loop version with comments describing the loop control flow (how the early exits and the normal end-of-loop combine to ensure that every byte until the first'#'
gets checked) but the one loop version accomplishes this in code, rather than comments. This is a trade-off on the code-quality issue.Unfortunately, though, there is a performance regression. (When I checked quickly in the past it didn't look like there was but I checked it more closely after this comment and there is.) There are a few benchmarks for parsing URI's in
benches/uri.rs
. They don't benchmark all of the URI parsing code (for example they don't cover parsing the scheme or authority) but two of the benchmarks cover the path and query parsing code. These benchmarks show a performance regression in this code over that in the master branch.I will try a version of the one loop approach with external iteration instead of
try_fold
to see if that addresses the performance issue.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The latest commit has resolved the performance regression (to within the margin of error).