From a1b30893707c913624da760fe28419bf9c7d5d20 Mon Sep 17 00:00:00 2001 From: David Sherret Date: Fri, 7 Feb 2025 16:03:07 -0500 Subject: [PATCH 1/4] perf: url encode path segments in longer string slices --- url/benches/parse_url.rs | 2 +- url/src/parser.rs | 90 +++++++++++++++++++++++++++++++++------- 2 files changed, 76 insertions(+), 16 deletions(-) diff --git a/url/benches/parse_url.rs b/url/benches/parse_url.rs index 61e88162..4a079fb2 100644 --- a/url/benches/parse_url.rs +++ b/url/benches/parse_url.rs @@ -13,7 +13,7 @@ fn short(bench: &mut Bencher) { } fn long(bench: &mut Bencher) { - let url = "https://example.com/parkbench?tre=es&st=uff"; + let url = "https://example.com/parkbench?tre=es&st=ufflongerlonger"; bench.bytes = url.len() as u64; bench.iter(|| black_box(url).parse::<Url>().unwrap()); diff --git a/url/src/parser.rs b/url/src/parser.rs index 1ab0dc1d..3f927f82 100644 --- a/url/src/parser.rs +++ b/url/src/parser.rs @@ -1191,19 +1191,67 @@ impl<'a> Parser<'a> { path_start: usize, mut input: Input<'i>, ) -> Input<'i> { + // it's much faster to call utf8_percent_encode in bulk + fn push_pending( + serialization: &mut String, + start_str: &str, + remaining_len: usize, + context: Context, + scheme_type: SchemeType, + ) { + let text = &start_str[..start_str.len() - remaining_len]; + if text.is_empty() { + return; + } + if context == Context::PathSegmentSetter { + if scheme_type.is_special() { + serialization.extend(utf8_percent_encode(text, SPECIAL_PATH_SEGMENT)); + } else { + serialization.extend(utf8_percent_encode(text, PATH_SEGMENT)); + } + } else { + serialization.extend(utf8_percent_encode(text, PATH)); + } + } + // Relative path state loop { let mut segment_start = self.serialization.len(); let mut ends_with_slash = false; + let mut start_str = input.chars.as_str(); loop { let input_before_c = input.clone(); - let (c, utf8_c) = if let Some(x) = input.next_utf8() { - x - } else { + // bypass input.next() and manually handle ascii_tab_or_new_line + // in
order to encode string slices in bulk + let Some(c) = input.chars.next() else { + push_pending( + &mut self.serialization, + start_str, + 0, + self.context, + scheme_type, + ); break; }; match c { + ascii_tab_or_new_line_pattern!() => { + push_pending( + &mut self.serialization, + start_str, + input_before_c.chars.as_str().len(), + self.context, + scheme_type, + ); + start_str = input.chars.as_str(); + } '/' if self.context != Context::PathSegmentSetter => { + push_pending( + &mut self.serialization, + start_str, + input_before_c.chars.as_str().len(), + self.context, + scheme_type, + ); self.serialization.push(c); ends_with_slash = true; break; @@ -1211,12 +1259,26 @@ impl<'a> Parser<'a> { '\\' if self.context != Context::PathSegmentSetter && scheme_type.is_special() => { + push_pending( + &mut self.serialization, + start_str, + input_before_c.chars.as_str().len(), + self.context, + scheme_type, + ); self.log_violation(SyntaxViolation::Backslash); self.serialization.push('/'); ends_with_slash = true; break; } '?' 
| '#' if self.context == Context::UrlParser => { + push_pending( + &mut self.serialization, + start_str, + input_before_c.chars.as_str().len(), + self.context, + scheme_type, + ); input = input_before_c; break; } @@ -1228,23 +1290,21 @@ impl<'a> Parser<'a> { &self.serialization[path_start + 1..], ) { + push_pending( + &mut self.serialization, + start_str, + input_before_c.chars.as_str().len(), + self.context, + scheme_type, + ); + start_str = input_before_c.chars.as_str(); self.serialization.push('/'); segment_start += 1; } - if self.context == Context::PathSegmentSetter { - if scheme_type.is_special() { - self.serialization - .extend(utf8_percent_encode(utf8_c, SPECIAL_PATH_SEGMENT)); - } else { - self.serialization - .extend(utf8_percent_encode(utf8_c, PATH_SEGMENT)); - } - } else { - self.serialization.extend(utf8_percent_encode(utf8_c, PATH)); - } } } } + let segment_before_slash = if ends_with_slash { &self.serialization[segment_start..self.serialization.len() - 1] } else { @@ -1305,7 +1365,7 @@ impl<'a> Parser<'a> { } } if scheme_type.is_file() { - // while url’s path’s size is greater than 1 + // while url's path’s size is greater than 1 // and url’s path[0] is the empty string, // validation error, remove the first item from url’s path. 
//FIXME: log violation From e3762d670f705e416afc7e9c9c68db410c502dff Mon Sep 17 00:00:00 2001 From: David Sherret Date: Fri, 7 Feb 2025 16:07:27 -0500 Subject: [PATCH 2/4] stable rust --- url/src/lib.rs | 1 - url/src/parser.rs | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/url/src/lib.rs b/url/src/lib.rs index 96fa1eee..9727b77e 100644 --- a/url/src/lib.rs +++ b/url/src/lib.rs @@ -3041,7 +3041,6 @@ fn file_url_segments_to_pathbuf( use std::os::hermit::ffi::OsStrExt; #[cfg(any(unix, target_os = "redox"))] use std::os::unix::prelude::OsStrExt; - use std::path::PathBuf; if host.is_some() { return Err(()); diff --git a/url/src/parser.rs b/url/src/parser.rs index 3f927f82..e63d30a2 100644 --- a/url/src/parser.rs +++ b/url/src/parser.rs @@ -1223,7 +1223,9 @@ impl<'a> Parser<'a> { let input_before_c = input.clone(); // bypass input.next() and manually handle ascii_tab_or_new_line // in order to encode string slices in bulk - let Some(c) = input.chars.next() else { + let c = if let Some(c) = input.chars.next() { + c + } else { push_pending( &mut self.serialization, start_str, From 7e46461587b9b6aaafa982c94799052b2d1d9cd4 Mon Sep 17 00:00:00 2001 From: David Sherret Date: Sat, 8 Feb 2025 08:40:37 -0500 Subject: [PATCH 3/4] revert quote change --- url/src/parser.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/url/src/parser.rs b/url/src/parser.rs index e63d30a2..7af5d3f7 100644 --- a/url/src/parser.rs +++ b/url/src/parser.rs @@ -1367,7 +1367,7 @@ impl<'a> Parser<'a> { } } if scheme_type.is_file() { - // while url's path’s size is greater than 1 + // while url’s path’s size is greater than 1 // and url’s path[0] is the empty string, // validation error, remove the first item from url’s path. //FIXME: log violation From 5ffe79da4af8adcd6f3315645f1c5f49393a01b7 Mon Sep 17 00:00:00 2001 From: David Sherret Date: Sat, 8 Feb 2025 08:43:04 -0500 Subject: [PATCH 4/4] Revert this too. 
Signed-off-by: David Sherret --- url/benches/parse_url.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/url/benches/parse_url.rs b/url/benches/parse_url.rs index 4a079fb2..61e88162 100644 --- a/url/benches/parse_url.rs +++ b/url/benches/parse_url.rs @@ -13,7 +13,7 @@ fn short(bench: &mut Bencher) { } fn long(bench: &mut Bencher) { - let url = "https://example.com/parkbench?tre=es&st=ufflongerlonger"; + let url = "https://example.com/parkbench?tre=es&st=uff"; bench.bytes = url.len() as u64; bench.iter(|| black_box(url).parse::<Url>().unwrap());