Skip to content

Commit c9ff68c

Browse files
committed
Faster escape routines
1 parent 0febc2b commit c9ff68c

File tree

2 files changed

+54
-20
lines changed

2 files changed

+54
-20
lines changed

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ document-features = { version = "0.2", optional = true }
1616
encoding_rs = { version = "0.8", optional = true }
1717
serde = { version = "1.0", optional = true }
1818
memchr = "2.5"
19+
jetscii = "0.5.2"
20+
once_cell = "1.12.0"
1921

2022
[dev-dependencies]
2123
criterion = "0.3"

src/escapei.rs

Lines changed: 52 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,20 @@
11
//! Manage xml character escapes
22
3-
use memchr;
43
use std::borrow::Cow;
54
use std::collections::HashMap;
65
use std::ops::Range;
76

7+
use jetscii::bytes;
8+
use memchr;
9+
use once_cell::sync::Lazy;
10+
811
#[cfg(test)]
912
use pretty_assertions::assert_eq;
1013

14+
static XML_ESCAPE_BYTES: Lazy<jetscii::BytesConst> =
15+
Lazy::new(|| bytes!(b'<', b'>', b'&', b'\'', b'"'));
16+
static XML_PARTIAL_ESCAPE_BYTES: Lazy<jetscii::BytesConst> = Lazy::new(|| bytes!(b'<', b'>', b'&'));
17+
1118
/// Error for XML escape/unescape.
1219
#[derive(Debug)]
1320
pub enum EscapeError {
@@ -66,31 +73,17 @@ impl std::error::Error for EscapeError {}
6673
/// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their
6774
/// corresponding xml escaped value.
6875
pub fn escape(raw: &[u8]) -> Cow<[u8]> {
69-
#[inline]
70-
fn to_escape(b: u8) -> bool {
71-
match b {
72-
b'<' | b'>' | b'\'' | b'&' | b'"' => true,
73-
_ => false,
74-
}
75-
}
76-
77-
_escape(raw, to_escape)
76+
// _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'\'' | b'&' | b'"'))
77+
simd_escape(raw, &XML_ESCAPE_BYTES)
7878
}
7979

8080
/// Should only be used for escaping text content. In xml text content, it is allowed
8181
/// (though not recommended) to leave the quote special characters " and ' unescaped.
8282
/// This function escapes a `&[u8]` and replaces xml special characters (<, >, &) with
8383
/// their corresponding xml escaped value, but does not escape quote characters.
8484
pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
85-
#[inline]
86-
fn to_escape(b: u8) -> bool {
87-
match b {
88-
b'<' | b'>' | b'&' => true,
89-
_ => false,
90-
}
91-
}
92-
93-
_escape(raw, to_escape)
85+
// _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
86+
simd_escape(raw, &XML_PARTIAL_ESCAPE_BYTES)
9487
}
9588

9689
/// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
@@ -112,7 +105,46 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
112105
b'\'' => escaped.extend_from_slice(b"&apos;"),
113106
b'&' => escaped.extend_from_slice(b"&amp;"),
114107
b'"' => escaped.extend_from_slice(b"&quot;"),
115-
_ => unreachable!("Only '<', '>','\', '&' and '\"' are escaped"),
108+
c @ _ => unreachable!(
109+
"Found {} but only '<', '>', ', '&' and '\"' are escaped",
110+
c as char
111+
),
112+
}
113+
pos = new_pos + 1;
114+
}
115+
116+
if let Some(mut escaped) = escaped {
117+
if let Some(raw) = raw.get(pos..) {
118+
escaped.extend_from_slice(raw);
119+
}
120+
Cow::Owned(escaped)
121+
} else {
122+
Cow::Borrowed(raw)
123+
}
124+
}
125+
126+
/// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
127+
/// corresponding xml escaped value.
128+
pub fn simd_escape<'a>(raw: &'a [u8], escape_matcher: &jetscii::BytesConst) -> Cow<'a, [u8]> {
129+
let mut escaped = None;
130+
let mut pos = 0;
131+
while let Some(i) = escape_matcher.find(&raw[pos..]) {
132+
if escaped.is_none() {
133+
escaped = Some(Vec::with_capacity(raw.len()));
134+
}
135+
let escaped = escaped.as_mut().expect("initialized");
136+
let new_pos = pos + i;
137+
escaped.extend_from_slice(&raw[pos..new_pos]);
138+
match raw[new_pos] {
139+
b'<' => escaped.extend_from_slice(b"&lt;"),
140+
b'>' => escaped.extend_from_slice(b"&gt;"),
141+
b'\'' => escaped.extend_from_slice(b"&apos;"),
142+
b'&' => escaped.extend_from_slice(b"&amp;"),
143+
b'"' => escaped.extend_from_slice(b"&quot;"),
144+
c @ _ => unreachable!(
145+
"Found {} but only '<', '>', ', '&' and '\"' are escaped",
146+
c as char
147+
),
116148
}
117149
pos = new_pos + 1;
118150
}

0 commit comments

Comments
 (0)