Skip to content

Commit 72e11d1

Browse files
authored
Merge pull request #415 from dralley/escape-api
Closure-based unescaping with custom entities
2 parents 8c622f0 + a4d50a5 commit 72e11d1

14 files changed

+142
-259
lines changed

Changelog.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,15 @@
109109
|`read_event_unbuffered` |`read_event`
110110
|`read_to_end_unbuffered` |`read_to_end`
111111
- [#412]: Change `read_to_end*` and `read_text_into` to accept `QName` instead of `AsRef<[u8]>`
112-
112+
- [#415]: Changed custom entity unescaping API to accept closures rather than a mapping of entity to
113+
replacement text. This avoids needing to allocate a map and provides the user with more flexibility.
114+
- [#415]: Renamed many functions following the pattern `unescape_and_decode*` to `decode_and_unescape*`
115+
to better communicate their function. Renamed functions following the pattern `*_with_custom_entities`
116+
to `decode_and_unescape_with` to be more consistent across the API.
117+
- [#415]: `BytesText::escaped()` renamed to `BytesText::escape()`, `BytesText::unescaped()` renamed to
118+
`BytesText::unescape()`, `BytesText::unescaped_with()` renamed to `BytesText::unescape_with()`,
119+
`Attribute::escaped_value()` renamed to `Attribute::escape_value()`, and `Attribute::escaped_value_with()`
120+
renamed to `Attribute::escape_value_with()` for consistency across the API.
113121
- [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method
114122
added to all events
115123

@@ -137,6 +145,7 @@
137145
[#403]: https://github.com/tafia/quick-xml/pull/403
138146
[#407]: https://github.com/tafia/quick-xml/pull/407
139147
[#412]: https://github.com/tafia/quick-xml/pull/412
148+
[#415]: https://github.com/tafia/quick-xml/pull/415
140149
[#416]: https://github.com/tafia/quick-xml/pull/416
141150
[#418]: https://github.com/tafia/quick-xml/pull/418
142151

benches/macrobenches.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ fn parse_document(doc: &[u8]) -> XmlResult<()> {
2424
match r.read_event()? {
2525
Event::Start(e) | Event::Empty(e) => {
2626
for attr in e.attributes() {
27-
criterion::black_box(attr?.unescaped_value()?);
27+
criterion::black_box(attr?.unescape_value()?);
2828
}
2929
}
3030
Event::Text(e) => {
31-
criterion::black_box(e.unescaped()?);
31+
criterion::black_box(e.unescape()?);
3232
}
3333
Event::CData(e) => {
3434
criterion::black_box(e.into_inner());

benches/microbenches.rs

Lines changed: 1 addition & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -125,86 +125,6 @@ fn read_namespaced_event(c: &mut Criterion) {
125125
group.finish();
126126
}
127127

128-
/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
129-
/// benchmark)
130-
fn bytes_text_unescaped(c: &mut Criterion) {
131-
let mut group = c.benchmark_group("BytesText::unescaped");
132-
group.bench_function("trim_text = false", |b| {
133-
b.iter(|| {
134-
let mut buf = Vec::new();
135-
let mut r = Reader::from_reader(SAMPLE);
136-
r.check_end_names(false).check_comments(false);
137-
let mut count = criterion::black_box(0);
138-
let mut nbtxt = criterion::black_box(0);
139-
loop {
140-
match r.read_event_into(&mut buf) {
141-
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
142-
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
143-
Ok(Event::Eof) => break,
144-
_ => (),
145-
}
146-
buf.clear();
147-
}
148-
assert_eq!(
149-
count, 1550,
150-
"Overall tag count in ./tests/documents/sample_rss.xml"
151-
);
152-
153-
// Windows has \r\n instead of \n
154-
#[cfg(windows)]
155-
assert_eq!(
156-
nbtxt, 67661,
157-
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
158-
);
159-
160-
#[cfg(not(windows))]
161-
assert_eq!(
162-
nbtxt, 66277,
163-
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
164-
);
165-
});
166-
});
167-
168-
group.bench_function("trim_text = true", |b| {
169-
b.iter(|| {
170-
let mut buf = Vec::new();
171-
let mut r = Reader::from_reader(SAMPLE);
172-
r.check_end_names(false)
173-
.check_comments(false)
174-
.trim_text(true);
175-
let mut count = criterion::black_box(0);
176-
let mut nbtxt = criterion::black_box(0);
177-
loop {
178-
match r.read_event_into(&mut buf) {
179-
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
180-
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
181-
Ok(Event::Eof) => break,
182-
_ => (),
183-
}
184-
buf.clear();
185-
}
186-
assert_eq!(
187-
count, 1550,
188-
"Overall tag count in ./tests/documents/sample_rss.xml"
189-
);
190-
191-
// Windows has \r\n instead of \n
192-
#[cfg(windows)]
193-
assert_eq!(
194-
nbtxt, 50334,
195-
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
196-
);
197-
198-
#[cfg(not(windows))]
199-
assert_eq!(
200-
nbtxt, 50261,
201-
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
202-
);
203-
});
204-
});
205-
group.finish();
206-
}
207-
208128
/// Benchmarks, how fast individual event parsed
209129
fn one_event(c: &mut Criterion) {
210130
let mut group = c.benchmark_group("One event");
@@ -256,7 +176,7 @@ fn one_event(c: &mut Criterion) {
256176
.check_comments(false)
257177
.trim_text(true);
258178
match r.read_event_into(&mut buf) {
259-
Ok(Event::Comment(ref e)) => nbtxt += e.unescaped().unwrap().len(),
179+
Ok(Event::Comment(ref e)) => nbtxt += e.unescape().unwrap().len(),
260180
something_else => panic!("Did not expect {:?}", something_else),
261181
};
262182

@@ -473,7 +393,6 @@ purus. Consequat id porta nibh venenatis cras sed felis.";
473393
criterion_group!(
474394
benches,
475395
read_event,
476-
bytes_text_unescaped,
477396
read_namespaced_event,
478397
one_event,
479398
attributes,

examples/custom_entities.rs

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,11 @@
77
//! * the regex in this example is simple but brittle;
88
//! * it does not support the use of entities in entity declaration.
99
10+
use std::collections::HashMap;
11+
1012
use quick_xml::events::Event;
1113
use quick_xml::Reader;
1214
use regex::bytes::Regex;
13-
use std::collections::HashMap;
1415

1516
const DATA: &str = r#"
1617
@@ -27,35 +28,41 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
2728
reader.trim_text(true);
2829

2930
let mut buf = Vec::new();
30-
let mut custom_entities = HashMap::new();
31+
let mut custom_entities: HashMap<Vec<u8>, String> = HashMap::new();
3132
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;
3233

3334
loop {
3435
match reader.read_event_into(&mut buf) {
3536
Ok(Event::DocType(ref e)) => {
3637
for cap in entity_re.captures_iter(&e) {
37-
custom_entities.insert(cap[1].to_vec(), cap[2].to_vec());
38+
custom_entities.insert(
39+
cap[1].to_vec(),
40+
reader.decoder().decode(&cap[2])?.into_owned(),
41+
);
3842
}
3943
}
4044
Ok(Event::Start(ref e)) => match e.name().as_ref() {
41-
b"test" => println!(
42-
"attributes values: {:?}",
43-
e.attributes()
44-
.map(|a| a
45-
.unwrap()
46-
.unescape_and_decode_value_with_custom_entities(
47-
&reader,
48-
&custom_entities
49-
)
50-
.unwrap())
51-
.collect::<Vec<_>>()
52-
),
45+
b"test" => {
46+
let attributes = e
47+
.attributes()
48+
.map(|a| {
49+
a.unwrap()
50+
.decode_and_unescape_value_with(&reader, |ent| {
51+
custom_entities.get(ent).map(|s| s.as_str())
52+
})
53+
.unwrap()
54+
})
55+
.collect::<Vec<_>>();
56+
println!("attributes values: {:?}", attributes);
57+
}
5358
_ => (),
5459
},
5560
Ok(Event::Text(ref e)) => {
5661
println!(
5762
"text value: {}",
58-
e.unescape_and_decode_with_custom_entities(&reader, &custom_entities)
63+
e.decode_and_unescape_with(&reader, |ent| custom_entities
64+
.get(ent)
65+
.map(|s| s.as_str()))
5966
.unwrap()
6067
);
6168
}

src/de/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -618,14 +618,14 @@ where
618618
allow_start: bool,
619619
) -> Result<BytesCData<'de>, DeError> {
620620
match self.next()? {
621-
DeEvent::Text(e) if unescape => e.unescape().map_err(Into::into),
621+
DeEvent::Text(e) if unescape => e.unescape_as_cdata().map_err(Into::into),
622622
DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())),
623623
DeEvent::CData(e) => Ok(e),
624624
DeEvent::Start(e) if allow_start => {
625625
// allow one nested level
626626
let inner = self.next()?;
627627
let t = match inner {
628-
DeEvent::Text(t) if unescape => t.unescape()?,
628+
DeEvent::Text(t) if unescape => t.unescape_as_cdata()?,
629629
DeEvent::Text(t) => BytesCData::new(t.into_inner()),
630630
DeEvent::CData(t) => t,
631631
DeEvent::Start(s) => {

src/escapei.rs

Lines changed: 32 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
33
use memchr;
44
use std::borrow::Cow;
5-
use std::collections::HashMap;
65
use std::ops::Range;
76

87
#[cfg(test)]
@@ -66,31 +65,15 @@ impl std::error::Error for EscapeError {}
6665
/// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their
6766
/// corresponding xml escaped value.
6867
pub fn escape(raw: &[u8]) -> Cow<[u8]> {
69-
#[inline]
70-
fn to_escape(b: u8) -> bool {
71-
match b {
72-
b'<' | b'>' | b'\'' | b'&' | b'"' => true,
73-
_ => false,
74-
}
75-
}
76-
77-
_escape(raw, to_escape)
68+
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
7869
}
7970

8071
/// Should only be used for escaping text content. In xml text content, it is allowed
8172
/// (though not recommended) to leave the quote special characters " and ' unescaped.
8273
/// This function escapes a `&[u8]` and replaces xml special characters (<, >, &) with
8374
/// their corresponding xml escaped value, but does not escape quote characters.
8475
pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
85-
#[inline]
86-
fn to_escape(b: u8) -> bool {
87-
match b {
88-
b'<' | b'>' | b'&' => true,
89-
_ => false,
90-
}
91-
}
92-
93-
_escape(raw, to_escape)
76+
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
9477
}
9578

9679
/// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
@@ -130,32 +113,23 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
130113
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
131114
/// value
132115
pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
133-
do_unescape(raw, None)
134-
}
135-
136-
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
137-
/// value, using a dictionnary of custom entities.
138-
///
139-
/// # Pre-condition
140-
///
141-
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
142-
pub fn unescape_with<'a>(
143-
raw: &'a [u8],
144-
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
145-
) -> Result<Cow<'a, [u8]>, EscapeError> {
146-
do_unescape(raw, Some(custom_entities))
116+
unescape_with(raw, |_| None)
147117
}
148118

149119
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
150-
/// value, using an optional dictionary of custom entities.
120+
/// value, using a resolver function for custom entities.
151121
///
152122
/// # Pre-condition
153123
///
154-
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
155-
pub fn do_unescape<'a>(
156-
raw: &'a [u8],
157-
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
158-
) -> Result<Cow<'a, [u8]>, EscapeError> {
124+
/// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs.
125+
pub fn unescape_with<'input, 'entity, F>(
126+
raw: &'input [u8],
127+
resolve_entity: F,
128+
) -> Result<Cow<'input, [u8]>, EscapeError>
129+
where
130+
// the lifetime of the output comes from a capture or is `'static`
131+
F: Fn(&[u8]) -> Option<&'entity str>,
132+
{
159133
let mut unescaped = None;
160134
let mut last_end = 0;
161135
let mut iter = memchr::memchr2_iter(b'&', b';', raw);
@@ -171,12 +145,14 @@ pub fn do_unescape<'a>(
171145

172146
// search for character correctness
173147
let pat = &raw[start + 1..end];
174-
if let Some(s) = named_entity(pat) {
175-
unescaped.extend_from_slice(s.as_bytes());
176-
} else if pat.starts_with(b"#") {
177-
push_utf8(unescaped, parse_number(&pat[1..], start..end)?);
178-
} else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) {
179-
unescaped.extend_from_slice(&value);
148+
if pat.starts_with(b"#") {
149+
let entity = &pat[1..]; // starts after the #
150+
let codepoint = parse_number(entity, start..end)?;
151+
push_utf8(unescaped, codepoint);
152+
} else if let Some(value) = named_entity(pat) {
153+
unescaped.extend_from_slice(value.as_bytes());
154+
} else if let Some(value) = resolve_entity(pat) {
155+
unescaped.extend_from_slice(value.as_bytes());
180156
} else {
181157
return Err(EscapeError::UnrecognizedSymbol(
182158
start + 1..end,
@@ -1740,18 +1716,20 @@ fn test_unescape() {
17401716

17411717
#[test]
17421718
fn test_unescape_with() {
1743-
let custom_entities = vec![(b"foo".to_vec(), b"BAR".to_vec())]
1744-
.into_iter()
1745-
.collect();
1746-
assert_eq!(&*unescape_with(b"test", &custom_entities).unwrap(), b"test");
1719+
let custom_entities = |ent: &[u8]| match ent {
1720+
b"foo" => Some("BAR"),
1721+
_ => None,
1722+
};
1723+
1724+
assert_eq!(&*unescape_with(b"test", custom_entities).unwrap(), b"test");
17471725
assert_eq!(
1748-
&*unescape_with(b"&lt;test&gt;", &custom_entities).unwrap(),
1726+
&*unescape_with(b"&lt;test&gt;", custom_entities).unwrap(),
17491727
b"<test>"
17501728
);
1751-
assert_eq!(&*unescape_with(b"&#x30;", &custom_entities).unwrap(), b"0");
1752-
assert_eq!(&*unescape_with(b"&#48;", &custom_entities).unwrap(), b"0");
1753-
assert_eq!(&*unescape_with(b"&foo;", &custom_entities).unwrap(), b"BAR");
1754-
assert!(unescape_with(b"&fop;", &custom_entities).is_err());
1729+
assert_eq!(&*unescape_with(b"&#x30;", custom_entities).unwrap(), b"0");
1730+
assert_eq!(&*unescape_with(b"&#48;", custom_entities).unwrap(), b"0");
1731+
assert_eq!(&*unescape_with(b"&foo;", custom_entities).unwrap(), b"BAR");
1732+
assert!(unescape_with(b"&fop;", custom_entities).is_err());
17551733
}
17561734

17571735
#[test]

0 commit comments

Comments
 (0)