Skip to content

Commit fda559a

Browse files
committed
Change API for providing custom entities
Instead of passing the unescaping functions an entity mapping in a data structure, provide a closure which maps each entity to its replacement text.
1 parent 65e6319 commit fda559a

File tree

6 files changed

+76
-128
lines changed

6 files changed

+76
-128
lines changed

Changelog.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@
109109
|`read_event_unbuffered` |`read_event`
110110
|`read_to_end_unbuffered` |`read_to_end`
111111
- [#412]: Change `read_to_end*` and `read_text_into` to accept `QName` instead of `AsRef<[u8]>`
112+
- [#415]: Changed custom entity unescaping API to accept closures rather than a mapping of entity to
113+
replacement text. This avoids needing to allocate a map and provides the user with more flexibility.
112114

113115
- [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method
114116
added to all events
@@ -136,6 +138,7 @@
136138
[#403]: https://github.com/tafia/quick-xml/pull/403
137139
[#407]: https://github.com/tafia/quick-xml/pull/407
138140
[#412]: https://github.com/tafia/quick-xml/pull/412
141+
[#415]: https://github.com/tafia/quick-xml/pull/415
139142
[#416]: https://github.com/tafia/quick-xml/pull/416
140143

141144
## 0.23.0 -- 2022-05-08

examples/custom_entities.rs

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,11 @@
77
//! * the regex in this example is simple but brittle;
88
//! * it does not support the use of entities in entity declaration.
99
10+
use std::collections::HashMap;
11+
1012
use quick_xml::events::Event;
1113
use quick_xml::Reader;
1214
use regex::bytes::Regex;
13-
use std::collections::HashMap;
1415

1516
const DATA: &str = r#"
1617
@@ -27,35 +28,39 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
2728
reader.trim_text(true);
2829

2930
let mut buf = Vec::new();
30-
let mut custom_entities = HashMap::new();
31+
let mut custom_entities: HashMap<Vec<u8>, String> = HashMap::new();
3132
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;
3233

3334
loop {
3435
match reader.read_event_into(&mut buf) {
3536
Ok(Event::DocType(ref e)) => {
3637
for cap in entity_re.captures_iter(&e) {
37-
custom_entities.insert(cap[1].to_vec(), cap[2].to_vec());
38+
custom_entities.insert(cap[1].to_vec(), String::from_utf8(cap[2].to_vec())?);
3839
}
3940
}
4041
Ok(Event::Start(ref e)) => match e.name().as_ref() {
41-
b"test" => println!(
42-
"attributes values: {:?}",
43-
e.attributes()
44-
.map(|a| a
45-
.unwrap()
46-
.unescape_and_decode_value_with_custom_entities(
47-
&reader,
48-
&custom_entities
49-
)
50-
.unwrap())
51-
.collect::<Vec<_>>()
52-
),
42+
b"test" => {
43+
let lookup_custom_entity = |ent| custom_entities.get(ent).map(|s| s.as_str());
44+
let attributes = e
45+
.attributes()
46+
.map(|a| {
47+
a.unwrap()
48+
.unescape_and_decode_value_with_custom_entities(
49+
&reader,
50+
lookup_custom_entity,
51+
)
52+
.unwrap()
53+
})
54+
.collect::<Vec<_>>();
55+
println!("attributes values: {:?}", attributes);
56+
}
5357
_ => (),
5458
},
5559
Ok(Event::Text(ref e)) => {
60+
let lookup_custom_entity = |ent| custom_entities.get(ent).map(|s| s.as_str());
5661
println!(
5762
"text value: {}",
58-
e.unescape_and_decode_with_custom_entities(&reader, &custom_entities)
63+
e.unescape_and_decode_with_custom_entities(&reader, lookup_custom_entity)
5964
.unwrap()
6065
);
6166
}

src/escapei.rs

Lines changed: 25 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
33
use memchr;
44
use std::borrow::Cow;
5-
use std::collections::HashMap;
65
use std::ops::Range;
76

87
#[cfg(test)]
@@ -66,31 +65,15 @@ impl std::error::Error for EscapeError {}
6665
/// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their
6766
/// corresponding xml escaped value.
6867
pub fn escape(raw: &[u8]) -> Cow<[u8]> {
69-
#[inline]
70-
fn to_escape(b: u8) -> bool {
71-
match b {
72-
b'<' | b'>' | b'\'' | b'&' | b'"' => true,
73-
_ => false,
74-
}
75-
}
76-
77-
_escape(raw, to_escape)
68+
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
7869
}
7970

8071
/// Should only be used for escaping text content. In xml text content, it is allowed
8172
/// (though not recommended) to leave the quote special characters " and ' unescaped.
8273
/// This function escapes a `&[u8]` and replaces xml special characters (<, >, &) with
8374
/// their corresponding xml escaped value, but does not escape quote characters.
8475
pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
85-
#[inline]
86-
fn to_escape(b: u8) -> bool {
87-
match b {
88-
b'<' | b'>' | b'&' => true,
89-
_ => false,
90-
}
91-
}
92-
93-
_escape(raw, to_escape)
76+
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
9477
}
9578

9679
/// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
@@ -130,32 +113,16 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
130113
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
131114
/// value
132115
pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
133-
do_unescape(raw, None)
134-
}
135-
136-
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
137-
/// value, using a dictionnary of custom entities.
138-
///
139-
/// # Pre-condition
140-
///
141-
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
142-
pub fn unescape_with<'a>(
143-
raw: &'a [u8],
144-
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
145-
) -> Result<Cow<'a, [u8]>, EscapeError> {
146-
do_unescape(raw, Some(custom_entities))
116+
unescape_with(raw, |_| None)
147117
}
148118

149119
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
150-
/// value, using an optional dictionary of custom entities.
120+
/// value, using a resolver function for custom entities.
151121
///
152122
/// # Pre-condition
153123
///
154-
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
155-
pub fn do_unescape<'a>(
156-
raw: &'a [u8],
157-
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
158-
) -> Result<Cow<'a, [u8]>, EscapeError> {
124+
/// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs.
125+
pub fn unescape_with<'a>(raw: &'a [u8], resolve_entity: impl Fn(&[u8]) -> Option<&str>) -> Result<Cow<'a, [u8]>, EscapeError> {
159126
let mut unescaped = None;
160127
let mut last_end = 0;
161128
let mut iter = memchr::memchr2_iter(b'&', b';', raw);
@@ -171,12 +138,14 @@ pub fn do_unescape<'a>(
171138

172139
// search for character correctness
173140
let pat = &raw[start + 1..end];
174-
if let Some(s) = named_entity(pat) {
175-
unescaped.extend_from_slice(s.as_bytes());
176-
} else if pat.starts_with(b"#") {
177-
push_utf8(unescaped, parse_number(&pat[1..], start..end)?);
178-
} else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) {
179-
unescaped.extend_from_slice(&value);
141+
if pat.starts_with(b"#") {
142+
let entity = &pat[1..]; // starts after the #
143+
let codepoint = parse_number(entity, start..end)?;
144+
push_utf8(unescaped, codepoint);
145+
} else if let Some(value) = named_entity(pat) {
146+
unescaped.extend_from_slice(value.as_bytes());
147+
} else if let Some(value) = resolve_entity(pat) {
148+
unescaped.extend_from_slice(value.as_bytes());
180149
} else {
181150
return Err(EscapeError::UnrecognizedSymbol(
182151
start + 1..end,
@@ -1740,18 +1709,20 @@ fn test_unescape() {
17401709

17411710
#[test]
17421711
fn test_unescape_with() {
1743-
let custom_entities = vec![(b"foo".to_vec(), b"BAR".to_vec())]
1744-
.into_iter()
1745-
.collect();
1746-
assert_eq!(&*unescape_with(b"test", &custom_entities).unwrap(), b"test");
1712+
let custom_entities = |ent: &[u8]| match ent {
1713+
b"foo" => Some("BAR"),
1714+
_ => None,
1715+
};
1716+
1717+
assert_eq!(&*unescape_with(b"test", custom_entities).unwrap(), b"test");
17471718
assert_eq!(
1748-
&*unescape_with(b"&lt;test&gt;", &custom_entities).unwrap(),
1719+
&*unescape_with(b"&lt;test&gt;", custom_entities).unwrap(),
17491720
b"<test>"
17501721
);
1751-
assert_eq!(&*unescape_with(b"&#x30;", &custom_entities).unwrap(), b"0");
1752-
assert_eq!(&*unescape_with(b"&#48;", &custom_entities).unwrap(), b"0");
1753-
assert_eq!(&*unescape_with(b"&foo;", &custom_entities).unwrap(), b"BAR");
1754-
assert!(unescape_with(b"&fop;", &custom_entities).is_err());
1722+
assert_eq!(&*unescape_with(b"&#x30;", custom_entities).unwrap(), b"0");
1723+
assert_eq!(&*unescape_with(b"&#48;", custom_entities).unwrap(), b"0");
1724+
assert_eq!(&*unescape_with(b"&foo;", custom_entities).unwrap(), b"BAR");
1725+
assert!(unescape_with(b"&fop;", custom_entities).is_err());
17551726
}
17561727

17571728
#[test]

src/events/attributes.rs

Lines changed: 11 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,13 @@
33
//! Provides an iterator over attributes key/value pairs
44
55
use crate::errors::{Error, Result as XmlResult};
6-
use crate::escape::{do_unescape, escape};
6+
use crate::escape::{escape, unescape_with};
77
use crate::name::QName;
88
use crate::reader::{is_whitespace, Reader};
99
use crate::utils::{write_byte_string, write_cow_string, Bytes};
1010
use std::fmt::{self, Debug, Display, Formatter};
1111
use std::iter::FusedIterator;
12-
use std::{borrow::Cow, collections::HashMap, ops::Range};
12+
use std::{borrow::Cow, ops::Range};
1313

1414
/// A struct representing a key/value XML attribute.
1515
///
@@ -39,7 +39,7 @@ impl<'a> Attribute<'a> {
3939
///
4040
/// See also [`unescaped_value_with_custom_entities()`](Self::unescaped_value_with_custom_entities)
4141
pub fn unescaped_value(&self) -> XmlResult<Cow<[u8]>> {
42-
self.make_unescaped_value(None)
42+
self.unescaped_value_with_custom_entities(|_| None)
4343
}
4444

4545
/// Returns the unescaped value, using custom entities.
@@ -55,18 +55,11 @@ impl<'a> Attribute<'a> {
5555
/// # Pre-condition
5656
///
5757
/// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs.
58-
pub fn unescaped_value_with_custom_entities(
59-
&self,
60-
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
61-
) -> XmlResult<Cow<[u8]>> {
62-
self.make_unescaped_value(Some(custom_entities))
63-
}
64-
65-
fn make_unescaped_value(
66-
&self,
67-
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
68-
) -> XmlResult<Cow<[u8]>> {
69-
do_unescape(&*self.value, custom_entities).map_err(Error::EscapeError)
58+
pub fn unescaped_value_with_custom_entities<'s>(
59+
&'s self,
60+
resolve_entity: impl Fn(&[u8]) -> Option<&str>,
61+
) -> XmlResult<Cow<'s, [u8]>> {
62+
unescape_with(&*self.value, resolve_entity).map_err(Error::EscapeError)
7063
}
7164

7265
/// Decode then unescapes the value
@@ -80,7 +73,7 @@ impl<'a> Attribute<'a> {
8073
/// [`unescaped_value()`]: Self::unescaped_value
8174
/// [`Reader::decoder().decode()`]: crate::reader::Decoder::decode
8275
pub fn unescape_and_decode_value<B>(&self, reader: &Reader<B>) -> XmlResult<String> {
83-
self.do_unescape_and_decode_value(reader, None)
76+
self.unescape_and_decode_value_with_custom_entities(reader, |_| None)
8477
}
8578

8679
/// Decode then unescapes the value with custom entities
@@ -100,20 +93,10 @@ impl<'a> Attribute<'a> {
10093
pub fn unescape_and_decode_value_with_custom_entities<B>(
10194
&self,
10295
reader: &Reader<B>,
103-
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
104-
) -> XmlResult<String> {
105-
self.do_unescape_and_decode_value(reader, Some(custom_entities))
106-
}
107-
108-
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
109-
fn do_unescape_and_decode_value<B>(
110-
&self,
111-
reader: &Reader<B>,
112-
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
96+
resolve_entity: impl Fn(&[u8]) -> Option<&str>,
11397
) -> XmlResult<String> {
11498
let decoded = reader.decoder().decode(&*self.value)?;
115-
116-
let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
99+
let unescaped = unescape_with(decoded.as_bytes(), resolve_entity)?;
117100
Ok(String::from_utf8(unescaped.into_owned())?)
118101
}
119102
}

src/events/mod.rs

Lines changed: 15 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,12 @@ pub mod attributes;
3737
#[cfg(feature = "encoding")]
3838
use encoding_rs::Encoding;
3939
use std::borrow::Cow;
40-
use std::collections::HashMap;
4140
use std::fmt::{self, Debug, Formatter};
4241
use std::ops::Deref;
4342
use std::str::from_utf8;
4443

4544
use crate::errors::{Error, Result};
46-
use crate::escape::{do_unescape, escape, partial_escape};
45+
use crate::escape::{escape, partial_escape, unescape_with};
4746
use crate::name::{LocalName, QName};
4847
use crate::reader::{Decoder, Reader};
4948
use crate::utils::write_cow_string;
@@ -740,10 +739,12 @@ impl<'a> BytesText<'a> {
740739
//TODO: need to think about better API instead of dozens similar functions
741740
// Maybe use builder pattern. After that expose function as public API
742741
//FIXME: need to take into account entities defined in the document
743-
Ok(BytesCData::new(match do_unescape(&self.content, None)? {
744-
Cow::Borrowed(_) => self.content,
745-
Cow::Owned(unescaped) => Cow::Owned(unescaped),
746-
}))
742+
Ok(BytesCData::new(
743+
match unescape_with(&self.content, |_| None)? {
744+
Cow::Borrowed(_) => self.content,
745+
Cow::Owned(unescaped) => Cow::Owned(unescaped),
746+
},
747+
))
747748
}
748749

749750
/// gets unescaped content
@@ -753,7 +754,7 @@ impl<'a> BytesText<'a> {
753754
///
754755
/// See also [`unescaped_with_custom_entities()`](Self::unescaped_with_custom_entities)
755756
pub fn unescaped(&self) -> Result<Cow<[u8]>> {
756-
self.make_unescaped(None)
757+
self.unescaped_with_custom_entities(|_| None)
757758
}
758759

759760
/// gets unescaped content, resolving custom entities
@@ -764,21 +765,14 @@ impl<'a> BytesText<'a> {
764765
///
765766
/// # Pre-condition
766767
///
767-
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
768+
/// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs.
768769
///
769770
/// See also [`unescaped()`](Self::unescaped)
770771
pub fn unescaped_with_custom_entities<'s>(
771772
&'s self,
772-
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
773-
) -> Result<Cow<'s, [u8]>> {
774-
self.make_unescaped(Some(custom_entities))
775-
}
776-
777-
fn make_unescaped<'s>(
778-
&'s self,
779-
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
773+
resolve_entity: impl Fn(&[u8]) -> Option<&str>,
780774
) -> Result<Cow<'s, [u8]>> {
781-
do_unescape(self, custom_entities).map_err(Error::EscapeError)
775+
unescape_with(self, resolve_entity).map_err(Error::EscapeError)
782776
}
783777

784778
/// helper method to unescape then decode self using the reader encoding
@@ -788,7 +782,7 @@ impl<'a> BytesText<'a> {
788782
/// 1. BytesText::unescaped()
789783
/// 2. Reader::decode(...)
790784
pub fn unescape_and_decode<B>(&self, reader: &Reader<B>) -> Result<String> {
791-
self.do_unescape_and_decode_with_custom_entities(reader, None)
785+
self.unescape_and_decode_with_custom_entities(reader, |_| None)
792786
}
793787

794788
/// helper method to decode self using the reader encoding, then unescape it with custom entities
@@ -800,23 +794,15 @@ impl<'a> BytesText<'a> {
800794
///
801795
/// # Pre-condition
802796
///
803-
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
797+
/// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs.
804798
pub fn unescape_and_decode_with_custom_entities<B>(
805799
&self,
806800
reader: &Reader<B>,
807-
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
808-
) -> Result<String> {
809-
self.do_unescape_and_decode_with_custom_entities(reader, Some(custom_entities))
810-
}
811-
812-
fn do_unescape_and_decode_with_custom_entities<B>(
813-
&self,
814-
reader: &Reader<B>,
815-
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
801+
resolve_entity: impl Fn(&[u8]) -> Option<&str>,
816802
) -> Result<String> {
817803
let decoded = reader.decoder().decode(&*self)?;
818804

819-
let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
805+
let unescaped = unescape_with(decoded.as_bytes(), resolve_entity)?;
820806
Ok(String::from_utf8(unescaped.into_owned())?)
821807
}
822808

0 commit comments

Comments
 (0)