From cc6397582b8dd9445b4de7a79d85110d5372b334 Mon Sep 17 00:00:00 2001 From: Jayonas <37936740+Jayonas@users.noreply.github.com> Date: Tue, 21 May 2024 19:01:38 -0400 Subject: [PATCH 1/2] Add `positions-extra-attr` feature for `Attribute` subranges. To save memory this only uses two `u8`s per `Attribute`, but the tradeoff is that the qname and value ranges will be incorrect in a few extreme data cases, specifically if the attribute's qname has len > 255 or there are >254 spaces surrounding its equal sign. Those cases seem unlikely enough in practice to justify not using more memory for this. --- Cargo.toml | 3 ++ src/lib.rs | 41 ++++++++++++++++++++ src/parse.rs | 16 +++++++- src/tokenizer.rs | 7 +++- src/tokenizer_tests.rs | 2 +- tests/api.rs | 87 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 151 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6ca4850..7208fcf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,3 +22,6 @@ std = [] # Enables Nodes and Attributes position in the original document preserving. # Increases memory usage by `Range` for each Node and Attribute. positions = [] +# Enables ranges for each Attribute's qname and value individually. +# Increases memory usage by two `u8`s for each Attribute. +positions-extra-attr = ["positions"] diff --git a/src/lib.rs b/src/lib.rs index a851aa3..37803e8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -490,6 +490,10 @@ struct AttributeData<'input> { value: StringStorage<'input>, #[cfg(feature = "positions")] range: Range, + #[cfg(feature = "positions-extra-attr")] + qname_len: u8, + #[cfg(feature = "positions-extra-attr")] + eq_len: u8, // includes any surrounding spaces } /// An attribute. @@ -587,6 +591,43 @@ impl<'a, 'input> Attribute<'a, 'input> { pub fn range(&self) -> Range { self.data.range.clone() } + + /// Returns attribute's qname's range in bytes in the original document. + /// + /// ```text + /// + /// ^^^^^^ + /// ``` + /// + /// This method will return incorrect data if the attribute's qname is longer than 255 bytes, + /// although it will not panic or exhibit undefined behavior. + #[cfg(feature = "positions-extra-attr")] + #[inline] + pub fn range_qname(&self) -> Range { + let end = self.data.range.start + self.data.qname_len as usize; + self.data.range.start..end + } + + /// Returns attribute's value's range in bytes in the original document, excluding the surrounding quotes. + /// + /// If the attribute's value is an empty string then the `start` and `end` of this `Range` are equal, and indicate the closing quote. + /// + /// ```text + /// + /// ^^^^^ + /// ``` + /// + /// This method will return incorrect data if the attribute's qname is longer than 255 bytes + /// or if there are more than 254 spaces surrounding the attribute's equal sign, + /// although it will not panic or exhibit undefined behavior. + #[cfg(feature = "positions-extra-attr")] + #[inline] + pub fn range_value(&self) -> Range { + // +1 on start and -1 on end are to exclude the quotes around the value (all valid quotes are 1 byte) + let start = self.data.range.start + self.data.qname_len as usize + self.data.eq_len as usize + 1; + let end = self.data.range.end - 1; + start..end + } } impl PartialEq for Attribute<'_, '_> { diff --git a/src/parse.rs b/src/parse.rs index 5470ef4..497b79a 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -353,6 +353,10 @@ struct TempAttributeData<'input> { local: &'input str, value: StringStorage<'input>, range: Range, + #[allow(unused)] // only used for feature positions-extra-attr + qname_len: u8, + #[allow(unused)] // only used for feature positions-extra-attr + eq_len: u8, } impl<'input> Document<'input> { @@ -644,8 +648,8 @@ impl<'input> tokenizer::XmlEvents<'input> for Context<'input> { self.after_text = false; } - tokenizer::Token::Attribute(range, prefix, local, value) => { - process_attribute(range, prefix, local, value, self)?; + tokenizer::Token::Attribute(range, qname_len, eq_len, prefix, local, value) => { + process_attribute(range, qname_len, eq_len, prefix, local, value, self)?; } tokenizer::Token::ElementEnd(end, range) => { process_element(end, range, self)?; @@ -666,6 +670,8 @@ impl<'input> tokenizer::XmlEvents<'input> for Context<'input> { #[allow(clippy::too_many_arguments)] fn process_attribute<'input>( range: Range, + qname_len: u8, + eq_len: u8, prefix: &'input str, local: &'input str, value: StrSpan<'input>, @@ -732,6 +738,8 @@ fn process_attribute<'input>( local, value, range, + qname_len, + eq_len, }); } @@ -909,6 +917,10 @@ fn resolve_attributes(namespaces: ShortRange, ctx: &mut Context) -> Result { ElementStart(&'input str, &'input str, usize), // ns:attr="value" - Attribute(Range, &'input str, &'input str, StrSpan<'input>), + Attribute(Range, u8, u8, &'input str, &'input str, StrSpan<'input>), ElementEnd(ElementEnd<'input>, Range), @@ -553,7 +553,10 @@ fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'inp // We cannot mark `parse_attribute` as `#[inline(always)]` // because it will blow up the binary size. let (prefix, local) = s.consume_qname()?; + let qname_end = s.pos(); + let qname_len = (qname_end - start).try_into().unwrap_or(u8::MAX); s.consume_eq()?; + let eq_len = (s.pos() - qname_end).try_into().unwrap_or(u8::MAX); let quote = s.consume_quote()?; let quote_c = quote as char; // The attribute value must not contain the < character. @@ -562,7 +565,7 @@ fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'inp let value = s.slice_back_span(value_start); s.consume_byte(quote)?; let end = s.pos(); - events.token(Token::Attribute(start..end, prefix, local, value))?; + events.token(Token::Attribute(start..end, qname_len, eq_len, prefix, local, value))?; } } } diff --git a/src/tokenizer_tests.rs b/src/tokenizer_tests.rs index 8688506..5fde7de 100644 --- a/src/tokenizer_tests.rs +++ b/src/tokenizer_tests.rs @@ -90,7 +90,7 @@ impl<'a> xml::XmlEvents<'a> for EventsCollector<'a> { xml::Token::ElementStart(prefix, local, start) => { Token::ElementStart(prefix, local, start) } - xml::Token::Attribute(_, prefix, local, value) => { + xml::Token::Attribute(_, _, _, prefix, local, value) => { Token::Attribute(prefix, local, value.as_str()) } xml::Token::ElementEnd(end, range) => Token::ElementEnd( diff --git a/tests/api.rs b/tests/api.rs index 211f7d3..4104181 100644 --- a/tests/api.rs +++ b/tests/api.rs @@ -158,6 +158,13 @@ fn text_pos_01() { if let Some(attr) = node.attribute_node("a") { assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 4)); assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 9)); + #[cfg(feature = "positions-extra-attr")] + { + assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 4)); + assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 5)); + assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 7)); + assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 8)); + } } // first child is a text/whitespace, not a comment @@ -184,6 +191,13 @@ fn text_pos_02() { if let Some(attr) = node.attribute_node(("http://www.w3.org", "a")) { assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 36)); assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 44)); + #[cfg(feature = "positions-extra-attr")] + { + assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 36)); + assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 40)); + assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 42)); + assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 43)); + } } } @@ -202,6 +216,79 @@ fn text_pos_03() { assert_eq!(doc.text_pos_at(node.range().end), TextPos::new(2, 5)); } +#[cfg(feature = "positions-extra-attr")] +#[test] +fn text_pos_04() { + let data = ""; + + let doc = Document::parse(data).unwrap(); + let node = doc.root_element(); + + if let Some(attr) = node.attribute_node("a") { + assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 36)); + assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 43)); + assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 36)); + assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 40)); + assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 42)); + assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 42)); +} +} + +#[cfg(feature = "positions-extra-attr")] +#[test] +fn text_pos_05() { + let data = ""; + + let doc = Document::parse(data).unwrap(); + let node = doc.root_element(); + + if let Some(attr) = node.attribute_node("a") { + assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 36)); + assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 48)); + assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 36)); + assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 40)); + assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 47)); + assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 48)); + } +} + +#[cfg(feature = "positions-extra-attr")] +#[test] +fn text_pos_06() { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 + let data = ""; + + let doc = Document::parse(data).unwrap(); + let node = doc.root_element(); + + if let Some(attr) = node.attribute_node("a") { + assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 4)); + assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 268)); + + // these are unreliable since qname.len > 255, but they still shouldn't panic + attr.range_qname(); + attr.range_value(); + } +} + +#[cfg(feature = "positions-extra-attr")] +#[test] +fn text_pos_07() { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 + let data = ""; + + let doc = Document::parse(data).unwrap(); + let node = doc.root_element(); + + if let Some(attr) = node.attribute_node("a") { + assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 4)); + assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 269)); + assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 4)); + assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 5)); + attr.range_value(); // unreliable since >254 spaces around equal sign, but still shouldn't panic + } +} + #[test] fn next_sibling_element_01() { let data = ""; From 250d2b90f962b9060ad4776035729288a6dcde29 Mon Sep 17 00:00:00 2001 From: Jayonas <37936740+Jayonas@users.noreply.github.com> Date: Wed, 22 May 2024 18:48:11 -0400 Subject: [PATCH 2/2] Fold `positions-extra-attr` into `positions`, increase `qname_len` to `u16`, PR comments. --- Cargo.toml | 6 ++---- src/lib.rs | 24 ++++++++++++------------ src/parse.rs | 12 ++++++------ src/tokenizer.rs | 6 +++--- tests/api.rs | 47 +++++++++++------------------------------------ 5 files changed, 34 insertions(+), 61 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7208fcf..c66c681 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,8 +20,6 @@ exclude = ["testing-tools"] default = ["std", "positions"] std = [] # Enables Nodes and Attributes position in the original document preserving. -# Increases memory usage by `Range` for each Node and Attribute. +# Increases memory usage by `Range` for each Node. +# Increases memory usage by `Range` + `u16` + `u8` for each Attribute. positions = [] -# Enables ranges for each Attribute's qname and value individually. -# Increases memory usage by two `u8`s for each Attribute. -positions-extra-attr = ["positions"] diff --git a/src/lib.rs b/src/lib.rs index 37803e8..71a00a6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -490,9 +490,9 @@ struct AttributeData<'input> { value: StringStorage<'input>, #[cfg(feature = "positions")] range: Range, - #[cfg(feature = "positions-extra-attr")] - qname_len: u8, - #[cfg(feature = "positions-extra-attr")] + #[cfg(feature = "positions")] + qname_len: u16, + #[cfg(feature = "positions")] eq_len: u8, // includes any surrounding spaces } @@ -599,12 +599,12 @@ impl<'a, 'input> Attribute<'a, 'input> { /// ^^^^^^ /// ``` /// - /// This method will return incorrect data if the attribute's qname is longer than 255 bytes, - /// although it will not panic or exhibit undefined behavior. - #[cfg(feature = "positions-extra-attr")] + /// To reduce memory usage the qname length is limited by u16::MAX. + /// If the attribute exceeds that limit then the end of the returned range will be incorrect. + #[cfg(feature = "positions")] #[inline] pub fn range_qname(&self) -> Range { - let end = self.data.range.start + self.data.qname_len as usize; + let end = self.data.range.start + usize::from(self.data.qname_len); self.data.range.start..end } @@ -617,14 +617,14 @@ impl<'a, 'input> Attribute<'a, 'input> { /// ^^^^^ /// ``` /// - /// This method will return incorrect data if the attribute's qname is longer than 255 bytes - /// or if there are more than 254 spaces surrounding the attribute's equal sign, - /// although it will not panic or exhibit undefined behavior. - #[cfg(feature = "positions-extra-attr")] + /// To reduce memory usage the qname length is limited by u16::MAX, + /// and the number of spaces around the equal sign is limited by u8::MAX. + /// If the attribute exceeds those limits then the start of the returned range will be incorrect. + #[cfg(feature = "positions")] #[inline] pub fn range_value(&self) -> Range { // +1 on start and -1 on end are to exclude the quotes around the value (all valid quotes are 1 byte) - let start = self.data.range.start + self.data.qname_len as usize + self.data.eq_len as usize + 1; + let start = self.data.range.start + usize::from(self.data.qname_len) + usize::from(self.data.eq_len) + 1; let end = self.data.range.end - 1; start..end } diff --git a/src/parse.rs b/src/parse.rs index 497b79a..cbf718f 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -353,9 +353,9 @@ struct TempAttributeData<'input> { local: &'input str, value: StringStorage<'input>, range: Range, - #[allow(unused)] // only used for feature positions-extra-attr - qname_len: u8, - #[allow(unused)] // only used for feature positions-extra-attr + #[allow(unused)] // only used for feature "positions" + qname_len: u16, + #[allow(unused)] // only used for feature "positions" eq_len: u8, } @@ -670,7 +670,7 @@ impl<'input> tokenizer::XmlEvents<'input> for Context<'input> { #[allow(clippy::too_many_arguments)] fn process_attribute<'input>( range: Range, - qname_len: u8, + qname_len: u16, eq_len: u8, prefix: &'input str, local: &'input str, @@ -917,9 +917,9 @@ fn resolve_attributes(namespaces: ShortRange, ctx: &mut Context) -> Result { ElementStart(&'input str, &'input str, usize), // ns:attr="value" - Attribute(Range, u8, u8, &'input str, &'input str, StrSpan<'input>), + Attribute(Range, u16, u8, &'input str, &'input str, StrSpan<'input>), ElementEnd(ElementEnd<'input>, Range), @@ -554,9 +554,9 @@ fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'inp // because it will blow up the binary size. let (prefix, local) = s.consume_qname()?; let qname_end = s.pos(); - let qname_len = (qname_end - start).try_into().unwrap_or(u8::MAX); + let qname_len = u16::try_from(qname_end - start).unwrap_or(u16::MAX); s.consume_eq()?; - let eq_len = (s.pos() - qname_end).try_into().unwrap_or(u8::MAX); + let eq_len = u8::try_from(s.pos() - qname_end).unwrap_or(u8::MAX); let quote = s.consume_quote()?; let quote_c = quote as char; // The attribute value must not contain the < character. diff --git a/tests/api.rs b/tests/api.rs index 4104181..85f7a39 100644 --- a/tests/api.rs +++ b/tests/api.rs @@ -158,13 +158,10 @@ fn text_pos_01() { if let Some(attr) = node.attribute_node("a") { assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 4)); assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 9)); - #[cfg(feature = "positions-extra-attr")] - { - assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 4)); - assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 5)); - assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 7)); - assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 8)); - } + assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 4)); + assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 5)); + assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 7)); + assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 8)); } // first child is a text/whitespace, not a comment @@ -191,13 +188,10 @@ fn text_pos_02() { if let Some(attr) = node.attribute_node(("http://www.w3.org", "a")) { assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 36)); assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 44)); - #[cfg(feature = "positions-extra-attr")] - { - assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 36)); - assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 40)); - assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 42)); - assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 43)); - } + assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 36)); + assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 40)); + assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 42)); + assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 43)); } } @@ -216,7 +210,7 @@ fn text_pos_03() { assert_eq!(doc.text_pos_at(node.range().end), TextPos::new(2, 5)); } -#[cfg(feature = "positions-extra-attr")] +#[cfg(feature = "positions")] #[test] fn text_pos_04() { let data = ""; @@ -234,7 +228,7 @@ fn text_pos_04() { } } -#[cfg(feature = "positions-extra-attr")] +#[cfg(feature = "positions")] #[test] fn text_pos_05() { let data = ""; @@ -252,28 +246,9 @@ fn text_pos_05() { } } -#[cfg(feature = "positions-extra-attr")] +#[cfg(feature = "positions")] #[test] fn text_pos_06() { - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 - let data = ""; - - let doc = Document::parse(data).unwrap(); - let node = doc.root_element(); - - if let Some(attr) = node.attribute_node("a") { - assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 4)); - assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 268)); - - // these are unreliable since qname.len > 255, but they still shouldn't panic - attr.range_qname(); - attr.range_value(); - } -} - -#[cfg(feature = "positions-extra-attr")] -#[test] -fn text_pos_07() { // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 let data = "";