RazrFalcon · RazrFalcon · May 23, 2024 · May 21, 2024 · May 22, 2024
diff --git a/Cargo.toml b/Cargo.toml
@@ -20,5 +20,6 @@ exclude = ["testing-tools"]
 default = ["std", "positions"]
 std = []
 # Enables Nodes and Attributes position in the original document preserving.
-# Increases memory usage by `Range<usize>` for each Node and Attribute.
+# Increases memory usage by `Range<usize>` for each Node.
+# Increases memory usage by `Range<usize>` + `u16` + `u8` for each Attribute.
 positions = []
diff --git a/src/lib.rs b/src/lib.rs
@@ -490,6 +490,10 @@ struct AttributeData<'input> {
     value: StringStorage<'input>,
     #[cfg(feature = "positions")]
     range: Range<usize>,
+    #[cfg(feature = "positions")]
+    qname_len: u16,
+    #[cfg(feature = "positions")]
+    eq_len: u8, // includes any surrounding spaces
 }
 
 /// An attribute.
@@ -587,6 +591,43 @@ impl<'a, 'input> Attribute<'a, 'input> {
     pub fn range(&self) -> Range<usize> {
         self.data.range.clone()
     }
+
+    /// Returns the byte range of the attribute's qualified name in the original document.
+    ///
+    /// ```text
+    /// <e n:attr='value'/>
+    ///    ^^^^^^
+    /// ```
+    ///
+    /// The qname length is stored as a `u16` to save memory. For a qname longer than
+    /// `u16::MAX` bytes the stored length is capped, so the returned `end` falls short.
+    #[cfg(feature = "positions")]
+    #[inline]
+    pub fn range_qname(&self) -> Range<usize> {
+        let start = self.data.range.start;
+        start..start + usize::from(self.data.qname_len)
+    }
+
+    /// Returns the byte range of the attribute's value in the original document, without the quotes.
+    ///
+    /// For an empty value `start == end`, and both point at the closing quote.
+    ///
+    /// ```text
+    /// <e n:attr='value'/>
+    ///            ^^^^^
+    /// ```
+    ///
+    /// To save memory the qname length is stored as a `u16`, and the length of the `=` sign
+    /// together with the spaces around it as a `u8`. When either length exceeds its type's
+    /// maximum the stored value is capped, so the returned `start` will be too small.
+    #[cfg(feature = "positions")]
+    #[inline]
+    pub fn range_value(&self) -> Range<usize> {
+        // All valid quotes are 1 byte: skip the opening one and stop before the closing one.
+        let prefix_len = usize::from(self.data.qname_len) + usize::from(self.data.eq_len);
+        let value_start = self.data.range.start + prefix_len + 1;
+        value_start..self.data.range.end - 1
+    }
 }
 
 impl PartialEq for Attribute<'_, '_> {

diff --git a/src/parse.rs b/src/parse.rs
@@ -353,6 +353,10 @@ struct TempAttributeData<'input> {
     local: &'input str,
     value: StringStorage<'input>,
     range: Range<usize>,
+    #[allow(unused)] // only used for feature "positions"
+    qname_len: u16,
+    #[allow(unused)] // only used for feature "positions"
+    eq_len: u8,
 }
 
 impl<'input> Document<'input> {
@@ -644,8 +648,8 @@ impl<'input> tokenizer::XmlEvents<'input> for Context<'input> {
 
                 self.after_text = false;
             }
-            tokenizer::Token::Attribute(range, prefix, local, value) => {
-                process_attribute(range, prefix, local, value, self)?;
+            tokenizer::Token::Attribute(range, qname_len, eq_len, prefix, local, value) => {
+                process_attribute(range, qname_len, eq_len, prefix, local, value, self)?;
             }
             tokenizer::Token::ElementEnd(end, range) => {
                 process_element(end, range, self)?;
@@ -666,6 +670,8 @@ impl<'input> tokenizer::XmlEvents<'input> for Context<'input> {
 #[allow(clippy::too_many_arguments)]
 fn process_attribute<'input>(
     range: Range<usize>,
+    qname_len: u16,
+    eq_len: u8,
     prefix: &'input str,
     local: &'input str,
     value: StrSpan<'input>,
@@ -732,6 +738,8 @@ fn process_attribute<'input>(
             local,
             value,
             range,
+            qname_len,
+            eq_len,
         });
     }
 
@@ -909,6 +917,10 @@ fn resolve_attributes(namespaces: ShortRange, ctx: &mut Context) -> Result<Short
             value: attr.value,
             #[cfg(feature = "positions")]
             range: attr.range,
+            #[cfg(feature = "positions")]
+            qname_len: attr.qname_len,
+            #[cfg(feature = "positions")]
+            eq_len: attr.eq_len,
         });
     }
 

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
@@ -160,7 +160,7 @@ pub enum Token<'input> {
     ElementStart(&'input str, &'input str, usize),
 
     // ns:attr="value"
-    Attribute(Range<usize>, &'input str, &'input str, StrSpan<'input>),
+    Attribute(Range<usize>, u16, u8, &'input str, &'input str, StrSpan<'input>),
 
     ElementEnd(ElementEnd<'input>, Range<usize>),
 
@@ -553,7 +553,10 @@ fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'inp
                 // We cannot mark `parse_attribute` as `#[inline(always)]`
                 // because it will blow up the binary size.
                 let (prefix, local) = s.consume_qname()?;
+                let qname_end = s.pos();
+                let qname_len = u16::try_from(qname_end - start).unwrap_or(u16::MAX);
                 s.consume_eq()?;
+                let eq_len = u8::try_from(s.pos() - qname_end).unwrap_or(u8::MAX);
                 let quote = s.consume_quote()?;
                 let quote_c = quote as char;
                 // The attribute value must not contain the < character.
@@ -562,7 +565,7 @@ fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'inp
                 let value = s.slice_back_span(value_start);
                 s.consume_byte(quote)?;
                 let end = s.pos();
-                events.token(Token::Attribute(start..end, prefix, local, value))?;
+                events.token(Token::Attribute(start..end, qname_len, eq_len, prefix, local, value))?;
             }
         }
     }

diff --git a/src/tokenizer_tests.rs b/src/tokenizer_tests.rs
@@ -90,7 +90,7 @@ impl<'a> xml::XmlEvents<'a> for EventsCollector<'a> {
             xml::Token::ElementStart(prefix, local, start) => {
                 Token::ElementStart(prefix, local, start)
             }
-            xml::Token::Attribute(_, prefix, local, value) => {
+            xml::Token::Attribute(_, _, _, prefix, local, value) => {
                 Token::Attribute(prefix, local, value.as_str())
             }
             xml::Token::ElementEnd(end, range) => Token::ElementEnd(

diff --git a/tests/api.rs b/tests/api.rs
@@ -158,6 +158,10 @@ fn text_pos_01() {
     if let Some(attr) = node.attribute_node("a") {
         assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 4));
         assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 9));
+        assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 4));
+        assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 5));
+        assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 7));
+        assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 8));
     }
 
     // first child is a text/whitespace, not a comment
@@ -184,6 +188,10 @@ fn text_pos_02() {
     if let Some(attr) = node.attribute_node(("http://www.w3.org", "a")) {
         assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 36));
         assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 44));
+        assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 36));
+        assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 40));
+        assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 42));
+        assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 43));
     }
 }
 
@@ -202,6 +210,60 @@ fn text_pos_03() {
     assert_eq!(doc.text_pos_at(node.range().end), TextPos::new(2, 5));
 }
 
+#[cfg(feature = "positions")]
+#[test]
+fn text_pos_04() {
+    let data = "<n1:e xmlns:n1='http://www.w3.org' n1:a=''/>";
+
+    let doc = Document::parse(data).unwrap();
+    let node = doc.root_element();
+
+    // `n1:a` is namespaced, so look it up by expanded name; unwrap so the checks cannot be skipped.
+    let attr = node.attribute_node(("http://www.w3.org", "a")).unwrap();
+    assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 36));
+    assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 43));
+    assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 36));
+    assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 40));
+    assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 42));
+    assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 42));
+}
+
+#[cfg(feature = "positions")]
+#[test]
+fn text_pos_05() {
+    let data = "<n1:e xmlns:n1='http://www.w3.org' n1:a  =   'b'/>";
+
+    let doc = Document::parse(data).unwrap();
+    let node = doc.root_element();
+
+    // `n1:a` is namespaced, so look it up by expanded name; unwrap so the checks cannot be skipped.
+    let attr = node.attribute_node(("http://www.w3.org", "a")).unwrap();
+    assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 36));
+    assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 49)); // one past the closing quote
+    assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 36));
+    assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 40));
+    assert_eq!(doc.text_pos_at(attr.range_value().start), TextPos::new(1, 47));
+    assert_eq!(doc.text_pos_at(attr.range_value().end), TextPos::new(1, 48)); // the closing quote
+}
+
+#[cfg(feature = "positions")]
+#[test]
+fn text_pos_06() {
+    //              0         1         2         3         4         5         6         7         8         9        10        11        12        13        14        15        16        17        18        19        20        21        22        23        24        25        26
+    let data = "<e a                                                                                                    =                                                                                                                                                                'b'/>";
+
+    let doc = Document::parse(data).unwrap();
+    let node = doc.root_element();
+
+    if let Some(attr) = node.attribute_node("a") {
+        assert_eq!(doc.text_pos_at(attr.range().start), TextPos::new(1, 4));
+        assert_eq!(doc.text_pos_at(attr.range().end), TextPos::new(1, 269));
+        assert_eq!(doc.text_pos_at(attr.range_qname().start), TextPos::new(1, 4));
+        assert_eq!(doc.text_pos_at(attr.range_qname().end), TextPos::new(1, 5));
+        attr.range_value(); // unreliable since >254 spaces around equal sign, but still shouldn't panic
+    }
+}
+
 #[test]
 fn next_sibling_element_01() {
     let data = "<root><a/><b/><c/></root>";