From 4f5ce0d69919bbb5f4dd4563c6d934aa20fc15a8 Mon Sep 17 00:00:00 2001
From: z0017k9m <yong1.zhang@nokia-sbell.com>
Date: Wed, 25 Sep 2024 10:12:43 +0800
Subject: [PATCH 01/18] Add normalize_line_end for unescape and test

---
 src/escape.rs   | 37 +++++++++++++++++++++++++++++++++----
 tests/escape.rs | 21 +++++++++++++++++++++
 2 files changed, 54 insertions(+), 4 deletions(-)
diff --git a/src/escape.rs b/src/escape.rs
index 76c7f7e6..ecfaf76b 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -1,6 +1,6 @@
 //! Manage xml character escapes
 
-use memchr::memchr2_iter;
+use memchr::{memchr2_iter, memchr_iter};
 use std::borrow::Cow;
 use std::num::ParseIntError;
 use std::ops::Range;
@@ -288,13 +288,42 @@ where
         }
     }
 
-    if let Some(mut unescaped) = unescaped {
+    let res = if let Some(mut unescaped) = unescaped {
         if let Some(raw) = raw.get(last_end..) {
             unescaped.push_str(raw);
         }
-        Ok(Cow::Owned(unescaped))
+        Cow::Owned(unescaped)
     } else {
-        Ok(Cow::Borrowed(raw))
+        Cow::Borrowed(raw)
+    };
+    Ok(normalize_line_end(res))
+}
+
+/// Normalize the line end, replace \r or \r\n with \n.
+#[inline]
+fn normalize_line_end(input: Cow<str>) -> Cow<str> {
+    let bytes = input.as_bytes();
+    let mut normalized = None;
+    let mut start = 0;
+    let iter = memchr_iter(b'\r', bytes);
+    for i in iter {
+        if normalized.is_none() {
+            normalized = Some(String::with_capacity(input.len()))
+        }
+        let normalized = normalized.as_mut().expect("initialized");
+        normalized.push_str(&input[start..i]);
+        normalized.push('\n');
+        start = i + 1;
+        if matches!(bytes.get(start), Some(&c) if c == b'\n') {
+            // \n right after \r, \r\n case, skip \n because we have already replaced \r with \n
+            start += 1;
+        }
+    }
+    if let Some(mut normalized) = normalized {
+        normalized.push_str(&input[start..]);
+        Cow::Owned(normalized)
+    } else {
+        input
     }
 }
 
diff --git a/tests/escape.rs b/tests/escape.rs
index 894c97c2..df5b648a 100644
--- a/tests/escape.rs
+++ b/tests/escape.rs
@@ -75,6 +75,27 @@ fn unescape() {
     );
 }
 
+#[test]
+fn unescape_line_end() {
+    let unchanged = escape::unescape("test\n");
+    // assert_eq does not check that Cow is borrowed, but we explicitly use Cow
+    // because it influences diff
+    // TODO: use assert_matches! when stabilized and other features will bump MSRV
+    assert_eq!(unchanged, Ok(Cow::Borrowed("test\n")));
+    assert!(matches!(unchanged, Ok(Cow::Borrowed(_))));
+
+    assert_eq!(
+        escape::unescape("&lt;&amp;test&apos;\r&quot;\r\n&gt;\r\n\r"),
+        Ok("<&test'\n\"\n>\n\n".into())
+    );
+    assert_eq!(escape::unescape("&#x30;\r\r\n"), Ok("0\n\n".into()));
+    assert_eq!(escape::unescape("\r&#48;\n\r\r"), Ok("\n0\n\n\n".into()));
+    assert_eq!(
+        escape::unescape("\r\n&foo;\n"),
+        Err(EscapeError::UnrecognizedEntity(3..6, "foo".into()))
+    );
+}
+
 /// XML allows any number of leading zeroes. That is not explicitly mentioned
 /// in the specification, but enforced by the conformance test suite
 /// (https://www.w3.org/XML/Test/)

From 45a63c549b802e2989d93124b9ae18e857a584de Mon Sep 17 00:00:00 2001
From: z0017k9m <yong1.zhang@nokia-sbell.com>
Date: Wed, 25 Sep 2024 13:51:39 +0800
Subject: [PATCH 02/18] Change normalize_line_end function parameter type to
 allow more types as input, e.g. &str

---
 src/escape.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/escape.rs b/src/escape.rs
index ecfaf76b..1843f084 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -301,7 +301,8 @@ where
 
 /// Normalize the line end, replace \r or \r\n with \n.
 #[inline]
-fn normalize_line_end(input: Cow<str>) -> Cow<str> {
+fn normalize_line_end<'a>(input: impl Into<Cow<'a, str>>) -> Cow<'a, str> {
+    let input = input.into();
     let bytes = input.as_bytes();
     let mut normalized = None;
     let mut start = 0;

From 1c2ddc526c64310155433c06c2342e152f90746b Mon Sep 17 00:00:00 2001
From: yorkz1994 <york-z@outlook.com>
Date: Wed, 25 Sep 2024 21:51:17 +0800
Subject: [PATCH 03/18] Update src/escape.rs

Co-authored-by: Mingun <Alexander_Sergey@mail.ru>
---
 src/escape.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/escape.rs b/src/escape.rs
index 1843f084..81111f27 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -301,7 +301,7 @@ where
 
 /// Normalize the line end, replace \r or \r\n with \n.
 #[inline]
-fn normalize_line_end<'a>(input: impl Into<Cow<'a, str>>) -> Cow<'a, str> {
+fn normalize_line_end<'a>(input: Cow<'a, str>) -> Cow<'a, str> {
     let input = input.into();
     let bytes = input.as_bytes();
     let mut normalized = None;

From 139b2cb067e0b003dcbe3a2150b171cba9175532 Mon Sep 17 00:00:00 2001
From: z0017k9m <yong1.zhang@nokia-sbell.com>
Date: Wed, 25 Sep 2024 21:57:19 +0800
Subject: [PATCH 04/18] change parameter type from generic to Cow

---
 src/escape.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/escape.rs b/src/escape.rs
index 81111f27..ecfaf76b 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -301,8 +301,7 @@ where
 
 /// Normalize the line end, replace \r or \r\n with \n.
 #[inline]
-fn normalize_line_end<'a>(input: Cow<'a, str>) -> Cow<'a, str> {
-    let input = input.into();
+fn normalize_line_end(input: Cow<str>) -> Cow<str> {
     let bytes = input.as_bytes();
     let mut normalized = None;
     let mut start = 0;

From 6962ae31d548dd7027b9bd196960961d6497e636 Mon Sep 17 00:00:00 2001
From: z0017k9m <yong1.zhang@nokia-sbell.com>
Date: Thu, 26 Sep 2024 09:18:17 +0800
Subject: [PATCH 05/18] Add issue 806 to change log.

---
 Changelog.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Changelog.md b/Changelog.md
index f2528b0c..e36c2b96 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -15,6 +15,11 @@
 
 ### New Features
 
+- [#806]: Added `normalize_line_end` that normalizes line end
+  from original XML data during unescape process.
+
+[#806]: https://github.com/tafia/quick-xml/issues/806
+
 ### Bug Fixes
 
 ### Misc Changes

From 11320adaf560d2f1992668f7380166abd922d5f9 Mon Sep 17 00:00:00 2001
From: z0017k9m <yong1.zhang@nokia-sbell.com>
Date: Thu, 26 Sep 2024 09:18:27 +0800
Subject: [PATCH 06/18] Fix regression test failure due to line end
 normalization

---
 src/escape.rs | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/escape.rs b/src/escape.rs
index ecfaf76b..9b2d1a6a 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -266,7 +266,7 @@ where
                     unescaped = Some(String::with_capacity(raw.len()));
                 }
                 let unescaped = unescaped.as_mut().expect("initialized");
-                unescaped.push_str(&raw[last_end..start]);
+                unescaped.push_str(&normalize_line_end(&raw[last_end..start]));
 
                 // search for character correctness
                 let pat = &raw[start + 1..end];
@@ -288,20 +288,19 @@ where
         }
     }
 
-    let res = if let Some(mut unescaped) = unescaped {
+    if let Some(mut unescaped) = unescaped {
         if let Some(raw) = raw.get(last_end..) {
-            unescaped.push_str(raw);
+            unescaped.push_str(&normalize_line_end(raw));
         }
-        Cow::Owned(unescaped)
+        Ok(Cow::Owned(unescaped))
     } else {
-        Cow::Borrowed(raw)
-    };
-    Ok(normalize_line_end(res))
+        Ok(Cow::Borrowed(raw))
+    }
 }
 
 /// Normalize the line end, replace \r or \r\n with \n.
 #[inline]
-fn normalize_line_end(input: Cow<str>) -> Cow<str> {
+fn normalize_line_end(input: &str) -> Cow<str> {
     let bytes = input.as_bytes();
     let mut normalized = None;
     let mut start = 0;
@@ -323,7 +322,7 @@ fn normalize_line_end(input: Cow<str>) -> Cow<str> {
         normalized.push_str(&input[start..]);
         Cow::Owned(normalized)
     } else {
-        input
+        input.into()
     }
 }
 

From 0bb282e74804df876d2bab92cb370b0f32f4352f Mon Sep 17 00:00:00 2001
From: yorkz1994 <york-z@outlook.com>
Date: Thu, 26 Sep 2024 21:29:18 +0800
Subject: [PATCH 07/18] Update Changelog.md

Co-authored-by: Mingun <Alexander_Sergey@mail.ru>
---
 Changelog.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Changelog.md b/Changelog.md
index e36c2b96..962e4e57 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -15,8 +15,7 @@
 
 ### New Features
 
-- [#806]: Added `normalize_line_end` that normalizes line end
-  from original XML data during unescape process.
+- [#806]: Normalize line ends during the unescape process.
 
 [#806]: https://github.com/tafia/quick-xml/issues/806
 

From f970370ff64ee431094ad667620df8211a5e7656 Mon Sep 17 00:00:00 2001
From: z0017k9m <yong1.zhang@nokia-sbell.com>
Date: Fri, 27 Sep 2024 20:56:52 +0800
Subject: [PATCH 08/18] Still need to normalize if nothing unescaped

---
 src/escape.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/escape.rs b/src/escape.rs
index 9b2d1a6a..a9f4c2fb 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -294,7 +294,7 @@ where
         }
         Ok(Cow::Owned(unescaped))
     } else {
-        Ok(Cow::Borrowed(raw))
+        Ok(normalize_line_end(raw))
     }
 }
 

From d11c7563ee0b091c659052c9ae603767c234d61d Mon Sep 17 00:00:00 2001
From: z0017k9m <yong1.zhang@nokia-sbell.com>
Date: Fri, 27 Sep 2024 20:59:38 +0800
Subject: [PATCH 09/18] Add line ends test for attribute and text

---
 tests/reader-attributes.rs | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/tests/reader-attributes.rs b/tests/reader-attributes.rs
index 8d51b22a..5743f974 100644
--- a/tests/reader-attributes.rs
+++ b/tests/reader-attributes.rs
@@ -1,7 +1,10 @@
 use std::borrow::Cow;
 
 use quick_xml::events::attributes::Attribute;
-use quick_xml::events::{BytesEnd, Event::*};
+use quick_xml::events::{
+    BytesEnd,
+    Event::{self, *},
+};
 use quick_xml::name::QName;
 use quick_xml::reader::Reader;
 
@@ -159,3 +162,31 @@ fn equal_sign_in_value() {
         e => panic!("Expecting Empty event, got {:?}", e),
     }
 }
+
+#[test]
+fn line_ends() {
+    const XML: &str = "<root attribute=\"\r\r\n\nvalue1\r\r\n\nvalue2\r\r\n\n\">\r\r\n\nvalue3\r\r\n\nvalue4\r\r\n\n</root>";
+    let mut reader = Reader::from_str(XML);
+    match reader.read_event().unwrap() {
+        Event::Start(event) => {
+            let mut iter = event.attributes();
+            let a = iter.next().unwrap().unwrap();
+            #[cfg(not(feature = "encoding"))]
+            assert_eq!(
+                a.unescape_value().unwrap(),
+                "\n\n\nvalue1\n\n\nvalue2\n\n\n"
+            );
+            assert_eq!(
+                a.decode_and_unescape_value(reader.decoder()).unwrap(),
+                "\n\n\nvalue1\n\n\nvalue2\n\n\n"
+            );
+        }
+        event => panic!("Expected Start, found {:?}", event),
+    }
+    match reader.read_event().unwrap() {
+        Event::Text(event) => {
+            assert_eq!(event.unescape().unwrap(), "\n\n\nvalue3\n\n\nvalue4\n\n\n")
+        }
+        event => panic!("Expected Text, found {:?}", event),
+    }
+}

From d3ee1adb5493d33027d849e161cb6b575dc0d6c4 Mon Sep 17 00:00:00 2001
From: z0017k9m <yong1.zhang@nokia-sbell.com>
Date: Fri, 27 Sep 2024 21:55:33 +0800
Subject: [PATCH 10/18] roundtrip test cannot include \r

---
 tests/serde-se.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/serde-se.rs b/tests/serde-se.rs
index 39f6e66a..860530ee 100644
--- a/tests/serde-se.rs
+++ b/tests/serde-se.rs
@@ -1955,9 +1955,9 @@ mod with_root {
             <root>3</root>");
     serialize_as!(tuple:
         // Use to_string() to get owned type that is required for deserialization
-        ("<\"&'>".to_string(), "with\t\r\n spaces", 3usize)
+        ("<\"&'>".to_string(), "with\t\n spaces", 3usize)
         => "<root>&lt;\"&amp;'&gt;</root>\
-            <root>with\t\r\n spaces</root>\
+            <root>with\t\n spaces</root>\
             <root>3</root>");
     serialize_as!(tuple_struct:
         Tuple(42.0, "answer")

From cbf200fbe8b237b8c53ea0a3eb95174c16ff9e60 Mon Sep 17 00:00:00 2001
From: yorkz1994 <york-z@outlook.com>
Date: Sat, 28 Sep 2024 11:05:54 +0800
Subject: [PATCH 11/18] Update tests/reader-attributes.rs

Co-authored-by: Mingun <Alexander_Sergey@mail.ru>
---
 tests/reader-attributes.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/reader-attributes.rs b/tests/reader-attributes.rs
index 5743f974..57d1935c 100644
--- a/tests/reader-attributes.rs
+++ b/tests/reader-attributes.rs
@@ -184,7 +184,7 @@ fn line_ends() {
         event => panic!("Expected Start, found {:?}", event),
     }
     match reader.read_event().unwrap() {
-        Event::Text(event) => {
+        Text(event) => {
             assert_eq!(event.unescape().unwrap(), "\n\n\nvalue3\n\n\nvalue4\n\n\n")
         }
         event => panic!("Expected Text, found {:?}", event),

From e4f83a5d26a239746ad6cdb5c6bbe975e3e350c7 Mon Sep 17 00:00:00 2001
From: yorkz1994 <york-z@outlook.com>
Date: Sat, 28 Sep 2024 11:06:37 +0800
Subject: [PATCH 12/18] Update tests/reader-attributes.rs

Co-authored-by: Mingun <Alexander_Sergey@mail.ru>
---
 tests/reader-attributes.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/reader-attributes.rs b/tests/reader-attributes.rs
index 57d1935c..c27d8af0 100644
--- a/tests/reader-attributes.rs
+++ b/tests/reader-attributes.rs
@@ -168,7 +168,7 @@ fn line_ends() {
     const XML: &str = "<root attribute=\"\r\r\n\nvalue1\r\r\n\nvalue2\r\r\n\n\">\r\r\n\nvalue3\r\r\n\nvalue4\r\r\n\n</root>";
     let mut reader = Reader::from_str(XML);
     match reader.read_event().unwrap() {
-        Event::Start(event) => {
+        Start(event) => {
             let mut iter = event.attributes();
             let a = iter.next().unwrap().unwrap();
             #[cfg(not(feature = "encoding"))]

From cf99a45be719f142f89dd2fe06f3e0b23f8fbf8c Mon Sep 17 00:00:00 2001
From: yorkz1994 <york-z@outlook.com>
Date: Sat, 28 Sep 2024 17:26:15 +0800
Subject: [PATCH 13/18] Update src/escape.rs

Co-authored-by: Mingun <Alexander_Sergey@mail.ru>
---
 src/escape.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/escape.rs b/src/escape.rs
index a9f4c2fb..e28a0d9a 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -313,7 +313,7 @@ fn normalize_line_end(input: &str) -> Cow<str> {
         normalized.push_str(&input[start..i]);
         normalized.push('\n');
         start = i + 1;
-        if matches!(bytes.get(start), Some(&c) if c == b'\n') {
+        if let Some(&'\n') = bytes.get(start) {
             // \n right after \r, \r\n case, skip \n because we have already replaced \r with \n
             start += 1;
         }

From 628dce7da3e8362e79900e6e0c777b3f560ecbcf Mon Sep 17 00:00:00 2001
From: z0017k9m <yong1.zhang@nokia-sbell.com>
Date: Sat, 28 Sep 2024 11:15:07 +0800
Subject: [PATCH 14/18] Add comment on why \r is not included in roundtrip test.

---
 tests/serde-se.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/serde-se.rs b/tests/serde-se.rs
index 860530ee..1ac36450 100644
--- a/tests/serde-se.rs
+++ b/tests/serde-se.rs
@@ -1955,6 +1955,7 @@ mod with_root {
             <root>3</root>");
     serialize_as!(tuple:
         // Use to_string() to get owned type that is required for deserialization
+        // Note: \r cannot be included in the whitespace characters because we perform line end normalization.
         ("<\"&'>".to_string(), "with\t\n spaces", 3usize)
         => "<root>&lt;\"&amp;'&gt;</root>\
             <root>with\t\n spaces</root>\

From 44df9bfbfa9d84cbf21ac6ec27e245867545c616 Mon Sep 17 00:00:00 2001
From: z0017k9m <yong1.zhang@nokia-sbell.com>
Date: Sat, 28 Sep 2024 11:22:07 +0800
Subject: [PATCH 15/18] Remove unused import

---
 tests/reader-attributes.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/reader-attributes.rs b/tests/reader-attributes.rs
index c27d8af0..327faad3 100644
--- a/tests/reader-attributes.rs
+++ b/tests/reader-attributes.rs
@@ -3,7 +3,7 @@ use std::borrow::Cow;
 use quick_xml::events::attributes::Attribute;
 use quick_xml::events::{
     BytesEnd,
-    Event::{self, *},
+    Event::*,
 };
 use quick_xml::name::QName;
 use quick_xml::reader::Reader;

From 4e1eefc2d9531b15cea197be067a9e2838d00e63 Mon Sep 17 00:00:00 2001
From: z0017k9m <yong1.zhang@nokia-sbell.com>
Date: Sat, 28 Sep 2024 19:18:24 +0800
Subject: [PATCH 16/18] Missing b in byte literal

---
 src/escape.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/escape.rs b/src/escape.rs
index e28a0d9a..ce3a84de 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -313,7 +313,7 @@ fn normalize_line_end(input: &str) -> Cow<str> {
         normalized.push_str(&input[start..i]);
         normalized.push('\n');
         start = i + 1;
-        if let Some(&'\n') = bytes.get(start) {
+        if let Some(&b'\n') = bytes.get(start) {
             // \n right after \r, \r\n case, skip \n because we have already replaced \r with \n
             start += 1;
         }

From ecfeefc6245c70e5fc3d4df37802f38cba4dbc9b Mon Sep 17 00:00:00 2001
From: z0017k9m <yong1.zhang@nokia-sbell.com>
Date: Sun, 29 Sep 2024 17:05:19 +0800
Subject: [PATCH 17/18] Add some \r in unescape_text/mixed

---
 benches/microbenches.rs | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/benches/microbenches.rs b/benches/microbenches.rs
index 2f4ece04..bf64f131 100644
--- a/benches/microbenches.rs
+++ b/benches/microbenches.rs
@@ -327,18 +327,18 @@ fn unescaping(c: &mut Criterion) {
 
     group.bench_function("mixed", |b| {
         let text =
-"Lorem ipsum dolor sit amet, &amp;consectetur adipiscing elit, sed do eiusmod tempor incididunt
-ut labore et dolore magna aliqua. Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.
-Risus ultricies &quot;tristique nulla aliquet enim tortor&quot; at. Fermentum odio eu feugiat pretium
-nibh ipsum. Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu
-cursus euismod quis &#60;viverra nibh cras pulvinar mattis. Sed viverra tellus in hac habitasse platea.
+"Lorem ipsum dolor sit amet, &amp;consectetur adipiscing elit, sed do eiusmod tempor incididunt\r
+ut labore et dolore magna aliqua. Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.\r
+Risus ultricies &quot;tristique nulla aliquet enim tortor&quot; at. Fermentum odio eu feugiat pretium\r
+nibh ipsum. Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu\r
+cursus euismod quis &#60;viverra nibh cras pulvinar mattis. Sed viverra tellus in hac habitasse platea.\r
 Quis commodo odio aenean sed. Cursus in hac habitasse platea dictumst quisque sagittis purus.
 
-Neque convallis a cras semper auctor. Sit amet mauris commodo quis imperdiet massa. Ac ut consequat
-semper viverra nam libero justo &#35; laoreet sit. Adipiscing commodo elit at imperdiet dui accumsan.
-Enim lobortis scelerisque fermentum dui faucibus in ornare. Natoque penatibus et magnis dis parturient
-montes nascetur ridiculus mus. At lectus urna &#33;duis convallis convallis tellus id interdum. Libero
-volutpat sed cras ornare arcu dui vivamus arcu. Cursus in hac habitasse platea dictumst quisque sagittis
+Neque convallis a cras semper auctor. Sit amet mauris commodo quis imperdiet massa. Ac ut consequat\r
+semper viverra nam libero justo &#35; laoreet sit. Adipiscing commodo elit at imperdiet dui accumsan.\r
+Enim lobortis scelerisque fermentum dui faucibus in ornare. Natoque penatibus et magnis dis parturient\r
+montes nascetur ridiculus mus. At lectus urna &#33;duis convallis convallis tellus id interdum. Libero\r
+volutpat sed cras ornare arcu dui vivamus arcu. Cursus in hac habitasse platea dictumst quisque sagittis\r
 purus. Consequat id porta nibh venenatis cras sed felis.";
 
         b.iter(|| {

From d880edc4d1f51bcdd91019666cdb041262e94a8b Mon Sep 17 00:00:00 2001
From: z0017k9m <yong1.zhang@nokia-sbell.com>
Date: Sun, 29 Sep 2024 17:07:32 +0800
Subject: [PATCH 18/18] Normalizing more line ends, no allocation

---
 src/escape.rs              | 93 +++++++++++++++++++++++++++-----------
 tests/escape.rs            | 18 ++++++++
 tests/reader-attributes.rs |  8 ++--
 3 files changed, 89 insertions(+), 30 deletions(-)

diff --git a/src/escape.rs b/src/escape.rs
index ce3a84de..9f7a6439 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -1,6 +1,6 @@
 //! Manage xml character escapes
 
-use memchr::{memchr2_iter, memchr_iter};
+use memchr::{memchr2_iter, memchr3_iter};
 use std::borrow::Cow;
 use std::num::ParseIntError;
 use std::ops::Range;
@@ -266,7 +266,9 @@ where
                     unescaped = Some(String::with_capacity(raw.len()));
                 }
                 let unescaped = unescaped.as_mut().expect("initialized");
-                unescaped.push_str(&normalize_line_end(&raw[last_end..start]));
+                for normalized in normalize_line_end_iter(&raw[last_end..start]) {
+                    unescaped.push_str(normalized);
+                }
 
                 // search for character correctness
                 let pat = &raw[start + 1..end];
@@ -290,40 +292,79 @@ where
 
     if let Some(mut unescaped) = unescaped {
         if let Some(raw) = raw.get(last_end..) {
-            unescaped.push_str(&normalize_line_end(raw));
+            for normalized in normalize_line_end_iter(raw) {
+                unescaped.push_str(normalized);
+            }
         }
         Ok(Cow::Owned(unescaped))
     } else {
-        Ok(normalize_line_end(raw))
+        let mut norm_iter = normalize_line_end_iter(raw);
+        // If the first chunk is the whole input, nothing was normalized: stay borrowed.
+        match norm_iter.next() {
+            Some(normalized) if normalized.len() != raw.len() => {
+                let mut s = String::with_capacity(raw.len());
+                s.push_str(normalized);
+                // Resume the SAME iterator; restarting from `raw` would emit
+                // the first chunk twice (e.g. "a\rb" would become "aa\nb").
+                s.extend(norm_iter);
+                Ok(s.into())
+            }
+            _ => Ok(raw.into()),
+        }
     }
 }
 
-/// Normalize the line end, replace \r or \r\n with \n.
+/// Splits `input` into chunks in which every line end is normalized to `\n`, as described in
+/// the [line end spec](https://www.w3.org/TR/xml11/#sec-line-ends). Each of the following
+/// 5 combinations of EOL characters is replaced with a single `"\n"` chunk:
+///
+/// - `\r\n`
+/// - `\r\u0085` (UTF-8: `\r\xC2\x85`)
+/// - `\r`
+/// - `\u0085` (UTF-8: `\xC2\x85`)
+/// - `\u2028` (UTF-8: `\xE2\x80\xA8`)
+///
+/// An iterator is used to avoid allocation while normalizing. If a non-empty input has nothing
+/// to normalize, its first item is the whole input, so the caller can skip copying entirely.
 #[inline]
-fn normalize_line_end(input: &str) -> Cow<str> {
+fn normalize_line_end_iter(input: &str) -> impl Iterator<Item = &str> {
     let bytes = input.as_bytes();
-    let mut normalized = None;
-    let mut start = 0;
-    let iter = memchr_iter(b'\r', bytes);
-    for i in iter {
-        if normalized.is_none() {
-            normalized = Some(String::with_capacity(input.len()))
+    let len = input.len();
+    let mut cursor = 0;
+    let mut iter = memchr3_iter(b'\r', b'\xC2', b'\xE2', bytes);
+    let mut temp = None;
+    std::iter::from_fn(move || {
+        if let Some(v) = temp.take() {
+            return Some(v);
         }
-        let normalized = normalized.as_mut().expect("initialized");
-        normalized.push_str(&input[start..i]);
-        normalized.push('\n');
-        start = i + 1;
-        if let Some(&b'\n') = bytes.get(start) {
-            // \n right after \r, \r\n case, skip \n because we have already replaced \r with \n
-            start += 1;
+        loop {
+            if let Some(p) = iter.next() {
+                if p < cursor {
+                    // already normalized in previous iteration, this position is invalid
+                    continue;
+                }
+                let skips = match &bytes[p..] {
+                    [b'\r', b'\n', ..] => 2,
+                    [b'\r', b'\xC2', b'\x85', ..] => 3,
+                    [b'\r', ..] => 1,
+                    [b'\xC2', b'\x85', ..] => 2,
+                    [b'\xE2', b'\x80', b'\xA8', ..] => 3,
+                    _ => continue,
+                };
+                // normalized
+                temp = Some("\n");
+                let start = cursor;
+                cursor = p + skips;
+                break Some(&input[start..p]);
+            } else if cursor < len {
+                let start = cursor;
+                cursor = len;
+                break Some(&input[start..]);
+            } else {
+                break None;
+            }
         }
-    }
-    if let Some(mut normalized) = normalized {
-        normalized.push_str(&input[start..]);
-        Cow::Owned(normalized)
-    } else {
-        input.into()
-    }
+    })
 }
 
 /// Resolves predefined XML entities or all HTML5 entities depending on the feature
diff --git a/tests/escape.rs b/tests/escape.rs
index df5b648a..90066e00 100644
--- a/tests/escape.rs
+++ b/tests/escape.rs
@@ -94,6 +94,24 @@ fn unescape_line_end() {
         escape::unescape("\r\n&foo;\n"),
         Err(EscapeError::UnrecognizedEntity(3..6, "foo".into()))
     );
+
+    assert_eq!(
+        escape::unescape("&lt;&amp;test&apos;\u{0085}\r\r\u{0085}\u{2028}&quot;\r\n&gt;\r\n\r"),
+        Ok("<&test'\n\n\n\n\"\n>\n\n".into())
+    );
+    assert_eq!(
+        escape::unescape("&#x30;\r\r\n\u{0085}"),
+        Ok("0\n\n\n".into())
+    );
+    assert_eq!(
+        escape::unescape("\r&#48;\n\r\r\u{2028}"),
+        Ok("\n0\n\n\n\n".into())
+    );
+    assert_eq!(escape::unescape("\r\r\u{0085}\n\n"), Ok("\n\n\n\n".into()));
+    assert_eq!(
+        escape::unescape("\r\n&foo;\n\u{2028}"),
+        Err(EscapeError::UnrecognizedEntity(3..6, "foo".into()))
+    );
 }
 
 /// XML allows any number of leading zeroes. That is not explicitly mentioned
diff --git a/tests/reader-attributes.rs b/tests/reader-attributes.rs
index 327faad3..23c53491 100644
--- a/tests/reader-attributes.rs
+++ b/tests/reader-attributes.rs
@@ -165,7 +165,7 @@ fn equal_sign_in_value() {
 
 #[test]
 fn line_ends() {
-    const XML: &str = "<root attribute=\"\r\r\n\nvalue1\r\r\n\nvalue2\r\r\n\n\">\r\r\n\nvalue3\r\r\n\nvalue4\r\r\n\n</root>";
+    const XML: &str = "<root attribute=\"\r\r\u{0085}\n\n\u{0085}\u{2028}value1\r\r\u{0085}\n\n\u{0085}\u{2028}value2\r\r\u{0085}\n\n\u{0085}\u{2028}\">\r\r\u{0085}\n\n\u{0085}\u{2028}value3\r\r\u{0085}\n\n\u{0085}\u{2028}value4\r\r\u{0085}\n\n\u{0085}\u{2028}</root>";
     let mut reader = Reader::from_str(XML);
     match reader.read_event().unwrap() {
         Start(event) => {
@@ -174,18 +174,18 @@ fn line_ends() {
             #[cfg(not(feature = "encoding"))]
             assert_eq!(
                 a.unescape_value().unwrap(),
-                "\n\n\nvalue1\n\n\nvalue2\n\n\n"
+                "\n\n\n\n\n\nvalue1\n\n\n\n\n\nvalue2\n\n\n\n\n\n"
             );
             assert_eq!(
                 a.decode_and_unescape_value(reader.decoder()).unwrap(),
-                "\n\n\nvalue1\n\n\nvalue2\n\n\n"
+                "\n\n\n\n\n\nvalue1\n\n\n\n\n\nvalue2\n\n\n\n\n\n"
             );
         }
         event => panic!("Expected Start, found {:?}", event),
     }
     match reader.read_event().unwrap() {
         Text(event) => {
-            assert_eq!(event.unescape().unwrap(), "\n\n\nvalue3\n\n\nvalue4\n\n\n")
+            assert_eq!(event.unescape().unwrap(), "\n\n\n\n\n\nvalue3\n\n\n\n\n\nvalue4\n\n\n\n\n\n")
         }
         event => panic!("Expected Text, found {:?}", event),
     }