From 4f5ce0d69919bbb5f4dd4563c6d934aa20fc15a8 Mon Sep 17 00:00:00 2001 From: z0017k9m Date: Wed, 25 Sep 2024 10:12:43 +0800 Subject: [PATCH 01/18] Add normalize_line_end for unescape and test --- src/escape.rs | 37 +++++++++++++++++++++++++++++++++---- tests/escape.rs | 21 +++++++++++++++++++++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/src/escape.rs b/src/escape.rs index 76c7f7e6..ecfaf76b 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -1,6 +1,6 @@ //! Manage xml character escapes -use memchr::memchr2_iter; +use memchr::{memchr2_iter, memchr_iter}; use std::borrow::Cow; use std::num::ParseIntError; use std::ops::Range; @@ -288,13 +288,42 @@ where } } - if let Some(mut unescaped) = unescaped { + let res = if let Some(mut unescaped) = unescaped { if let Some(raw) = raw.get(last_end..) { unescaped.push_str(raw); } - Ok(Cow::Owned(unescaped)) + Cow::Owned(unescaped) } else { - Ok(Cow::Borrowed(raw)) + Cow::Borrowed(raw) + }; + Ok(normalize_line_end(res)) +} + +/// Normalize the line end, replace \r or \r\n with \n. 
+#[inline] +fn normalize_line_end(input: Cow) -> Cow { + let bytes = input.as_bytes(); + let mut normalized = None; + let mut start = 0; + let iter = memchr_iter(b'\r', bytes); + for i in iter { + if normalized.is_none() { + normalized = Some(String::with_capacity(input.len())) + } + let normalized = normalized.as_mut().expect("initialized"); + normalized.push_str(&input[start..i]); + normalized.push('\n'); + start = i + 1; + if matches!(bytes.get(start), Some(&c) if c == b'\n') { + // \n right after \r, \r\n case, skip \n because we have already replaced \r with \n + start += 1; + } + } + if let Some(mut normalized) = normalized { + normalized.push_str(&input[start..]); + Cow::Owned(normalized) + } else { + input } } diff --git a/tests/escape.rs b/tests/escape.rs index 894c97c2..df5b648a 100644 --- a/tests/escape.rs +++ b/tests/escape.rs @@ -75,6 +75,27 @@ fn unescape() { ); } +#[test] +fn unescape_line_end() { + let unchanged = escape::unescape("test\n"); + // assert_eq does not check that Cow is borrowed, but we explicitly use Cow + // because it influences diff + // TODO: use assert_matches! when stabilized and other features will bump MSRV + assert_eq!(unchanged, Ok(Cow::Borrowed("test\n"))); + assert!(matches!(unchanged, Ok(Cow::Borrowed(_)))); + + assert_eq!( + escape::unescape("<&test'\r"\r\n>\r\n\r"), + Ok("<&test'\n\"\n>\n\n".into()) + ); + assert_eq!(escape::unescape("0\r\r\n"), Ok("0\n\n".into())); + assert_eq!(escape::unescape("\r0\n\r\r"), Ok("\n0\n\n\n".into())); + assert_eq!( + escape::unescape("\r\n&foo;\n"), + Err(EscapeError::UnrecognizedEntity(3..6, "foo".into())) + ); +} + /// XML allows any number of leading zeroes. 
That is not explicitly mentioned /// in the specification, but enforced by the conformance test suite /// (https://www.w3.org/XML/Test/) From 45a63c549b802e2989d93124b9ae18e857a584de Mon Sep 17 00:00:00 2001 From: z0017k9m Date: Wed, 25 Sep 2024 13:51:39 +0800 Subject: [PATCH 02/18] Change normalize_line_end function parameter type to allow more type as input, e.g. &str --- src/escape.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/escape.rs b/src/escape.rs index ecfaf76b..1843f084 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -301,7 +301,8 @@ where /// Normalize the line end, replace \r or \r\n with \n. #[inline] -fn normalize_line_end(input: Cow) -> Cow { +fn normalize_line_end<'a>(input: impl Into>) -> Cow<'a, str> { + let input = input.into(); let bytes = input.as_bytes(); let mut normalized = None; let mut start = 0; From 1c2ddc526c64310155433c06c2342e152f90746b Mon Sep 17 00:00:00 2001 From: yorkz1994 Date: Wed, 25 Sep 2024 21:51:17 +0800 Subject: [PATCH 03/18] Update src/escape.rs Co-authored-by: Mingun --- src/escape.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/escape.rs b/src/escape.rs index 1843f084..81111f27 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -301,7 +301,7 @@ where /// Normalize the line end, replace \r or \r\n with \n. 
#[inline] -fn normalize_line_end<'a>(input: impl Into>) -> Cow<'a, str> { +fn normalize_line_end<'a>(input: Cow<'a, str>) -> Cow<'a, str> { let input = input.into(); let bytes = input.as_bytes(); let mut normalized = None; From 139b2cb067e0b003dcbe3a2150b171cba9175532 Mon Sep 17 00:00:00 2001 From: z0017k9m Date: Wed, 25 Sep 2024 21:57:19 +0800 Subject: [PATCH 04/18] change parameter type from generic to Cow --- src/escape.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/escape.rs b/src/escape.rs index 81111f27..ecfaf76b 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -301,8 +301,7 @@ where /// Normalize the line end, replace \r or \r\n with \n. #[inline] -fn normalize_line_end<'a>(input: Cow<'a, str>) -> Cow<'a, str> { - let input = input.into(); +fn normalize_line_end(input: Cow) -> Cow { let bytes = input.as_bytes(); let mut normalized = None; let mut start = 0; From 6962ae31d548dd7027b9bd196960961d6497e636 Mon Sep 17 00:00:00 2001 From: z0017k9m Date: Thu, 26 Sep 2024 09:18:17 +0800 Subject: [PATCH 05/18] Add issue 806 to change log. --- Changelog.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Changelog.md b/Changelog.md index f2528b0c..e36c2b96 100644 --- a/Changelog.md +++ b/Changelog.md @@ -15,6 +15,11 @@ ### New Features +- [#806]: Added `normalize_line_end` that normalizes line end + from original XML data during unescape process. 
+ +[#806]: https://github.com/tafia/quick-xml/issues/806 + ### Bug Fixes ### Misc Changes From 11320adaf560d2f1992668f7380166abd922d5f9 Mon Sep 17 00:00:00 2001 From: z0017k9m Date: Thu, 26 Sep 2024 09:18:27 +0800 Subject: [PATCH 06/18] Fix regression test failure due to line end normalization --- src/escape.rs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/escape.rs b/src/escape.rs index ecfaf76b..9b2d1a6a 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -266,7 +266,7 @@ where unescaped = Some(String::with_capacity(raw.len())); } let unescaped = unescaped.as_mut().expect("initialized"); - unescaped.push_str(&raw[last_end..start]); + unescaped.push_str(&normalize_line_end(&raw[last_end..start])); // search for character correctness let pat = &raw[start + 1..end]; @@ -288,20 +288,19 @@ where } } - let res = if let Some(mut unescaped) = unescaped { + if let Some(mut unescaped) = unescaped { if let Some(raw) = raw.get(last_end..) { - unescaped.push_str(raw); + unescaped.push_str(&normalize_line_end(raw)); } - Cow::Owned(unescaped) + Ok(Cow::Owned(unescaped)) } else { - Cow::Borrowed(raw) - }; - Ok(normalize_line_end(res)) + Ok(Cow::Borrowed(raw)) + } } /// Normalize the line end, replace \r or \r\n with \n. 
#[inline] -fn normalize_line_end(input: Cow) -> Cow { +fn normalize_line_end(input: &str) -> Cow { let bytes = input.as_bytes(); let mut normalized = None; let mut start = 0; @@ -323,7 +322,7 @@ fn normalize_line_end(input: Cow) -> Cow { normalized.push_str(&input[start..]); Cow::Owned(normalized) } else { - input + input.into() } } From 0bb282e74804df876d2bab92cb370b0f32f4352f Mon Sep 17 00:00:00 2001 From: yorkz1994 Date: Thu, 26 Sep 2024 21:29:18 +0800 Subject: [PATCH 07/18] Update Changelog.md Co-authored-by: Mingun --- Changelog.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Changelog.md b/Changelog.md index e36c2b96..962e4e57 100644 --- a/Changelog.md +++ b/Changelog.md @@ -15,8 +15,7 @@ ### New Features -- [#806]: Added `normalize_line_end` that normalizes line end - from original XML data during unescape process. +- [#806]: Perform normalization of line end during unescape process. [#806]: https://github.com/tafia/quick-xml/issues/806 From f970370ff64ee431094ad667620df8211a5e7656 Mon Sep 17 00:00:00 2001 From: z0017k9m Date: Fri, 27 Sep 2024 20:56:52 +0800 Subject: [PATCH 08/18] Still need to normalize if nothing unescaped --- src/escape.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/escape.rs b/src/escape.rs index 9b2d1a6a..a9f4c2fb 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -294,7 +294,7 @@ where } Ok(Cow::Owned(unescaped)) } else { - Ok(Cow::Borrowed(raw)) + Ok(normalize_line_end(raw)) } } From d11c7563ee0b091c659052c9ae603767c234d61d Mon Sep 17 00:00:00 2001 From: z0017k9m Date: Fri, 27 Sep 2024 20:59:38 +0800 Subject: [PATCH 09/18] Add line ends test for attribute and text --- tests/reader-attributes.rs | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/tests/reader-attributes.rs b/tests/reader-attributes.rs index 8d51b22a..5743f974 100644 --- a/tests/reader-attributes.rs +++ b/tests/reader-attributes.rs @@ -1,7 +1,10 @@ use std::borrow::Cow; use 
quick_xml::events::attributes::Attribute; -use quick_xml::events::{BytesEnd, Event::*}; +use quick_xml::events::{ + BytesEnd, + Event::{self, *}, +}; use quick_xml::name::QName; use quick_xml::reader::Reader; @@ -159,3 +162,31 @@ fn equal_sign_in_value() { e => panic!("Expecting Empty event, got {:?}", e), } } + +#[test] +fn line_ends() { + const XML: &str = "\r\r\n\nvalue3\r\r\n\nvalue4\r\r\n\n"; + let mut reader = Reader::from_str(XML); + match reader.read_event().unwrap() { + Event::Start(event) => { + let mut iter = event.attributes(); + let a = iter.next().unwrap().unwrap(); + #[cfg(not(feature = "encoding"))] + assert_eq!( + a.unescape_value().unwrap(), + "\n\n\nvalue1\n\n\nvalue2\n\n\n" + ); + assert_eq!( + a.decode_and_unescape_value(reader.decoder()).unwrap(), + "\n\n\nvalue1\n\n\nvalue2\n\n\n" + ); + } + event => panic!("Expected Start, found {:?}", event), + } + match reader.read_event().unwrap() { + Event::Text(event) => { + assert_eq!(event.unescape().unwrap(), "\n\n\nvalue3\n\n\nvalue4\n\n\n") + } + event => panic!("Expected Text, found {:?}", event), + } +} From d3ee1adb5493d33027d849e161cb6b575dc0d6c4 Mon Sep 17 00:00:00 2001 From: z0017k9m Date: Fri, 27 Sep 2024 21:55:33 +0800 Subject: [PATCH 10/18] roundtrip test cannot include \r --- tests/serde-se.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/serde-se.rs b/tests/serde-se.rs index 39f6e66a..860530ee 100644 --- a/tests/serde-se.rs +++ b/tests/serde-se.rs @@ -1955,9 +1955,9 @@ mod with_root { 3"); serialize_as!(tuple: // Use to_string() to get owned type that is required for deserialization - ("<\"&'>".to_string(), "with\t\r\n spaces", 3usize) + ("<\"&'>".to_string(), "with\t\n spaces", 3usize) => "<\"&'>\ - with\t\r\n spaces\ + with\t\n spaces\ 3"); serialize_as!(tuple_struct: Tuple(42.0, "answer") From cbf200fbe8b237b8c53ea0a3eb95174c16ff9e60 Mon Sep 17 00:00:00 2001 From: yorkz1994 Date: Sat, 28 Sep 2024 11:05:54 +0800 Subject: [PATCH 11/18] Update 
tests/reader-attributes.rs Co-authored-by: Mingun --- tests/reader-attributes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/reader-attributes.rs b/tests/reader-attributes.rs index 5743f974..57d1935c 100644 --- a/tests/reader-attributes.rs +++ b/tests/reader-attributes.rs @@ -184,7 +184,7 @@ fn line_ends() { event => panic!("Expected Start, found {:?}", event), } match reader.read_event().unwrap() { - Event::Text(event) => { + Text(event) => { assert_eq!(event.unescape().unwrap(), "\n\n\nvalue3\n\n\nvalue4\n\n\n") } event => panic!("Expected Text, found {:?}", event), From e4f83a5d26a239746ad6cdb5c6bbe975e3e350c7 Mon Sep 17 00:00:00 2001 From: yorkz1994 Date: Sat, 28 Sep 2024 11:06:37 +0800 Subject: [PATCH 12/18] Update tests/reader-attributes.rs Co-authored-by: Mingun --- tests/reader-attributes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/reader-attributes.rs b/tests/reader-attributes.rs index 57d1935c..c27d8af0 100644 --- a/tests/reader-attributes.rs +++ b/tests/reader-attributes.rs @@ -168,7 +168,7 @@ fn line_ends() { const XML: &str = "\r\r\n\nvalue3\r\r\n\nvalue4\r\r\n\n"; let mut reader = Reader::from_str(XML); match reader.read_event().unwrap() { - Event::Start(event) => { + Start(event) => { let mut iter = event.attributes(); let a = iter.next().unwrap().unwrap(); #[cfg(not(feature = "encoding"))] From cf99a45be719f142f89dd2fe06f3e0b23f8fbf8c Mon Sep 17 00:00:00 2001 From: yorkz1994 Date: Sat, 28 Sep 2024 17:26:15 +0800 Subject: [PATCH 13/18] Update src/escape.rs Co-authored-by: Mingun --- src/escape.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/escape.rs b/src/escape.rs index a9f4c2fb..e28a0d9a 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -313,7 +313,7 @@ fn normalize_line_end(input: &str) -> Cow { normalized.push_str(&input[start..i]); normalized.push('\n'); start = i + 1; - if matches!(bytes.get(start), Some(&c) if c == b'\n') { + if let Some(&'\n') = 
bytes.get(start) { // \n right after \r, \r\n case, skip \n because we have already replaced \r with \n start += 1; } From 628dce7da3e8362e79900e6e0c777b3f560ecbcf Mon Sep 17 00:00:00 2001 From: z0017k9m Date: Sat, 28 Sep 2024 11:15:07 +0800 Subject: [PATCH 14/18] Add comment why \r is not include in roundtrip test. --- tests/serde-se.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/serde-se.rs b/tests/serde-se.rs index 860530ee..1ac36450 100644 --- a/tests/serde-se.rs +++ b/tests/serde-se.rs @@ -1955,6 +1955,7 @@ mod with_root { 3"); serialize_as!(tuple: // Use to_string() to get owned type that is required for deserialization + // Note: \r cannot be included among the whitespace characters because line end normalization is performed. ("<\"&'>".to_string(), "with\t\n spaces", 3usize) => "<\"&'>\ with\t\n spaces\ 3"); From 44df9bfbfa9d84cbf21ac6ec27e245867545c616 Mon Sep 17 00:00:00 2001 From: z0017k9m Date: Sat, 28 Sep 2024 11:22:07 +0800 Subject: [PATCH 15/18] Remove unused import --- tests/reader-attributes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/reader-attributes.rs b/tests/reader-attributes.rs index c27d8af0..327faad3 100644 --- a/tests/reader-attributes.rs +++ b/tests/reader-attributes.rs @@ -3,7 +3,7 @@ use std::borrow::Cow; use quick_xml::events::attributes::Attribute; use quick_xml::events::{ BytesEnd, - Event::{self, *}, + Event::*, }; use quick_xml::name::QName; use quick_xml::reader::Reader; From 4e1eefc2d9531b15cea197be067a9e2838d00e63 Mon Sep 17 00:00:00 2001 From: z0017k9m Date: Sat, 28 Sep 2024 19:18:24 +0800 Subject: [PATCH 16/18] Missing b in byte literal --- src/escape.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/escape.rs b/src/escape.rs index e28a0d9a..ce3a84de 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -313,7 +313,7 @@ fn normalize_line_end(input: &str) -> Cow { normalized.push_str(&input[start..i]); normalized.push('\n'); start = i + 1; - if let Some(&'\n') = bytes.get(start) { +
if let Some(&b'\n') = bytes.get(start) { // \n right after \r, \r\n case, skip \n because we have already replaced \r with \n start += 1; } From ecfeefc6245c70e5fc3d4df37802f38cba4dbc9b Mon Sep 17 00:00:00 2001 From: z0017k9m Date: Sun, 29 Sep 2024 17:05:19 +0800 Subject: [PATCH 17/18] Add some \r in unescape_text/mixed --- benches/microbenches.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 2f4ece04..bf64f131 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -327,18 +327,18 @@ fn unescaping(c: &mut Criterion) { group.bench_function("mixed", |b| { let text = -"Lorem ipsum dolor sit amet, &consectetur adipiscing elit, sed do eiusmod tempor incididunt -ut labore et dolore magna aliqua. Hac habitasse platea dictumst vestibulum rhoncus est pellentesque. -Risus ultricies "tristique nulla aliquet enim tortor" at. Fermentum odio eu feugiat pretium -nibh ipsum. Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu -cursus euismod quis <viverra nibh cras pulvinar mattis. Sed viverra tellus in hac habitasse platea. +"Lorem ipsum dolor sit amet, &consectetur adipiscing elit, sed do eiusmod tempor incididunt\r +ut labore et dolore magna aliqua. Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.\r +Risus ultricies "tristique nulla aliquet enim tortor" at. Fermentum odio eu feugiat pretium\r +nibh ipsum. Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu\r +cursus euismod quis <viverra nibh cras pulvinar mattis. Sed viverra tellus in hac habitasse platea.\r Quis commodo odio aenean sed. Cursus in hac habitasse platea dictumst quisque sagittis purus. -Neque convallis a cras semper auctor. Sit amet mauris commodo quis imperdiet massa. Ac ut consequat -semper viverra nam libero justo # laoreet sit. Adipiscing commodo elit at imperdiet dui accumsan. 
-Enim lobortis scelerisque fermentum dui faucibus in ornare. Natoque penatibus et magnis dis parturient -montes nascetur ridiculus mus. At lectus urna !duis convallis convallis tellus id interdum. Libero -volutpat sed cras ornare arcu dui vivamus arcu. Cursus in hac habitasse platea dictumst quisque sagittis +Neque convallis a cras semper auctor. Sit amet mauris commodo quis imperdiet massa. Ac ut consequat\r +semper viverra nam libero justo # laoreet sit. Adipiscing commodo elit at imperdiet dui accumsan.\r +Enim lobortis scelerisque fermentum dui faucibus in ornare. Natoque penatibus et magnis dis parturient\r +montes nascetur ridiculus mus. At lectus urna !duis convallis convallis tellus id interdum. Libero\r +volutpat sed cras ornare arcu dui vivamus arcu. Cursus in hac habitasse platea dictumst quisque sagittis\r purus. Consequat id porta nibh venenatis cras sed felis."; b.iter(|| { From d880edc4d1f51bcdd91019666cdb041262e94a8b Mon Sep 17 00:00:00 2001 From: z0017k9m Date: Sun, 29 Sep 2024 17:07:32 +0800 Subject: [PATCH 18/18] Normalizing more line ends, no allocation --- src/escape.rs | 93 +++++++++++++++++++++++++++----------- tests/escape.rs | 18 ++++++++ tests/reader-attributes.rs | 8 ++-- 3 files changed, 89 insertions(+), 30 deletions(-) diff --git a/src/escape.rs b/src/escape.rs index ce3a84de..9f7a6439 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -1,6 +1,6 @@ //! 
Manage xml character escapes -use memchr::{memchr2_iter, memchr_iter}; +use memchr::{memchr2_iter, memchr3_iter}; use std::borrow::Cow; use std::num::ParseIntError; use std::ops::Range; @@ -266,7 +266,9 @@ where unescaped = Some(String::with_capacity(raw.len())); } let unescaped = unescaped.as_mut().expect("initialized"); - unescaped.push_str(&normalize_line_end(&raw[last_end..start])); + for normalized in normalize_line_end_iter(&raw[last_end..start]) { + unescaped.push_str(normalized); + } // search for character correctness let pat = &raw[start + 1..end]; @@ -290,40 +292,79 @@ where if let Some(mut unescaped) = unescaped { if let Some(raw) = raw.get(last_end..) { - unescaped.push_str(&normalize_line_end(raw)); + for normalized in normalize_line_end_iter(raw) { + unescaped.push_str(normalized); + } } Ok(Cow::Owned(unescaped)) } else { - Ok(normalize_line_end(raw)) + let mut norm_iter = normalize_line_end_iter(raw); + let first = norm_iter.next(); + match first { + Some(normalized) if normalized.len() != raw.len() => { + let mut s = String::with_capacity(raw.len()); + s.push_str(normalized); + for normalized in norm_iter { // resume after `first`; a fresh iterator would yield it again + s.push_str(normalized); + } + Ok(s.into()) + } + _ => Ok(raw.into()), + } } } -/// Normalize the line end, replace \r or \r\n with \n. +/// Normalizes line ends in `input` and returns an iterator over the normalized string pieces. +/// The [XML 1.1 line end specification](https://www.w3.org/TR/xml11/#sec-line-ends) lists five end-of-line sequences that are normalized to \n: +/// +/// - \r\n +/// - \r\u0085 (UTF-8: \r\xC2\x85) +/// - \r +/// - \u0085 (UTF-8: \xC2\x85) +/// - \u2028 (UTF-8: \xE2\x80\xA8) +/// +/// An iterator is used so that normalizing line ends does not require an allocation. +/// If there is nothing to normalize, the whole input is yielded in the first iteration.
Caller can know +/// there is nothing to normalize, so that it needs not to do normalization thus avoid allocation. #[inline] -fn normalize_line_end(input: &str) -> Cow { +fn normalize_line_end_iter(input: &str) -> impl Iterator { let bytes = input.as_bytes(); - let mut normalized = None; - let mut start = 0; - let iter = memchr_iter(b'\r', bytes); - for i in iter { - if normalized.is_none() { - normalized = Some(String::with_capacity(input.len())) + let len = input.len(); + let mut cursor = 0; + let mut iter = memchr3_iter(b'\r', b'\xC2', b'\xE2', bytes); + let mut temp = None; + std::iter::from_fn(move || { + if let Some(v) = temp.take() { + return Some(v); } - let normalized = normalized.as_mut().expect("initialized"); - normalized.push_str(&input[start..i]); - normalized.push('\n'); - start = i + 1; - if let Some(&b'\n') = bytes.get(start) { - // \n right after \r, \r\n case, skip \n because we have already replaced \r with \n - start += 1; + loop { + if let Some(p) = iter.next() { + if p < cursor { + // already normalized in previous iteration, this position is invalid + continue; + } + let skips = match &bytes[p..] { + [b'\r', b'\n', ..] => 2, + [b'\r', b'\xC2', b'\x85', ..] => 3, + [b'\r', ..] => 1, + [b'\xC2', b'\x85', ..] => 2, + [b'\xE2', b'\x80', b'\xA8', ..] 
=> 3, + _ => continue, + }; + // normalized + temp = Some("\n"); + let start = cursor; + cursor = p + skips; + break Some(&input[start..p]); + } else if cursor < len { + let start = cursor; + cursor = len; + break Some(&input[start..]); + } else { + break None; + } } - } - if let Some(mut normalized) = normalized { - normalized.push_str(&input[start..]); - Cow::Owned(normalized) - } else { - input.into() - } + }) } /// Resolves predefined XML entities or all HTML5 entities depending on the feature diff --git a/tests/escape.rs b/tests/escape.rs index df5b648a..90066e00 100644 --- a/tests/escape.rs +++ b/tests/escape.rs @@ -94,6 +94,24 @@ fn unescape_line_end() { escape::unescape("\r\n&foo;\n"), Err(EscapeError::UnrecognizedEntity(3..6, "foo".into())) ); + + assert_eq!( + escape::unescape("<&test'\u{0085}\r\r\u{0085}\u{2028}"\r\n>\r\n\r"), + Ok("<&test'\n\n\n\n\"\n>\n\n".into()) + ); + assert_eq!( + escape::unescape("0\r\r\n\u{0085}"), + Ok("0\n\n\n".into()) + ); + assert_eq!( + escape::unescape("\r0\n\r\r\u{2028}"), + Ok("\n0\n\n\n\n".into()) + ); + assert_eq!(escape::unescape("\r\r\u{0085}\n\n"), Ok("\n\n\n\n".into())); + assert_eq!( + escape::unescape("\r\n&foo;\n\u{2028}"), + Err(EscapeError::UnrecognizedEntity(3..6, "foo".into())) + ); } /// XML allows any number of leading zeroes. 
That is not explicitly mentioned diff --git a/tests/reader-attributes.rs b/tests/reader-attributes.rs index 327faad3..23c53491 100644 --- a/tests/reader-attributes.rs +++ b/tests/reader-attributes.rs @@ -165,7 +165,7 @@ fn equal_sign_in_value() { #[test] fn line_ends() { - const XML: &str = "\r\r\n\nvalue3\r\r\n\nvalue4\r\r\n\n"; + const XML: &str = "\r\r\u{0085}\n\n\u{0085}\u{2028}value3\r\r\u{0085}\n\n\u{0085}\u{2028}value4\r\r\u{0085}\n\n\u{0085}\u{2028}"; let mut reader = Reader::from_str(XML); match reader.read_event().unwrap() { Start(event) => { @@ -174,18 +174,18 @@ fn line_ends() { #[cfg(not(feature = "encoding"))] assert_eq!( a.unescape_value().unwrap(), - "\n\n\nvalue1\n\n\nvalue2\n\n\n" + "\n\n\n\n\n\nvalue1\n\n\n\n\n\nvalue2\n\n\n\n\n\n" ); assert_eq!( a.decode_and_unescape_value(reader.decoder()).unwrap(), - "\n\n\nvalue1\n\n\nvalue2\n\n\n" + "\n\n\n\n\n\nvalue1\n\n\n\n\n\nvalue2\n\n\n\n\n\n" ); } event => panic!("Expected Start, found {:?}", event), } match reader.read_event().unwrap() { Text(event) => { - assert_eq!(event.unescape().unwrap(), "\n\n\nvalue3\n\n\nvalue4\n\n\n") + assert_eq!(event.unescape().unwrap(), "\n\n\n\n\n\nvalue3\n\n\n\n\n\nvalue4\n\n\n\n\n\n") } event => panic!("Expected Text, found {:?}", event), }