-
Notifications
You must be signed in to change notification settings - Fork 175
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add example for DecomposingNormalizer source cursor #4900
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1864,6 +1864,92 @@ impl DecomposingNormalizer { | |
|
||
/// Wraps a delegate iterator into a decomposing iterator | ||
/// adapter by using the data already held by this normalizer. | ||
/// | ||
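/// The [`Decomposition`] iterator will peek one character ahead of | ||
/// the character being decomposed. If the character being decomposed | ||
/// is followed by characters whose canonical combining class is not | ||
/// zero, the normalizer additionally buffers all of those characters | ||
/// so that it can reorder them if needed. When the input contains no | ||
/// such sequences (as in the example below), this allows the caller | ||
/// to track the source character in the input string. | ||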
/// | ||
/// # Examples | ||
/// | ||
/// Use a cursor to keep track of indices in the source string: | ||
/// | ||
/// ``` | ||
/// use icu_normalizer::DecomposingNormalizer; | ||
/// use std::cell::RefCell; | ||
/// use std::str::Chars; | ||
/// | ||
/// /// Struct with information on the source character being processed. | ||
/// #[derive(Default)] | ||
/// struct DecompositionCursorStatus { | ||
/// peeked_char: Option<char>, | ||
/// current_char: Option<char>, | ||
/// offset: usize, | ||
/// } | ||
/// | ||
/// /// Struct implementing `Iterator<Item = char>` that holds a reference | ||
/// /// to a `RefCell` containing the `DecompositionCursorStatus`. | ||
/// struct DecompositionCursor<'a>( | ||
/// Chars<'a>, | ||
/// &'a RefCell<DecompositionCursorStatus>, | ||
/// ); | ||
/// | ||
/// impl Iterator for DecompositionCursor<'_> { | ||
/// type Item = char; | ||
/// #[inline] | ||
/// fn next(&mut self) -> Option<char> { | ||
/// let mut cell = self.1.borrow_mut(); | ||
/// if let Some(ch) = cell.current_char { | ||
/// cell.offset += ch.len_utf8(); | ||
/// } | ||
/// cell.current_char = cell.peeked_char; | ||
/// cell.peeked_char = self.0.next(); | ||
/// cell.peeked_char | ||
/// } | ||
/// } | ||
/// | ||
/// // Set up the DecomposingNormalizer | ||
/// let normalizer = DecomposingNormalizer::new_nfd(); | ||
/// let input = "Šéårçĥ réšûļţš"; | ||
/// let cell = RefCell::new(Default::default()); | ||
/// let cursor = DecompositionCursor(input.chars(), &cell); | ||
/// let mut iter = normalizer.normalize_iter(cursor); | ||
/// | ||
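/// // Test the output. get_next() is a helper closure that, when invoked, | ||
/// // fetches the next decomposed char and returns it together with the | ||
/// // source char currently being decomposed and its byte offset. | ||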
/// let mut get_next = || { | ||
/// ( | ||
/// iter.next().unwrap(), | ||
/// cell.borrow().current_char.unwrap(), | ||
/// cell.borrow().offset, | ||
/// ) | ||
/// }; | ||
/// | ||
/// assert_eq!(get_next(), ('S', 'Š', 0)); | ||
/// assert_eq!(get_next(), ('\u{30C}', 'Š', 0)); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. question: Is the reason why the offset only jumps at most by 2 in this example because all of the characters are in a precomposed form in the original input string in the range U+0080 <= ch < U+0800? If so, then optional: it might be interesting to append to the input string something in the upper half of the BMP, and maybe something beyond the BMP. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Interesting, I can add non-BMP code points to this example. I was also hoping maybe you could shed some light on this behavior. Is it always guaranteed that the iterator peeks one code point ahead, as stated in this PR? |
||
/// assert_eq!(get_next(), ('e', 'é', 2)); | ||
/// assert_eq!(get_next(), ('\u{301}', 'é', 2)); | ||
/// assert_eq!(get_next(), ('a', 'å', 4)); | ||
/// assert_eq!(get_next(), ('\u{30A}', 'å', 4)); | ||
/// assert_eq!(get_next(), ('r', 'r', 6)); | ||
/// assert_eq!(get_next(), ('c', 'ç', 7)); | ||
/// assert_eq!(get_next(), ('\u{327}', 'ç', 7)); | ||
/// assert_eq!(get_next(), ('h', 'ĥ', 9)); | ||
/// assert_eq!(get_next(), ('\u{302}', 'ĥ', 9)); | ||
/// assert_eq!(get_next(), (' ', ' ', 11)); | ||
/// assert_eq!(get_next(), ('r', 'r', 12)); | ||
/// assert_eq!(get_next(), ('e', 'é', 13)); | ||
/// assert_eq!(get_next(), ('\u{301}', 'é', 13)); | ||
/// assert_eq!(get_next(), ('s', 'š', 15)); | ||
/// assert_eq!(get_next(), ('\u{30C}', 'š', 15)); | ||
/// assert_eq!(get_next(), ('u', 'û', 17)); | ||
/// assert_eq!(get_next(), ('\u{302}', 'û', 17)); | ||
/// assert_eq!(get_next(), ('l', 'ļ', 19)); | ||
/// assert_eq!(get_next(), ('\u{327}', 'ļ', 19)); | ||
/// assert_eq!(get_next(), ('t', 'ţ', 21)); | ||
/// assert_eq!(get_next(), ('\u{327}', 'ţ', 21)); | ||
/// assert_eq!(get_next(), ('s', 'š', 23)); | ||
/// assert_eq!(get_next(), ('\u{30C}', 'š', 23)); | ||
/// ``` | ||
pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<I> { | ||
Decomposition::new_with_supplements( | ||
iter, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the character being decomposed is followed by characters whose canonical combining class is not zero, the normalizer will buffer up all of those in order to be able to reorder them in case they aren't already in the right order.