feat: segment by graphemes (#11)

EdJoPaTo · web-flow · commit f85280fdf6db · 2024-06-24T19:41:53.000-07:00
Before this, zero-width characters were assumed to stay attached to their
neighbours, but that is only a best-effort approach. `unicode-segmentation`
bundles up characters that belong together into grapheme clusters.

Sadly, this is slower, but it is more correct.

```plaintext
zhu fu/16384/end        time:   [98.795 µs 98.834 µs 98.883 µs]
                        thrpt:  [158.02 MiB/s 158.09 MiB/s 158.16 MiB/s]
                 change:
                        time:   [+420.90% +421.28% +421.82%] (p = 0.00 < 0.05)
                        thrpt:  [-80.836% -80.816% -80.802%]
                        Performance has regressed.
Found 8 outliers among 200 measurements (4.00%)
  1 (0.50%) low mild
  6 (3.00%) high mild
  1 (0.50%) high severe
zhu fu/16384/start      time:   [112.87 µs 112.98 µs 113.10 µs]
                        thrpt:  [138.15 MiB/s 138.30 MiB/s 138.43 MiB/s]
                 change:
                        time:   [+461.21% +461.73% +462.28%] (p = 0.00 < 0.05)
                        thrpt:  [-82.215% -82.198% -82.181%]
                        Performance has regressed.
Found 4 outliers among 200 measurements (2.00%)
  4 (2.00%) high mild
zhu fu/16384/centered   time:   [50.122 µs 50.177 µs 50.249 µs]
                        thrpt:  [310.95 MiB/s 311.40 MiB/s 311.74 MiB/s]
                 change:
                        time:   [+86.029% +86.268% +86.498%] (p = 0.00 < 0.05)
                        thrpt:  [-46.380% -46.314% -46.245%]
                        Performance has regressed.
Found 9 outliers among 200 measurements (4.50%)
  8 (4.00%) low mild
  1 (0.50%) high severe
```

Interestingly, centered is now a lot faster than the other two.
Analyzing why could lead to performance improvements for the other two
as well.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -19,6 +19,7 @@ exclude = [
 
 [dependencies]
 itertools = { version = "0.12", default-features = false }
+unicode-segmentation = { version = "1", default-features = false }
 unicode-width = "0.1"
 
 [dev-dependencies]
diff --git a/benches/benchmark.rs b/benches/benchmark.rs
@@ -1,6 +1,6 @@
 use std::time::Duration;
 
-use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
 use unicode_truncate::UnicodeTruncateStr;
 
 fn roughly_cut(str: &str, size: usize) -> &str {
@@ -26,14 +26,14 @@ fn criterion_benchmark(criterion: &mut Criterion) {
             .throughput(Throughput::Bytes(size as u64));
         let input = roughly_cut(TEXT, size);
         let max_width = input.len() / 2;
-        group.bench_with_input("end", input, |bench, str| {
-            bench.iter(|| str.unicode_truncate(max_width));
+        group.bench_function("end", |bench| {
+            bench.iter(|| black_box(input).unicode_truncate(black_box(max_width)));
         });
-        group.bench_with_input("start", input, |bench, str| {
-            bench.iter(|| str.unicode_truncate_start(max_width));
+        group.bench_function("start", |bench| {
+            bench.iter(|| black_box(input).unicode_truncate_start(black_box(max_width)));
         });
-        group.bench_with_input("centered", input, |bench, str| {
-            bench.iter(|| str.unicode_truncate_centered(max_width));
+        group.bench_function("centered", |bench| {
+            bench.iter(|| black_box(input).unicode_truncate_centered(black_box(max_width)));
         });
         group.finish();
     }
diff --git a/src/lib.rs b/src/lib.rs
@@ -41,7 +41,8 @@ assert_eq!(str.width(), 5);
 )]
 
 use itertools::{merge_join_by, Either};
-use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
+use unicode_segmentation::UnicodeSegmentation;
+use unicode_width::UnicodeWidthStr;
 
 /// Defines the alignment for truncation and padding.
 #[derive(PartialEq, Eq, Debug, Copy, Clone)]
@@ -150,28 +151,26 @@ impl UnicodeTruncateStr for str {
     #[inline]
     fn unicode_truncate(&self, max_width: usize) -> (&str, usize) {
         let (byte_index, new_width) = self
-            .char_indices()
-            // map to byte index and the width of char start at the index
-            // control characters treated as of width 1
-            // https://github.com/unicode-rs/unicode-width/pull/45
-            .map(|(byte_index, char)| (byte_index, char.width().unwrap_or(1)))
+            .grapheme_indices(true)
+            // map to byte index and the width of grapheme at the index
+            .map(|(byte_index, grapheme)| (byte_index, grapheme.width()))
             // chain a final element representing the position past the last char
             .chain(core::iter::once((self.len(), 0)))
             // fold to byte index and the width up to the index
-            .scan(0, |sum: &mut usize, (byte_index, char_width)| {
-                // byte_index is the start while the char_width is at the end. Current width is the
-                // sum until now while the next byte_start width is including the current
-                // char_width.
+            .scan(0, |sum: &mut usize, (byte_index, grapheme_width)| {
+                // byte_index is the start while the grapheme_width is at the end. Current width is
+                // the sum until now while the next byte_index is including the current
+                // grapheme_width.
                 let current_width = *sum;
-                *sum = sum.checked_add(char_width)?;
+                *sum = sum.checked_add(grapheme_width)?;
                 Some((byte_index, current_width))
             })
             // take the longest but still shorter than requested
             .take_while(|&(_, current_width)| current_width <= max_width)
             .last()
             .unwrap_or((0, 0));
 
-        // unwrap is safe as the index comes from char_indices
+        // unwrap is safe as the index comes from grapheme_indices
         let result = self.get(..byte_index).unwrap();
         debug_assert_eq!(result.width(), new_width);
         (result, new_width)
@@ -180,32 +179,26 @@ impl UnicodeTruncateStr for str {
     #[inline]
     fn unicode_truncate_start(&self, max_width: usize) -> (&str, usize) {
         let (byte_index, new_width) = self
-            .char_indices()
+            .grapheme_indices(true)
             // instead of start checking from the start do so from the end
             .rev()
-            // map to byte index and the width of char start at the index
-            // control characters treated as of width 1
-            // https://github.com/unicode-rs/unicode-width/pull/45
-            .map(|(byte_index, char)| (byte_index, char.width().unwrap_or(1)))
-            // skip any position with zero width, the cut won't happen at these points
-            // this also helps with not including zero width char at the beginning
-            .filter(|&(_, char_width)| char_width > 0)
+            // map to byte index and the width of grapheme start at the index
+            .map(|(byte_index, grapheme)| (byte_index, grapheme.width()))
             // fold to byte index and the width from end to the index
-            .scan(0, |sum: &mut usize, (byte_index, char_width)| {
-                *sum = sum.checked_add(char_width)?;
+            .scan(0, |sum: &mut usize, (byte_index, grapheme_width)| {
+                *sum = sum.checked_add(grapheme_width)?;
                 Some((byte_index, *sum))
             })
             .take_while(|&(_, current_width)| current_width <= max_width)
             .last()
             .unwrap_or((self.len(), 0));
 
-        // unwrap is safe as the index comes from char_indices
+        // unwrap is safe as the index comes from grapheme_indices
         let result = self.get(byte_index..).unwrap();
         debug_assert_eq!(result.width(), new_width);
         (result, new_width)
     }
 
-    #[allow(clippy::collapsible_else_if)]
     #[inline]
     fn unicode_truncate_centered(&self, max_width: usize) -> (&str, usize) {
         if max_width == 0 {
@@ -221,48 +214,40 @@ impl UnicodeTruncateStr for str {
         // unwrap is safe as original_width > max_width
         let min_removal_width = original_width.checked_sub(max_width).unwrap();
 
-        // around the half (min_removal_width - 2) to prevent accidentally removing more than needed
-        // due to char width (max 2)
-        let less_than_half = min_removal_width.saturating_sub(2) / 2;
+        // Around the half to improve performance. In order to ensure the center grapheme stays
+        // remove its max possible length. This assumes a grapheme width is always <= 10 (4 people
+        // family emoji has width 8). This might end up not perfect on graphemes wider than this but
+        // performance is more important here.
+        let less_than_half = min_removal_width.saturating_sub(10) / 2;
 
         let from_start = self
-            .char_indices()
-            // control characters treated as of width 1
-            // https://github.com/unicode-rs/unicode-width/pull/45
-            .map(|(byte_index, char)| (byte_index, char.width().unwrap_or(1)))
-            // skip any position with zero width, the cut won't happen at these points
-            // this also helps with removing zero width char at the beginning
-            .filter(|&(_, char_width)| char_width > 0)
+            .grapheme_indices(true)
+            .map(|(byte_index, grapheme)| (byte_index, grapheme.width()))
             // fold to byte index and the width from start to the index (not including the current
-            // char width)
+            // grapheme width)
             .scan(
                 (0usize, 0usize),
-                |(sum, prev_width), (byte_index, char_width)| {
+                |(sum, prev_width), (byte_index, grapheme_width)| {
                     *sum = sum.checked_add(*prev_width)?;
-                    *prev_width = char_width;
+                    *prev_width = grapheme_width;
                     Some((byte_index, *sum))
                 },
             )
             // fast forward to around the half
-            .skip_while(|&(_, removed)| min_removal_width > 2 && removed < less_than_half);
+            .skip_while(|&(_, removed)| removed < less_than_half);
 
         let from_end = self
-            .char_indices()
-            // control characters treated as of width 1
-            // https://github.com/unicode-rs/unicode-width/pull/45
-            .map(|(byte_index, char)| (byte_index, char.width().unwrap_or(1)))
-            // skip any position with zero width, the cut won't happen at these points
-            // this also helps with keeping zero width char at the end
-            .filter(|&(_, char_width)| char_width > 0)
+            .grapheme_indices(true)
+            .map(|(byte_index, grapheme)| (byte_index, grapheme.width()))
             .rev()
-            // fold to byte index and the width from end to the index (including the current char
-            // width)
-            .scan(0usize, |sum, (byte_index, char_width)| {
-                *sum = sum.checked_add(char_width)?;
+            // fold to byte index and the width from end to the index (including the current
+            // grapheme width)
+            .scan(0usize, |sum, (byte_index, grapheme_width)| {
+                *sum = sum.checked_add(grapheme_width)?;
                 Some((byte_index, *sum))
             })
             // fast forward to around the half
-            .skip_while(|&(_, removed)| min_removal_width > 2 && removed < less_than_half);
+            .skip_while(|&(_, removed)| removed < less_than_half);
 
         let (start_index, end_index, removed_width) = merge_join_by(
             from_start,
@@ -294,7 +279,7 @@ impl UnicodeTruncateStr for str {
         // but a sane default is to remove everything (i.e. min_removal_width too large)
         .unwrap_or((0, 0, original_width));
 
-        // unwrap is safe as the index comes from char_indices
+        // unwrap is safe as the index comes from grapheme_indices
         let result = self.get(start_index..end_index).unwrap();
         // unwrap is safe as removed is always smaller than total width
         let result_width = original_width.checked_sub(removed_width).unwrap();
@@ -396,6 +381,15 @@ mod tests {
                 ("y\u{0306}ey\u{0306}", 3)
             );
         }
+
+        #[test]
+        fn family_stays_together() {
+            let input = "123👨‍👩‍👧‍👦456";
+            assert_eq!(input.unicode_truncate(4), ("123", 3));
+            assert_eq!(input.unicode_truncate(8), ("123", 3));
+            assert_eq!(input.unicode_truncate(12), ("123👨‍👩‍👧‍👦4", 12));
+            assert_eq!(input.unicode_truncate(20), (input, 14));
+        }
     }
 
     mod truncate_start {
@@ -444,6 +438,15 @@ mod tests {
             // zero width character in the middle at the cutting boundary is removed
             assert_eq!("y\u{0306}es".unicode_truncate_start(2), ("es", 2));
         }
+
+        #[test]
+        fn family_stays_together() {
+            let input = "123👨‍👩‍👧‍👦456";
+            assert_eq!(input.unicode_truncate_start(4), ("456", 3));
+            assert_eq!(input.unicode_truncate_start(8), ("456", 3));
+            assert_eq!(input.unicode_truncate_start(12), ("3👨‍👩‍👧‍👦456", 12));
+            assert_eq!(input.unicode_truncate_start(20), (input, 14));
+        }
     }
 
     mod truncate_centered {
@@ -522,10 +525,20 @@ mod tests {
 
         #[test]
         fn control_char() {
+            use unicode_width::UnicodeWidthChar;
             assert_eq!("\u{0019}".width(), 1);
             assert_eq!('\u{0019}'.width(), None);
             assert_eq!("\u{0019}".unicode_truncate(2), ("\u{0019}", 1));
         }
+
+        #[test]
+        fn family_stays_together() {
+            let input = "123👨‍👩‍👧‍👦456";
+            assert_eq!(input.unicode_truncate_centered(4), ("", 0));
+            assert_eq!(input.unicode_truncate_centered(8), ("👨‍👩‍👧‍👦", 8));
+            assert_eq!(input.unicode_truncate_centered(12), ("23👨‍👩‍👧‍👦45", 12));
+            assert_eq!(input.unicode_truncate_centered(20), (input, 14));
+        }
     }
 
     #[test]