Skip to content

Commit f85280f

Browse files
authored
feat: segment by graphemes (#11)
Before this change, zero-width characters were assumed to stay attached to their neighboring character, but that is only a best-effort approach. `unicode-segmentation` bundles up the characters that belong together (grapheme clusters). Sadly, this is slower, but it is more correct.

```plaintext
zhu fu/16384/end        time:   [98.795 µs 98.834 µs 98.883 µs]
                        thrpt:  [158.02 MiB/s 158.09 MiB/s 158.16 MiB/s]
                 change:
                        time:   [+420.90% +421.28% +421.82%] (p = 0.00 < 0.05)
                        thrpt:  [-80.836% -80.816% -80.802%]
                        Performance has regressed.
Found 8 outliers among 200 measurements (4.00%)
  1 (0.50%) low mild
  6 (3.00%) high mild
  1 (0.50%) high severe
zhu fu/16384/start      time:   [112.87 µs 112.98 µs 113.10 µs]
                        thrpt:  [138.15 MiB/s 138.30 MiB/s 138.43 MiB/s]
                 change:
                        time:   [+461.21% +461.73% +462.28%] (p = 0.00 < 0.05)
                        thrpt:  [-82.215% -82.198% -82.181%]
                        Performance has regressed.
Found 4 outliers among 200 measurements (2.00%)
  4 (2.00%) high mild
zhu fu/16384/centered   time:   [50.122 µs 50.177 µs 50.249 µs]
                        thrpt:  [310.95 MiB/s 311.40 MiB/s 311.74 MiB/s]
                 change:
                        time:   [+86.029% +86.268% +86.498%] (p = 0.00 < 0.05)
                        thrpt:  [-46.380% -46.314% -46.245%]
                        Performance has regressed.
Found 9 outliers among 200 measurements (4.50%)
  8 (4.00%) low mild
  1 (0.50%) high severe
```

Interestingly, centered is now much faster than the other two. Could analyzing this lead to performance improvements for the other two as well?
1 parent bc02011 commit f85280f

File tree

4 files changed

+79
-58
lines changed

4 files changed

+79
-58
lines changed

Cargo.lock

+7
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ exclude = [
1919

2020
[dependencies]
2121
itertools = { version = "0.12", default-features = false }
22+
unicode-segmentation = { version = "1", default-features = false }
2223
unicode-width = "0.1"
2324

2425
[dev-dependencies]

benches/benchmark.rs

+7-7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::time::Duration;
22

3-
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
3+
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
44
use unicode_truncate::UnicodeTruncateStr;
55

66
fn roughly_cut(str: &str, size: usize) -> &str {
@@ -26,14 +26,14 @@ fn criterion_benchmark(criterion: &mut Criterion) {
2626
.throughput(Throughput::Bytes(size as u64));
2727
let input = roughly_cut(TEXT, size);
2828
let max_width = input.len() / 2;
29-
group.bench_with_input("end", input, |bench, str| {
30-
bench.iter(|| str.unicode_truncate(max_width));
29+
group.bench_function("end", |bench| {
30+
bench.iter(|| black_box(input).unicode_truncate(black_box(max_width)));
3131
});
32-
group.bench_with_input("start", input, |bench, str| {
33-
bench.iter(|| str.unicode_truncate_start(max_width));
32+
group.bench_function("start", |bench| {
33+
bench.iter(|| black_box(input).unicode_truncate_start(black_box(max_width)));
3434
});
35-
group.bench_with_input("centered", input, |bench, str| {
36-
bench.iter(|| str.unicode_truncate_centered(max_width));
35+
group.bench_function("centered", |bench| {
36+
bench.iter(|| black_box(input).unicode_truncate_centered(black_box(max_width)));
3737
});
3838
group.finish();
3939
}

src/lib.rs

+64-51
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ assert_eq!(str.width(), 5);
4141
)]
4242

4343
use itertools::{merge_join_by, Either};
44-
use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
44+
use unicode_segmentation::UnicodeSegmentation;
45+
use unicode_width::UnicodeWidthStr;
4546

4647
/// Defines the alignment for truncation and padding.
4748
#[derive(PartialEq, Eq, Debug, Copy, Clone)]
@@ -150,28 +151,26 @@ impl UnicodeTruncateStr for str {
150151
#[inline]
151152
fn unicode_truncate(&self, max_width: usize) -> (&str, usize) {
152153
let (byte_index, new_width) = self
153-
.char_indices()
154-
// map to byte index and the width of char start at the index
155-
// control characters treated as of width 1
156-
// https://github.com/unicode-rs/unicode-width/pull/45
157-
.map(|(byte_index, char)| (byte_index, char.width().unwrap_or(1)))
154+
.grapheme_indices(true)
155+
// map to byte index and the width of grapheme at the index
156+
.map(|(byte_index, grapheme)| (byte_index, grapheme.width()))
158157
// chain a final element representing the position past the last char
159158
.chain(core::iter::once((self.len(), 0)))
160159
// fold to byte index and the width up to the index
161-
.scan(0, |sum: &mut usize, (byte_index, char_width)| {
162-
// byte_index is the start while the char_width is at the end. Current width is the
163-
// sum until now while the next byte_start width is including the current
164-
// char_width.
160+
.scan(0, |sum: &mut usize, (byte_index, grapheme_width)| {
161+
// byte_index is the start while the grapheme_width is at the end. Current width is
162+
// the sum until now while the next byte_index is including the current
163+
// grapheme_width.
165164
let current_width = *sum;
166-
*sum = sum.checked_add(char_width)?;
165+
*sum = sum.checked_add(grapheme_width)?;
167166
Some((byte_index, current_width))
168167
})
169168
// take the longest but still shorter than requested
170169
.take_while(|&(_, current_width)| current_width <= max_width)
171170
.last()
172171
.unwrap_or((0, 0));
173172

174-
// unwrap is safe as the index comes from char_indices
173+
// unwrap is safe as the index comes from grapheme_indices
175174
let result = self.get(..byte_index).unwrap();
176175
debug_assert_eq!(result.width(), new_width);
177176
(result, new_width)
@@ -180,32 +179,26 @@ impl UnicodeTruncateStr for str {
180179
#[inline]
181180
fn unicode_truncate_start(&self, max_width: usize) -> (&str, usize) {
182181
let (byte_index, new_width) = self
183-
.char_indices()
182+
.grapheme_indices(true)
184183
// instead of start checking from the start do so from the end
185184
.rev()
186-
// map to byte index and the width of char start at the index
187-
// control characters treated as of width 1
188-
// https://github.com/unicode-rs/unicode-width/pull/45
189-
.map(|(byte_index, char)| (byte_index, char.width().unwrap_or(1)))
190-
// skip any position with zero width, the cut won't happen at these points
191-
// this also helps with not including zero width char at the beginning
192-
.filter(|&(_, char_width)| char_width > 0)
185+
// map to byte index and the width of grapheme start at the index
186+
.map(|(byte_index, grapheme)| (byte_index, grapheme.width()))
193187
// fold to byte index and the width from end to the index
194-
.scan(0, |sum: &mut usize, (byte_index, char_width)| {
195-
*sum = sum.checked_add(char_width)?;
188+
.scan(0, |sum: &mut usize, (byte_index, grapheme_width)| {
189+
*sum = sum.checked_add(grapheme_width)?;
196190
Some((byte_index, *sum))
197191
})
198192
.take_while(|&(_, current_width)| current_width <= max_width)
199193
.last()
200194
.unwrap_or((self.len(), 0));
201195

202-
// unwrap is safe as the index comes from char_indices
196+
// unwrap is safe as the index comes from grapheme_indices
203197
let result = self.get(byte_index..).unwrap();
204198
debug_assert_eq!(result.width(), new_width);
205199
(result, new_width)
206200
}
207201

208-
#[allow(clippy::collapsible_else_if)]
209202
#[inline]
210203
fn unicode_truncate_centered(&self, max_width: usize) -> (&str, usize) {
211204
if max_width == 0 {
@@ -221,48 +214,40 @@ impl UnicodeTruncateStr for str {
221214
// unwrap is safe as original_width > max_width
222215
let min_removal_width = original_width.checked_sub(max_width).unwrap();
223216

224-
// around the half (min_removal_width - 2) to prevent accidentally removing more than needed
225-
// due to char width (max 2)
226-
let less_than_half = min_removal_width.saturating_sub(2) / 2;
217+
// Around the half to improve performance. In order to ensure the center grapheme stays
218+
// remove its max possible length. This assumes a grapheme width is always <= 10 (4 people
219+
// family emoji has width 8). This might end up not perfect on graphemes wider than this but
220+
// performance is more important here.
221+
let less_than_half = min_removal_width.saturating_sub(10) / 2;
227222

228223
let from_start = self
229-
.char_indices()
230-
// control characters treated as of width 1
231-
// https://github.com/unicode-rs/unicode-width/pull/45
232-
.map(|(byte_index, char)| (byte_index, char.width().unwrap_or(1)))
233-
// skip any position with zero width, the cut won't happen at these points
234-
// this also helps with removing zero width char at the beginning
235-
.filter(|&(_, char_width)| char_width > 0)
224+
.grapheme_indices(true)
225+
.map(|(byte_index, grapheme)| (byte_index, grapheme.width()))
236226
// fold to byte index and the width from start to the index (not including the current
237-
// char width)
227+
// grapheme width)
238228
.scan(
239229
(0usize, 0usize),
240-
|(sum, prev_width), (byte_index, char_width)| {
230+
|(sum, prev_width), (byte_index, grapheme_width)| {
241231
*sum = sum.checked_add(*prev_width)?;
242-
*prev_width = char_width;
232+
*prev_width = grapheme_width;
243233
Some((byte_index, *sum))
244234
},
245235
)
246236
// fast forward to around the half
247-
.skip_while(|&(_, removed)| min_removal_width > 2 && removed < less_than_half);
237+
.skip_while(|&(_, removed)| removed < less_than_half);
248238

249239
let from_end = self
250-
.char_indices()
251-
// control characters treated as of width 1
252-
// https://github.com/unicode-rs/unicode-width/pull/45
253-
.map(|(byte_index, char)| (byte_index, char.width().unwrap_or(1)))
254-
// skip any position with zero width, the cut won't happen at these points
255-
// this also helps with keeping zero width char at the end
256-
.filter(|&(_, char_width)| char_width > 0)
240+
.grapheme_indices(true)
241+
.map(|(byte_index, grapheme)| (byte_index, grapheme.width()))
257242
.rev()
258-
// fold to byte index and the width from end to the index (including the current char
259-
// width)
260-
.scan(0usize, |sum, (byte_index, char_width)| {
261-
*sum = sum.checked_add(char_width)?;
243+
// fold to byte index and the width from end to the index (including the current
244+
// grapheme width)
245+
.scan(0usize, |sum, (byte_index, grapheme_width)| {
246+
*sum = sum.checked_add(grapheme_width)?;
262247
Some((byte_index, *sum))
263248
})
264249
// fast forward to around the half
265-
.skip_while(|&(_, removed)| min_removal_width > 2 && removed < less_than_half);
250+
.skip_while(|&(_, removed)| removed < less_than_half);
266251

267252
let (start_index, end_index, removed_width) = merge_join_by(
268253
from_start,
@@ -294,7 +279,7 @@ impl UnicodeTruncateStr for str {
294279
// but a sane default is to remove everything (i.e. min_removal_width too large)
295280
.unwrap_or((0, 0, original_width));
296281

297-
// unwrap is safe as the index comes from char_indices
282+
// unwrap is safe as the index comes from grapheme_indices
298283
let result = self.get(start_index..end_index).unwrap();
299284
// unwrap is safe as removed is always smaller than total width
300285
let result_width = original_width.checked_sub(removed_width).unwrap();
@@ -396,6 +381,15 @@ mod tests {
396381
("y\u{0306}ey\u{0306}", 3)
397382
);
398383
}
384+
385+
#[test]
386+
fn family_stays_together() {
387+
let input = "123👨‍👩‍👧‍👦456";
388+
assert_eq!(input.unicode_truncate(4), ("123", 3));
389+
assert_eq!(input.unicode_truncate(8), ("123", 3));
390+
assert_eq!(input.unicode_truncate(12), ("123👨‍👩‍👧‍👦4", 12));
391+
assert_eq!(input.unicode_truncate(20), (input, 14));
392+
}
399393
}
400394

401395
mod truncate_start {
@@ -444,6 +438,15 @@ mod tests {
444438
// zero width character in the middle at the cutting boundary is removed
445439
assert_eq!("y\u{0306}es".unicode_truncate_start(2), ("es", 2));
446440
}
441+
442+
#[test]
443+
fn family_stays_together() {
444+
let input = "123👨‍👩‍👧‍👦456";
445+
assert_eq!(input.unicode_truncate_start(4), ("456", 3));
446+
assert_eq!(input.unicode_truncate_start(8), ("456", 3));
447+
assert_eq!(input.unicode_truncate_start(12), ("3👨‍👩‍👧‍👦456", 12));
448+
assert_eq!(input.unicode_truncate_start(20), (input, 14));
449+
}
447450
}
448451

449452
mod truncate_centered {
@@ -522,10 +525,20 @@ mod tests {
522525

523526
#[test]
524527
fn control_char() {
528+
use unicode_width::UnicodeWidthChar;
525529
assert_eq!("\u{0019}".width(), 1);
526530
assert_eq!('\u{0019}'.width(), None);
527531
assert_eq!("\u{0019}".unicode_truncate(2), ("\u{0019}", 1));
528532
}
533+
534+
#[test]
535+
fn family_stays_together() {
536+
let input = "123👨‍👩‍👧‍👦456";
537+
assert_eq!(input.unicode_truncate_centered(4), ("", 0));
538+
assert_eq!(input.unicode_truncate_centered(8), ("👨‍👩‍👧‍👦", 8));
539+
assert_eq!(input.unicode_truncate_centered(12), ("23👨‍👩‍👧‍👦45", 12));
540+
assert_eq!(input.unicode_truncate_centered(20), (input, 14));
541+
}
529542
}
530543

531544
#[test]

0 commit comments

Comments
 (0)