Skip to content

Commit 5d02e96

Browse files
committed
Rewrite unicode_truncate_centered using iterator
Cleaner code and a nice ~9% performance boost
1 parent a9e7450 commit 5d02e96

File tree

2 files changed

+112
-54
lines changed

2 files changed

+112
-54
lines changed

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ exclude = [
2121
travis-ci = { repository = "Aetf/unicode-truncate" }
2222

2323
[dependencies]
24+
itertools = { version = "0.12.1", default-features = false }
2425
unicode-width = "0.1"
2526

2627
[dev-dependencies]

src/lib.rs

+111-54
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ assert_eq!(str.width(), 5);
3939
"##
4040
)]
4141

42+
use itertools::{merge_join_by, Either};
4243
use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
4344

4445
/// Defines the alignment for truncation and padding.
@@ -206,60 +207,87 @@ impl UnicodeTruncateStr for str {
206207
return ("", 0);
207208
}
208209

209-
let mut current_width: usize = self.width();
210-
if current_width <= max_width {
211-
return (self, current_width);
210+
let original_width = self.width();
211+
if original_width <= max_width {
212+
return (self, original_width);
212213
}
213214

214-
let mut iter = self
215+
// We need to remove at least this much
216+
let min_removal_width = original_width - max_width;
217+
218+
let from_start = self
215219
.char_indices()
216-
// map to byte index and the width of char start at the index
217220
.map(|(byte_index, char)| (byte_index, char.width().unwrap_or(0)))
218-
// zero width doesn't need to be checked, they are always kept
219-
.filter(|&(_, char_width)| char_width > 0);
220-
221-
let mut start_is_truncated = false;
222-
let mut end_index = self.len();
223-
224-
// Amount of things taken from start / end. Tries to balance out to keep the center center.
225-
let mut balance: isize = 0;
226-
227-
while current_width > max_width {
228-
if balance >= 0 {
229-
if let Some((byte_index, char_width)) = iter.next_back() {
230-
current_width = current_width
231-
.checked_sub(char_width)
232-
.expect("total - parts shouldnt be less than 0");
233-
end_index = byte_index;
234-
balance = balance.saturating_sub(char_width as isize);
235-
} else {
236-
break;
237-
}
238-
} else {
239-
if let Some((_, char_width)) = iter.next() {
240-
current_width = current_width
241-
.checked_sub(char_width)
242-
.expect("total - parts shouldnt be less than 0");
243-
start_is_truncated = true;
244-
balance = balance.saturating_add(char_width as isize);
245-
} else {
246-
break;
221+
// skip any position with zero width, the cut won't happen at these points
222+
// this also helps with removing zero width char at the beginning
223+
.filter(|&(_, char_width)| char_width > 0)
224+
// fold to byte index and the width from start to the index (not including the current
225+
// char width)
226+
.scan(
227+
(0usize, 0usize),
228+
|(sum, prev_width), (byte_index, char_width)| {
229+
*sum = sum.checked_add(*prev_width)?;
230+
*prev_width = char_width;
231+
Some((byte_index, *sum))
232+
},
233+
)
234+
// fast forward to around the half (min_removal_width - 2) to take into account
235+
// accidentally removing more than needed due to char width (max 2)
236+
.skip_while(|&(_, removed)| {
237+
min_removal_width > 2 && removed < (min_removal_width - 2) / 2
238+
});
239+
240+
let from_end = self
241+
.char_indices()
242+
.map(|(byte_index, char)| (byte_index, char.width().unwrap_or(0)))
243+
// skip any position with zero width, the cut won't happen at these points
244+
// this also helps with keeping zero width char at the end
245+
.filter(|&(_, char_width)| char_width > 0)
246+
.rev()
247+
// fold to byte index and the width from end to the index (including the current char width)
248+
.scan(0usize, |sum, (byte_index, char_width)| {
249+
*sum = sum.checked_add(char_width)?;
250+
Some((byte_index, *sum))
251+
})
252+
// fast forward to around the half (min_removal_width - 2) to take into account
253+
// accidentally removing more than needed due to char width (max 2)
254+
.skip_while(|&(_, removed)| {
255+
min_removal_width > 2 && removed < (min_removal_width - 2 + 1) / 2
256+
});
257+
258+
let (start_index, end_index, removed_width) = merge_join_by(
259+
from_start,
260+
from_end,
261+
// taking from either left or right iter depending on which side has less removed width
262+
|&(_, start_removed), &(_, end_removed)| start_removed < end_removed,
263+
)
264+
// remember the last left or right and combine them to one sequence of operations
265+
.scan(
266+
(0usize, 0usize, 0usize, 0usize),
267+
|(start_removed, end_removed, start_index, end_index), position| {
268+
match position {
269+
Either::Left((idx, removed)) => {
270+
*start_index = idx;
271+
*start_removed = removed;
272+
}
273+
Either::Right((idx, removed)) => {
274+
*end_index = idx;
275+
*end_removed = removed;
276+
}
247277
}
248-
}
249-
}
250-
251-
// When truncation happened at the start then get the next byte_index as thats where it
252-
// actually starts. Reason: index is where the char starts, not where it ends.
253-
let start_index = if start_is_truncated {
254-
iter.next().map_or(end_index, |(byte_index, _)| byte_index)
255-
} else {
256-
0
257-
};
278+
Some((*start_index, *end_index, *start_removed + *end_removed))
279+
},
280+
)
281+
.find(|&(_, _, removed)| removed >= min_removal_width)
282+
// should not happen as the removed width is not larger than the original width
283+
// but a sane default is to remove everything (i.e. min_removal_width too large)
284+
.unwrap_or((0, 0, original_width));
258285

259286
// unwrap is safe as the index comes from char_indices
260287
let result = self.get(start_index..end_index).unwrap();
261-
debug_assert_eq!(result.width(), current_width);
262-
(result, current_width)
288+
// unwrap is safe as removed is never larger than the total width
289+
let result_width = original_width.checked_sub(removed_width).unwrap();
290+
(result, result_width)
263291
}
264292

265293
#[cfg(feature = "std")]
@@ -347,7 +375,10 @@ mod tests {
347375
#[test]
348376
fn keep_zero_width_char_at_boundary() {
349377
// zero width character at end is preserved
350-
assert_eq!("y\u{0306}ey\u{0306}s".unicode_truncate(3), ("y\u{0306}ey\u{0306}", 3));
378+
assert_eq!(
379+
"y\u{0306}ey\u{0306}s".unicode_truncate(3),
380+
("y\u{0306}ey\u{0306}", 3)
381+
);
351382
}
352383
}
353384

@@ -386,7 +417,10 @@ mod tests {
386417
#[test]
387418
fn zero_width_char_in_middle() {
388419
// zero width character in middle is preserved
389-
assert_eq!("y\u{0306}ey\u{0306}s".unicode_truncate_start(2), ("y\u{0306}s", 2));
420+
assert_eq!(
421+
"y\u{0306}ey\u{0306}s".unicode_truncate_start(2),
422+
("y\u{0306}s", 2)
423+
);
390424
}
391425

392426
#[test]
@@ -418,27 +452,50 @@ mod tests {
418452

419453
#[test]
420454
fn at_boundary() {
421-
assert_eq!("boundary".unicode_truncate_centered(5), ("ounda", 5));
422-
assert_eq!("你好吗".unicode_truncate_centered(4), ("你好", 4));
455+
assert_eq!(
456+
"boundaryboundary".unicode_truncate_centered(5),
457+
("arybo", 5)
458+
);
459+
assert_eq!(
460+
"你好吗你好吗你好吗".unicode_truncate_centered(4),
461+
("你好", 4)
462+
);
423463
}
424464

425465
#[test]
426466
fn not_boundary() {
427-
assert_eq!("你好吗".unicode_truncate_centered(3), ("", 2));
428-
assert_eq!("你好吗".unicode_truncate_centered(1), ("", 0));
467+
assert_eq!("你好吗你好吗".unicode_truncate_centered(3), ("", 2));
468+
assert_eq!("你好吗你好吗".unicode_truncate_centered(1), ("", 0));
429469
}
430470

431471
#[test]
432472
fn zero_width_char_in_middle() {
433473
// zero width character in middle is preserved
434-
assert_eq!("yy\u{0306}es".unicode_truncate_centered(2), ("y\u{0306}e", 2));
474+
assert_eq!(
475+
"yy\u{0306}es".unicode_truncate_centered(2),
476+
("y\u{0306}e", 2)
477+
);
435478
}
436479

437480
#[test]
438481
fn zero_width_char_at_boundary() {
439482
// zero width character at the cutting boundary in the start is removed
440483
// but those at the end are kept.
441-
assert_eq!("y\u{0306}ey\u{0306}y\u{0306}".unicode_truncate_centered(2), ("ey\u{0306}", 2));
484+
assert_eq!(
485+
"y\u{0306}ea\u{0306}b\u{0306}y\u{0306}ea\u{0306}b\u{0306}"
486+
.unicode_truncate_centered(2),
487+
("b\u{0306}y\u{0306}", 2)
488+
);
489+
assert_eq!(
490+
"ay\u{0306}ea\u{0306}b\u{0306}y\u{0306}ea\u{0306}b\u{0306}"
491+
.unicode_truncate_centered(2),
492+
("a\u{0306}b\u{0306}", 2)
493+
);
494+
assert_eq!(
495+
"y\u{0306}ea\u{0306}b\u{0306}y\u{0306}ea\u{0306}b\u{0306}a"
496+
.unicode_truncate_centered(2),
497+
("b\u{0306}y\u{0306}", 2)
498+
);
442499
}
443500
}
444501

0 commit comments

Comments
 (0)