@@ -41,7 +41,8 @@ assert_eq!(str.width(), 5);
41
41
) ]
42
42
43
43
use itertools:: { merge_join_by, Either } ;
44
- use unicode_width:: { UnicodeWidthChar , UnicodeWidthStr } ;
44
+ use unicode_segmentation:: UnicodeSegmentation ;
45
+ use unicode_width:: UnicodeWidthStr ;
45
46
46
47
/// Defines the alignment for truncation and padding.
47
48
#[ derive( PartialEq , Eq , Debug , Copy , Clone ) ]
@@ -150,28 +151,26 @@ impl UnicodeTruncateStr for str {
150
151
#[ inline]
151
152
fn unicode_truncate ( & self , max_width : usize ) -> ( & str , usize ) {
152
153
let ( byte_index, new_width) = self
153
- . char_indices ( )
154
- // map to byte index and the width of char start at the index
155
- // control characters treated as of width 1
156
- // https://github.com/unicode-rs/unicode-width/pull/45
157
- . map ( |( byte_index, char) | ( byte_index, char. width ( ) . unwrap_or ( 1 ) ) )
154
+ . grapheme_indices ( true )
155
+ // map to byte index and the width of grapheme at the index
156
+ . map ( |( byte_index, grapheme) | ( byte_index, grapheme. width ( ) ) )
158
157
// chain a final element representing the position past the last char
159
158
. chain ( core:: iter:: once ( ( self . len ( ) , 0 ) ) )
160
159
// fold to byte index and the width up to the index
161
- . scan ( 0 , |sum : & mut usize , ( byte_index, char_width ) | {
162
- // byte_index is the start while the char_width is at the end. Current width is the
163
- // sum until now while the next byte_start width is including the current
164
- // char_width .
160
+ . scan ( 0 , |sum : & mut usize , ( byte_index, grapheme_width ) | {
161
+ // byte_index is the start while the grapheme_width is at the end. Current width is
162
+ // the sum until now while the next byte_index is including the current
163
+ // grapheme_width .
165
164
let current_width = * sum;
166
- * sum = sum. checked_add ( char_width ) ?;
165
+ * sum = sum. checked_add ( grapheme_width ) ?;
167
166
Some ( ( byte_index, current_width) )
168
167
} )
169
168
// take the longest prefix that is still not wider than requested
170
169
. take_while ( |& ( _, current_width) | current_width <= max_width)
171
170
. last ( )
172
171
. unwrap_or ( ( 0 , 0 ) ) ;
173
172
174
- // unwrap is safe as the index comes from char_indices
173
+ // unwrap is safe as the index comes from grapheme_indices
175
174
let result = self . get ( ..byte_index) . unwrap ( ) ;
176
175
debug_assert_eq ! ( result. width( ) , new_width) ;
177
176
( result, new_width)
@@ -180,32 +179,26 @@ impl UnicodeTruncateStr for str {
180
179
/// Truncates the string from the start, keeping the longest suffix that fits into `max_width`.
///
/// Returns the kept suffix together with its accumulated display width. When not even the last
/// grapheme fits (or the string is empty) the fallback `(self.len(), 0)` selects the empty
/// suffix with width 0.
//
// NOTE(review): this block is shown as a diff hunk. Lines prefixed with `-` are the removed
// char based implementation, lines prefixed with `+` are the grapheme based replacement, and
// the bare numbers are old/new line-number gutter residue. The comments below describe the
// `+` version.
#[ inline]
181
180
fn unicode_truncate_start ( & self , max_width : usize ) -> ( & str , usize ) {
182
181
let ( byte_index, new_width) = self
183
- . char_indices ( )
182
+ . grapheme_indices ( true )
184
183
// walk the graphemes from the end of the string instead of from the start
185
184
. rev ( )
186
- // map to byte index and the width of char start at the index
187
- // control characters treated as of width 1
188
- // https://github.com/unicode-rs/unicode-width/pull/45
189
- . map ( |( byte_index, char) | ( byte_index, char. width ( ) . unwrap_or ( 1 ) ) )
190
- // skip any position with zero width, the cut won't happen at these points
191
- // this also helps with not including zero width char at the beginning
192
- . filter ( |& ( _, char_width) | char_width > 0 )
185
+ // map to the byte index where the grapheme starts and the width of that grapheme
186
+ . map ( |( byte_index, grapheme) | ( byte_index, grapheme. width ( ) ) )
193
187
// fold to the byte index and the width accumulated from the end of the string down to that
// index (including the grapheme that starts there)
194
- . scan ( 0 , |sum : & mut usize , ( byte_index, char_width ) | {
195
- * sum = sum. checked_add ( char_width ) ?;
188
+ . scan ( 0 , |sum : & mut usize , ( byte_index, grapheme_width ) | {
189
+ * sum = sum. checked_add ( grapheme_width ) ?;
196
190
// checked_add together with `?` ends the scan instead of overflowing the accumulated width
Some ( ( byte_index, * sum) )
197
191
} )
198
192
// keep the candidates that still fit; the last one is the longest suffix within max_width
. take_while ( |& ( _, current_width) | current_width <= max_width)
199
193
. last ( )
200
194
// nothing fits: cut at the very end of the string, which yields an empty result of width 0
. unwrap_or ( ( self . len ( ) , 0 ) ) ;
201
195
202
- // unwrap is safe as the index comes from char_indices
196
+ // unwrap is safe as the index comes from grapheme_indices (or is self.len())
203
197
let result = self . get ( byte_index..) . unwrap ( ) ;
204
198
debug_assert_eq ! ( result. width( ) , new_width) ;
205
199
( result, new_width)
206
200
}
207
201
208
- #[ allow( clippy:: collapsible_else_if) ]
209
202
#[ inline]
210
203
fn unicode_truncate_centered ( & self , max_width : usize ) -> ( & str , usize ) {
211
204
if max_width == 0 {
@@ -221,48 +214,40 @@ impl UnicodeTruncateStr for str {
221
214
// unwrap is safe as original_width > max_width
222
215
let min_removal_width = original_width. checked_sub ( max_width) . unwrap ( ) ;
223
216
224
- // around the half (min_removal_width - 2) to prevent accidentally removing more than needed
225
- // due to char width (max 2)
226
- let less_than_half = min_removal_width. saturating_sub ( 2 ) / 2 ;
217
+ // Fast forward to around the half of the removal as a performance optimization. To ensure the
218
+ // center grapheme is kept, back off by the maximum width a grapheme is assumed to have: this
219
+ // assumes a grapheme width is always <= 10 (the 4 people family emoji has width 8). The result
220
+ // might be imperfect for graphemes wider than this, but performance is more important here.
221
+ let less_than_half = min_removal_width. saturating_sub ( 10 ) / 2 ;
227
222
228
223
let from_start = self
229
- . char_indices ( )
230
- // control characters treated as of width 1
231
- // https://github.com/unicode-rs/unicode-width/pull/45
232
- . map ( |( byte_index, char) | ( byte_index, char. width ( ) . unwrap_or ( 1 ) ) )
233
- // skip any position with zero width, the cut won't happen at these points
234
- // this also helps with removing zero width char at the beginning
235
- . filter ( |& ( _, char_width) | char_width > 0 )
224
+ . grapheme_indices ( true )
225
+ . map ( |( byte_index, grapheme) | ( byte_index, grapheme. width ( ) ) )
236
226
// fold to byte index and the width from start to the index (not including the current
237
- // char width)
227
+ // grapheme width)
238
228
. scan (
239
229
( 0usize , 0usize ) ,
240
- |( sum, prev_width) , ( byte_index, char_width ) | {
230
+ |( sum, prev_width) , ( byte_index, grapheme_width ) | {
241
231
* sum = sum. checked_add ( * prev_width) ?;
242
- * prev_width = char_width ;
232
+ * prev_width = grapheme_width ;
243
233
Some ( ( byte_index, * sum) )
244
234
} ,
245
235
)
246
236
// fast forward to around the half
247
- . skip_while ( |& ( _, removed) | min_removal_width > 2 && removed < less_than_half) ;
237
+ . skip_while ( |& ( _, removed) | removed < less_than_half) ;
248
238
249
239
let from_end = self
250
- . char_indices ( )
251
- // control characters treated as of width 1
252
- // https://github.com/unicode-rs/unicode-width/pull/45
253
- . map ( |( byte_index, char) | ( byte_index, char. width ( ) . unwrap_or ( 1 ) ) )
254
- // skip any position with zero width, the cut won't happen at these points
255
- // this also helps with keeping zero width char at the end
256
- . filter ( |& ( _, char_width) | char_width > 0 )
240
+ . grapheme_indices ( true )
241
+ . map ( |( byte_index, grapheme) | ( byte_index, grapheme. width ( ) ) )
257
242
. rev ( )
258
- // fold to byte index and the width from end to the index (including the current char
259
- // width)
260
- . scan ( 0usize , |sum, ( byte_index, char_width ) | {
261
- * sum = sum. checked_add ( char_width ) ?;
243
+ // fold to byte index and the width from end to the index (including the current
244
+ // grapheme width)
245
+ . scan ( 0usize , |sum, ( byte_index, grapheme_width ) | {
246
+ * sum = sum. checked_add ( grapheme_width ) ?;
262
247
Some ( ( byte_index, * sum) )
263
248
} )
264
249
// fast forward to around the half
265
- . skip_while ( |& ( _, removed) | min_removal_width > 2 && removed < less_than_half) ;
250
+ . skip_while ( |& ( _, removed) | removed < less_than_half) ;
266
251
267
252
let ( start_index, end_index, removed_width) = merge_join_by (
268
253
from_start,
@@ -294,7 +279,7 @@ impl UnicodeTruncateStr for str {
294
279
// but a sane default is to remove everything (i.e. min_removal_width too large)
295
280
. unwrap_or ( ( 0 , 0 , original_width) ) ;
296
281
297
- // unwrap is safe as the index comes from char_indices
282
+ // unwrap is safe as the index comes from grapheme_indices
298
283
let result = self . get ( start_index..end_index) . unwrap ( ) ;
299
284
// unwrap is safe as removed is always smaller than total width
300
285
let result_width = original_width. checked_sub ( removed_width) . unwrap ( ) ;
@@ -396,6 +381,15 @@ mod tests {
396
381
( "y\u{0306} ey\u{0306} " , 3 )
397
382
) ;
398
383
}
384
+
385
#[test]
fn family_stays_together() {
    // The family emoji is four emoji joined by ZERO WIDTH JOINER (U+200D), forming a single
    // grapheme of width 8. The joiners are written as escapes because they are invisible and
    // easily lost when the source is copied or rendered; without them the literal would be
    // four separate graphemes and the assertions below could not hold.
    let input = "123👨\u{200D}👩\u{200D}👧\u{200D}👦456";

    // 3 + 8 exceeds both limits, so the cut has to happen in front of the whole family
    assert_eq!(input.unicode_truncate(4), ("123", 3));
    assert_eq!(input.unicode_truncate(8), ("123", 3));
    // 3 + 8 + 1 fits exactly
    assert_eq!(
        input.unicode_truncate(12),
        ("123👨\u{200D}👩\u{200D}👧\u{200D}👦4", 12)
    );
    // wider than the input: everything is kept
    assert_eq!(input.unicode_truncate(20), (input, 14));
}
399
393
}
400
394
401
395
mod truncate_start {
@@ -444,6 +438,15 @@ mod tests {
444
438
// zero width character in the middle at the cutting boundary is removed
445
439
assert_eq ! ( "y\u{0306} es" . unicode_truncate_start( 2 ) , ( "es" , 2 ) ) ;
446
440
}
441
+
442
#[test]
fn family_stays_together() {
    // The family emoji is four emoji joined by ZERO WIDTH JOINER (U+200D), forming a single
    // grapheme of width 8. The joiners are written as escapes because they are invisible and
    // easily lost when the source is copied or rendered; without them the literal would be
    // four separate graphemes and the assertions below could not hold.
    let input = "123👨\u{200D}👩\u{200D}👧\u{200D}👦456";

    // 8 + 3 exceeds both limits, so the cut has to happen behind the whole family
    assert_eq!(input.unicode_truncate_start(4), ("456", 3));
    assert_eq!(input.unicode_truncate_start(8), ("456", 3));
    // 1 + 8 + 3 fits exactly
    assert_eq!(
        input.unicode_truncate_start(12),
        ("3👨\u{200D}👩\u{200D}👧\u{200D}👦456", 12)
    );
    // wider than the input: everything is kept
    assert_eq!(input.unicode_truncate_start(20), (input, 14));
}
447
450
}
448
451
449
452
mod truncate_centered {
@@ -522,10 +525,20 @@ mod tests {
522
525
523
526
// Pins how a control character (U+0019) is measured: the `str` is asserted to have width 1 while
// the `char` reports no width (`None`), and truncating to width 2 is expected to keep the whole
// literal with width 1.
// NOTE(review): the string literals contain a space after the `\u{0019}` escape. Tokens are
// space-separated throughout this view, so this looks like a rendering artifact — confirm
// against the upstream file before relying on the exact literal.
#[ test]
524
527
fn control_char ( ) {
528
// the char level trait is only needed in this test, for the `char` width assertion below
+ use unicode_width:: UnicodeWidthChar ;
525
529
assert_eq ! ( "\u{0019} " . width( ) , 1 ) ;
526
530
assert_eq ! ( '\u{0019}' . width( ) , None ) ;
527
531
assert_eq ! ( "\u{0019} " . unicode_truncate( 2 ) , ( "\u{0019} " , 1 ) ) ;
528
532
}
533
+
534
#[test]
fn family_stays_together() {
    // The family emoji is four emoji joined by ZERO WIDTH JOINER (U+200D), forming a single
    // grapheme of width 8. The joiners are written as escapes because they are invisible and
    // easily lost when the source is copied or rendered; without them the literal would be
    // four separate graphemes and the assertions below could not hold.
    let input = "123👨\u{200D}👩\u{200D}👧\u{200D}👦456";

    // too narrow for the family: the expected result is empty rather than a split emoji
    assert_eq!(input.unicode_truncate_centered(4), ("", 0));
    // exactly the family remains
    assert_eq!(
        input.unicode_truncate_centered(8),
        ("👨\u{200D}👩\u{200D}👧\u{200D}👦", 8)
    );
    // one column is removed on each side
    assert_eq!(
        input.unicode_truncate_centered(12),
        ("23👨\u{200D}👩\u{200D}👧\u{200D}👦45", 12)
    );
    // wider than the input: everything is kept
    assert_eq!(input.unicode_truncate_centered(20), (input, 14));
}
529
542
}
530
543
531
544
#[ test]
0 commit comments