11
11
mod error;
12
12
mod iter;
13
13
mod traits;
14
+ mod validations;
14
15
15
16
use self :: pattern:: Pattern ;
16
17
use self :: pattern:: { DoubleEndedSearcher , ReverseSearcher , Searcher } ;
@@ -62,10 +63,15 @@ pub use iter::SplitAsciiWhitespace;
62
63
#[ unstable( feature = "split_inclusive" , issue = "72360" ) ]
63
64
use iter:: SplitInclusive ;
64
65
66
+ #[ unstable( feature = "str_internals" , issue = "none" ) ]
67
+ pub use validations:: next_code_point;
68
+
65
69
use iter:: MatchIndicesInternal ;
66
70
use iter:: SplitInternal ;
67
71
use iter:: { MatchesInternal , SplitNInternal } ;
68
72
73
+ use validations:: { run_utf8_validation, truncate_to_char_boundary} ;
74
+
69
75
/*
70
76
Section: Creating a string
71
77
*/
@@ -257,102 +263,6 @@ pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
257
263
unsafe { & mut * ( v as * mut [ u8 ] as * mut str ) }
258
264
}
259
265
260
- /// Returns the initial codepoint accumulator for the first byte.
261
- /// The first byte is special, only want bottom 5 bits for width 2, 4 bits
262
- /// for width 3, and 3 bits for width 4.
263
- #[ inline]
264
- fn utf8_first_byte ( byte : u8 , width : u32 ) -> u32 {
265
- ( byte & ( 0x7F >> width) ) as u32
266
- }
267
-
268
- /// Returns the value of `ch` updated with continuation byte `byte`.
269
- #[ inline]
270
- fn utf8_acc_cont_byte ( ch : u32 , byte : u8 ) -> u32 {
271
- ( ch << 6 ) | ( byte & CONT_MASK ) as u32
272
- }
273
-
274
- /// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
275
- /// bits `10`).
276
- #[ inline]
277
- fn utf8_is_cont_byte ( byte : u8 ) -> bool {
278
- ( byte & !CONT_MASK ) == TAG_CONT_U8
279
- }
280
-
281
- #[ inline]
282
- fn unwrap_or_0 ( opt : Option < & u8 > ) -> u8 {
283
- match opt {
284
- Some ( & byte) => byte,
285
- None => 0 ,
286
- }
287
- }
288
-
289
- /// Reads the next code point out of a byte iterator (assuming a
290
- /// UTF-8-like encoding).
291
- #[ unstable( feature = "str_internals" , issue = "none" ) ]
292
- #[ inline]
293
- pub fn next_code_point < ' a , I : Iterator < Item = & ' a u8 > > ( bytes : & mut I ) -> Option < u32 > {
294
- // Decode UTF-8
295
- let x = * bytes. next ( ) ?;
296
- if x < 128 {
297
- return Some ( x as u32 ) ;
298
- }
299
-
300
- // Multibyte case follows
301
- // Decode from a byte combination out of: [[[x y] z] w]
302
- // NOTE: Performance is sensitive to the exact formulation here
303
- let init = utf8_first_byte ( x, 2 ) ;
304
- let y = unwrap_or_0 ( bytes. next ( ) ) ;
305
- let mut ch = utf8_acc_cont_byte ( init, y) ;
306
- if x >= 0xE0 {
307
- // [[x y z] w] case
308
- // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
309
- let z = unwrap_or_0 ( bytes. next ( ) ) ;
310
- let y_z = utf8_acc_cont_byte ( ( y & CONT_MASK ) as u32 , z) ;
311
- ch = init << 12 | y_z;
312
- if x >= 0xF0 {
313
- // [x y z w] case
314
- // use only the lower 3 bits of `init`
315
- let w = unwrap_or_0 ( bytes. next ( ) ) ;
316
- ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ( y_z, w) ;
317
- }
318
- }
319
-
320
- Some ( ch)
321
- }
322
-
323
- /// Reads the last code point out of a byte iterator (assuming a
324
- /// UTF-8-like encoding).
325
- #[ inline]
326
- fn next_code_point_reverse < ' a , I > ( bytes : & mut I ) -> Option < u32 >
327
- where
328
- I : DoubleEndedIterator < Item = & ' a u8 > ,
329
- {
330
- // Decode UTF-8
331
- let w = match * bytes. next_back ( ) ? {
332
- next_byte if next_byte < 128 => return Some ( next_byte as u32 ) ,
333
- back_byte => back_byte,
334
- } ;
335
-
336
- // Multibyte case follows
337
- // Decode from a byte combination out of: [x [y [z w]]]
338
- let mut ch;
339
- let z = unwrap_or_0 ( bytes. next_back ( ) ) ;
340
- ch = utf8_first_byte ( z, 2 ) ;
341
- if utf8_is_cont_byte ( z) {
342
- let y = unwrap_or_0 ( bytes. next_back ( ) ) ;
343
- ch = utf8_first_byte ( y, 3 ) ;
344
- if utf8_is_cont_byte ( y) {
345
- let x = unwrap_or_0 ( bytes. next_back ( ) ) ;
346
- ch = utf8_first_byte ( x, 4 ) ;
347
- ch = utf8_acc_cont_byte ( ch, y) ;
348
- }
349
- ch = utf8_acc_cont_byte ( ch, z) ;
350
- }
351
- ch = utf8_acc_cont_byte ( ch, w) ;
352
-
353
- Some ( ch)
354
- }
355
-
356
266
impl_fn_for_zst ! {
357
267
/// A nameable, cloneable fn type
358
268
#[ derive( Clone ) ]
@@ -363,184 +273,6 @@ impl_fn_for_zst! {
363
273
} ;
364
274
}
365
275
366
- /*
367
- Section: UTF-8 validation
368
- */
369
-
370
- // use truncation to fit u64 into usize
371
- const NONASCII_MASK : usize = 0x80808080_80808080u64 as usize ;
372
-
373
- /// Returns `true` if any byte in the word `x` is nonascii (>= 128).
374
- #[ inline]
375
- fn contains_nonascii ( x : usize ) -> bool {
376
- ( x & NONASCII_MASK ) != 0
377
- }
378
-
379
- /// Walks through `v` checking that it's a valid UTF-8 sequence,
380
- /// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
381
- #[ inline( always) ]
382
- fn run_utf8_validation ( v : & [ u8 ] ) -> Result < ( ) , Utf8Error > {
383
- let mut index = 0 ;
384
- let len = v. len ( ) ;
385
-
386
- let usize_bytes = mem:: size_of :: < usize > ( ) ;
387
- let ascii_block_size = 2 * usize_bytes;
388
- let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 } ;
389
- let align = v. as_ptr ( ) . align_offset ( usize_bytes) ;
390
-
391
- while index < len {
392
- let old_offset = index;
393
- macro_rules! err {
394
- ( $error_len: expr) => {
395
- return Err ( Utf8Error { valid_up_to: old_offset, error_len: $error_len } ) ;
396
- } ;
397
- }
398
-
399
- macro_rules! next {
400
- ( ) => { {
401
- index += 1 ;
402
- // we needed data, but there was none: error!
403
- if index >= len {
404
- err!( None )
405
- }
406
- v[ index]
407
- } } ;
408
- }
409
-
410
- let first = v[ index] ;
411
- if first >= 128 {
412
- let w = UTF8_CHAR_WIDTH [ first as usize ] ;
413
- // 2-byte encoding is for codepoints \u{0080} to \u{07ff}
414
- // first C2 80 last DF BF
415
- // 3-byte encoding is for codepoints \u{0800} to \u{ffff}
416
- // first E0 A0 80 last EF BF BF
417
- // excluding surrogates codepoints \u{d800} to \u{dfff}
418
- // ED A0 80 to ED BF BF
419
- // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
420
- // first F0 90 80 80 last F4 8F BF BF
421
- //
422
- // Use the UTF-8 syntax from the RFC
423
- //
424
- // https://tools.ietf.org/html/rfc3629
425
- // UTF8-1 = %x00-7F
426
- // UTF8-2 = %xC2-DF UTF8-tail
427
- // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
428
- // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
429
- // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
430
- // %xF4 %x80-8F 2( UTF8-tail )
431
- match w {
432
- 2 => {
433
- if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
434
- err ! ( Some ( 1 ) )
435
- }
436
- }
437
- 3 => {
438
- match ( first, next ! ( ) ) {
439
- ( 0xE0 , 0xA0 ..=0xBF )
440
- | ( 0xE1 ..=0xEC , 0x80 ..=0xBF )
441
- | ( 0xED , 0x80 ..=0x9F )
442
- | ( 0xEE ..=0xEF , 0x80 ..=0xBF ) => { }
443
- _ => err ! ( Some ( 1 ) ) ,
444
- }
445
- if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
446
- err ! ( Some ( 2 ) )
447
- }
448
- }
449
- 4 => {
450
- match ( first, next ! ( ) ) {
451
- ( 0xF0 , 0x90 ..=0xBF ) | ( 0xF1 ..=0xF3 , 0x80 ..=0xBF ) | ( 0xF4 , 0x80 ..=0x8F ) => { }
452
- _ => err ! ( Some ( 1 ) ) ,
453
- }
454
- if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
455
- err ! ( Some ( 2 ) )
456
- }
457
- if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
458
- err ! ( Some ( 3 ) )
459
- }
460
- }
461
- _ => err ! ( Some ( 1 ) ) ,
462
- }
463
- index += 1 ;
464
- } else {
465
- // Ascii case, try to skip forward quickly.
466
- // When the pointer is aligned, read 2 words of data per iteration
467
- // until we find a word containing a non-ascii byte.
468
- if align != usize:: MAX && align. wrapping_sub ( index) % usize_bytes == 0 {
469
- let ptr = v. as_ptr ( ) ;
470
- while index < blocks_end {
471
- // SAFETY: since `align - index` and `ascii_block_size` are
472
- // multiples of `usize_bytes`, `block = ptr.add(index)` is
473
- // always aligned with a `usize` so it's safe to dereference
474
- // both `block` and `block.offset(1)`.
475
- unsafe {
476
- let block = ptr. add ( index) as * const usize ;
477
- // break if there is a nonascii byte
478
- let zu = contains_nonascii ( * block) ;
479
- let zv = contains_nonascii ( * block. offset ( 1 ) ) ;
480
- if zu | zv {
481
- break ;
482
- }
483
- }
484
- index += ascii_block_size;
485
- }
486
- // step from the point where the wordwise loop stopped
487
- while index < len && v[ index] < 128 {
488
- index += 1 ;
489
- }
490
- } else {
491
- index += 1 ;
492
- }
493
- }
494
- }
495
-
496
- Ok ( ( ) )
497
- }
498
-
499
- // https://tools.ietf.org/html/rfc3629
500
- static UTF8_CHAR_WIDTH : [ u8 ; 256 ] = [
501
- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
502
- 1 , // 0x1F
503
- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
504
- 1 , // 0x3F
505
- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
506
- 1 , // 0x5F
507
- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
508
- 1 , // 0x7F
509
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
510
- 0 , // 0x9F
511
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
512
- 0 , // 0xBF
513
- 0 , 0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
514
- 2 , // 0xDF
515
- 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , // 0xEF
516
- 4 , 4 , 4 , 4 , 4 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xFF
517
- ] ;
518
-
519
- /// Given a first byte, determines how many bytes are in this UTF-8 character.
520
- #[ unstable( feature = "str_internals" , issue = "none" ) ]
521
- #[ inline]
522
- pub fn utf8_char_width ( b : u8 ) -> usize {
523
- UTF8_CHAR_WIDTH [ b as usize ] as usize
524
- }
525
-
526
- /// Mask of the value bits of a continuation byte.
527
- const CONT_MASK : u8 = 0b0011_1111 ;
528
- /// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
529
- const TAG_CONT_U8 : u8 = 0b1000_0000 ;
530
-
531
- // truncate `&str` to length at most equal to `max`
532
- // return `true` if it were truncated, and the new str.
533
- fn truncate_to_char_boundary ( s : & str , mut max : usize ) -> ( bool , & str ) {
534
- if max >= s. len ( ) {
535
- ( false , s)
536
- } else {
537
- while !s. is_char_boundary ( max) {
538
- max -= 1 ;
539
- }
540
- ( true , & s[ ..max] )
541
- }
542
- }
543
-
544
276
#[ inline( never) ]
545
277
#[ cold]
546
278
#[ track_caller]
0 commit comments