Skip to content

Commit 90c813a

Browse files
committed
Move utf-8 validating helpers to new mod
1 parent 5f0d724 commit 90c813a

File tree

4 files changed

+288
-279
lines changed

4 files changed

+288
-279
lines changed

library/core/src/str/iter.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ use crate::slice::{self, Split as SliceSplit};
1212
use super::from_utf8_unchecked;
1313
use super::pattern::Pattern;
1414
use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher};
15+
use super::validations::{next_code_point, next_code_point_reverse, utf8_is_cont_byte};
1516
use super::LinesAnyMap;
16-
use super::{next_code_point, next_code_point_reverse, utf8_is_cont_byte};
1717
use super::{BytesIsNotEmpty, UnsafeBytesToStr};
1818
use super::{CharEscapeDebugContinue, CharEscapeDefault, CharEscapeUnicode};
1919
use super::{IsAsciiWhitespace, IsNotEmpty, IsWhitespace};

library/core/src/str/lossy.rs

+6-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
use crate::char;
22
use crate::fmt::{self, Write};
33
use crate::mem;
4-
use crate::str as core_str;
4+
5+
use super::from_utf8_unchecked;
6+
use super::validations::utf8_char_width;
57

68
/// Lossy UTF-8 string.
79
#[unstable(feature = "str_internals", issue = "none")]
@@ -66,14 +68,14 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
6668

6769
if byte < 128 {
6870
} else {
69-
let w = core_str::utf8_char_width(byte);
71+
let w = utf8_char_width(byte);
7072

7173
macro_rules! error {
7274
() => {{
7375
// SAFETY: We have checked up to `i` that source is valid UTF-8.
7476
unsafe {
7577
let r = Utf8LossyChunk {
76-
valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
78+
valid: from_utf8_unchecked(&self.source[0..i_]),
7779
broken: &self.source[i_..i],
7880
};
7981
self.source = &self.source[i..];
@@ -133,7 +135,7 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
133135

134136
let r = Utf8LossyChunk {
135137
// SAFETY: We have checked that the entire source is valid UTF-8.
136-
valid: unsafe { core_str::from_utf8_unchecked(self.source) },
138+
valid: unsafe { from_utf8_unchecked(self.source) },
137139
broken: &[],
138140
};
139141
self.source = &[];

library/core/src/str/mod.rs

+6-274
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
mod error;
1212
mod iter;
1313
mod traits;
14+
mod validations;
1415

1516
use self::pattern::Pattern;
1617
use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher};
@@ -62,10 +63,15 @@ pub use iter::SplitAsciiWhitespace;
6263
#[unstable(feature = "split_inclusive", issue = "72360")]
6364
use iter::SplitInclusive;
6465

66+
#[unstable(feature = "str_internals", issue = "none")]
67+
pub use validations::next_code_point;
68+
6569
use iter::MatchIndicesInternal;
6670
use iter::SplitInternal;
6771
use iter::{MatchesInternal, SplitNInternal};
6872

73+
use validations::{run_utf8_validation, truncate_to_char_boundary};
74+
6975
/*
7076
Section: Creating a string
7177
*/
@@ -257,102 +263,6 @@ pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
257263
unsafe { &mut *(v as *mut [u8] as *mut str) }
258264
}
259265

260-
/// Returns the initial codepoint accumulator for the first byte.
261-
/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
262-
/// for width 3, and 3 bits for width 4.
263-
#[inline]
264-
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
265-
(byte & (0x7F >> width)) as u32
266-
}
267-
268-
/// Returns the value of `ch` updated with continuation byte `byte`.
269-
#[inline]
270-
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
271-
(ch << 6) | (byte & CONT_MASK) as u32
272-
}
273-
274-
/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
275-
/// bits `10`).
276-
#[inline]
277-
fn utf8_is_cont_byte(byte: u8) -> bool {
278-
(byte & !CONT_MASK) == TAG_CONT_U8
279-
}
280-
281-
#[inline]
282-
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
283-
match opt {
284-
Some(&byte) => byte,
285-
None => 0,
286-
}
287-
}
288-
289-
/// Reads the next code point out of a byte iterator (assuming a
290-
/// UTF-8-like encoding).
291-
#[unstable(feature = "str_internals", issue = "none")]
292-
#[inline]
293-
pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
294-
// Decode UTF-8
295-
let x = *bytes.next()?;
296-
if x < 128 {
297-
return Some(x as u32);
298-
}
299-
300-
// Multibyte case follows
301-
// Decode from a byte combination out of: [[[x y] z] w]
302-
// NOTE: Performance is sensitive to the exact formulation here
303-
let init = utf8_first_byte(x, 2);
304-
let y = unwrap_or_0(bytes.next());
305-
let mut ch = utf8_acc_cont_byte(init, y);
306-
if x >= 0xE0 {
307-
// [[x y z] w] case
308-
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
309-
let z = unwrap_or_0(bytes.next());
310-
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
311-
ch = init << 12 | y_z;
312-
if x >= 0xF0 {
313-
// [x y z w] case
314-
// use only the lower 3 bits of `init`
315-
let w = unwrap_or_0(bytes.next());
316-
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
317-
}
318-
}
319-
320-
Some(ch)
321-
}
322-
323-
/// Reads the last code point out of a byte iterator (assuming a
324-
/// UTF-8-like encoding).
325-
#[inline]
326-
fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
327-
where
328-
I: DoubleEndedIterator<Item = &'a u8>,
329-
{
330-
// Decode UTF-8
331-
let w = match *bytes.next_back()? {
332-
next_byte if next_byte < 128 => return Some(next_byte as u32),
333-
back_byte => back_byte,
334-
};
335-
336-
// Multibyte case follows
337-
// Decode from a byte combination out of: [x [y [z w]]]
338-
let mut ch;
339-
let z = unwrap_or_0(bytes.next_back());
340-
ch = utf8_first_byte(z, 2);
341-
if utf8_is_cont_byte(z) {
342-
let y = unwrap_or_0(bytes.next_back());
343-
ch = utf8_first_byte(y, 3);
344-
if utf8_is_cont_byte(y) {
345-
let x = unwrap_or_0(bytes.next_back());
346-
ch = utf8_first_byte(x, 4);
347-
ch = utf8_acc_cont_byte(ch, y);
348-
}
349-
ch = utf8_acc_cont_byte(ch, z);
350-
}
351-
ch = utf8_acc_cont_byte(ch, w);
352-
353-
Some(ch)
354-
}
355-
356266
impl_fn_for_zst! {
357267
/// A nameable, cloneable fn type
358268
#[derive(Clone)]
@@ -363,184 +273,6 @@ impl_fn_for_zst! {
363273
};
364274
}
365275

366-
/*
367-
Section: UTF-8 validation
368-
*/
369-
370-
// use truncation to fit u64 into usize
371-
const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
372-
373-
/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
374-
#[inline]
375-
fn contains_nonascii(x: usize) -> bool {
376-
(x & NONASCII_MASK) != 0
377-
}
378-
379-
/// Walks through `v` checking that it's a valid UTF-8 sequence,
380-
/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
381-
#[inline(always)]
382-
fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
383-
let mut index = 0;
384-
let len = v.len();
385-
386-
let usize_bytes = mem::size_of::<usize>();
387-
let ascii_block_size = 2 * usize_bytes;
388-
let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
389-
let align = v.as_ptr().align_offset(usize_bytes);
390-
391-
while index < len {
392-
let old_offset = index;
393-
macro_rules! err {
394-
($error_len: expr) => {
395-
return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len });
396-
};
397-
}
398-
399-
macro_rules! next {
400-
() => {{
401-
index += 1;
402-
// we needed data, but there was none: error!
403-
if index >= len {
404-
err!(None)
405-
}
406-
v[index]
407-
}};
408-
}
409-
410-
let first = v[index];
411-
if first >= 128 {
412-
let w = UTF8_CHAR_WIDTH[first as usize];
413-
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
414-
// first C2 80 last DF BF
415-
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
416-
// first E0 A0 80 last EF BF BF
417-
// excluding surrogate codepoints \u{d800} to \u{dfff}
418-
// ED A0 80 to ED BF BF
419-
// 4-byte encoding is for codepoints \u{10000} to \u{10ffff}
420-
// first F0 90 80 80 last F4 8F BF BF
421-
//
422-
// Use the UTF-8 syntax from the RFC
423-
//
424-
// https://tools.ietf.org/html/rfc3629
425-
// UTF8-1 = %x00-7F
426-
// UTF8-2 = %xC2-DF UTF8-tail
427-
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
428-
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
429-
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
430-
// %xF4 %x80-8F 2( UTF8-tail )
431-
match w {
432-
2 => {
433-
if next!() & !CONT_MASK != TAG_CONT_U8 {
434-
err!(Some(1))
435-
}
436-
}
437-
3 => {
438-
match (first, next!()) {
439-
(0xE0, 0xA0..=0xBF)
440-
| (0xE1..=0xEC, 0x80..=0xBF)
441-
| (0xED, 0x80..=0x9F)
442-
| (0xEE..=0xEF, 0x80..=0xBF) => {}
443-
_ => err!(Some(1)),
444-
}
445-
if next!() & !CONT_MASK != TAG_CONT_U8 {
446-
err!(Some(2))
447-
}
448-
}
449-
4 => {
450-
match (first, next!()) {
451-
(0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
452-
_ => err!(Some(1)),
453-
}
454-
if next!() & !CONT_MASK != TAG_CONT_U8 {
455-
err!(Some(2))
456-
}
457-
if next!() & !CONT_MASK != TAG_CONT_U8 {
458-
err!(Some(3))
459-
}
460-
}
461-
_ => err!(Some(1)),
462-
}
463-
index += 1;
464-
} else {
465-
// Ascii case, try to skip forward quickly.
466-
// When the pointer is aligned, read 2 words of data per iteration
467-
// until we find a word containing a non-ascii byte.
468-
if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 {
469-
let ptr = v.as_ptr();
470-
while index < blocks_end {
471-
// SAFETY: since `align - index` and `ascii_block_size` are
472-
// multiples of `usize_bytes`, `block = ptr.add(index)` is
473-
// always aligned with a `usize` so it's safe to dereference
474-
// both `block` and `block.offset(1)`.
475-
unsafe {
476-
let block = ptr.add(index) as *const usize;
477-
// break if there is a nonascii byte
478-
let zu = contains_nonascii(*block);
479-
let zv = contains_nonascii(*block.offset(1));
480-
if zu | zv {
481-
break;
482-
}
483-
}
484-
index += ascii_block_size;
485-
}
486-
// step from the point where the wordwise loop stopped
487-
while index < len && v[index] < 128 {
488-
index += 1;
489-
}
490-
} else {
491-
index += 1;
492-
}
493-
}
494-
}
495-
496-
Ok(())
497-
}
498-
499-
// https://tools.ietf.org/html/rfc3629
500-
static UTF8_CHAR_WIDTH: [u8; 256] = [
501-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
502-
1, // 0x1F
503-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
504-
1, // 0x3F
505-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
506-
1, // 0x5F
507-
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
508-
1, // 0x7F
509-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
510-
0, // 0x9F
511-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
512-
0, // 0xBF
513-
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
514-
2, // 0xDF
515-
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xEF
516-
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xFF
517-
];
518-
519-
/// Given a first byte, determines how many bytes are in this UTF-8 character.
520-
#[unstable(feature = "str_internals", issue = "none")]
521-
#[inline]
522-
pub fn utf8_char_width(b: u8) -> usize {
523-
UTF8_CHAR_WIDTH[b as usize] as usize
524-
}
525-
526-
/// Mask of the value bits of a continuation byte.
527-
const CONT_MASK: u8 = 0b0011_1111;
528-
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
529-
const TAG_CONT_U8: u8 = 0b1000_0000;
530-
531-
// truncate `&str` to length at most equal to `max`
532-
// return `true` if it was truncated, and the new str.
533-
fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) {
534-
if max >= s.len() {
535-
(false, s)
536-
} else {
537-
while !s.is_char_boundary(max) {
538-
max -= 1;
539-
}
540-
(true, &s[..max])
541-
}
542-
}
543-
544276
#[inline(never)]
545277
#[cold]
546278
#[track_caller]

0 commit comments

Comments
 (0)