Move utf-8 validating helpers to new mod

tesuji · tesuji · commit 90c813a0f0b5 · 2020-09-26T05:20:53.000Z
diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs
@@ -12,8 +12,8 @@ use crate::slice::{self, Split as SliceSplit};
 use super::from_utf8_unchecked;
 use super::pattern::Pattern;
 use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher};
+use super::validations::{next_code_point, next_code_point_reverse, utf8_is_cont_byte};
 use super::LinesAnyMap;
-use super::{next_code_point, next_code_point_reverse, utf8_is_cont_byte};
 use super::{BytesIsNotEmpty, UnsafeBytesToStr};
 use super::{CharEscapeDebugContinue, CharEscapeDefault, CharEscapeUnicode};
 use super::{IsAsciiWhitespace, IsNotEmpty, IsWhitespace};
diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs
@@ -1,7 +1,9 @@
 use crate::char;
 use crate::fmt::{self, Write};
 use crate::mem;
-use crate::str as core_str;
+
+use super::from_utf8_unchecked;
+use super::validations::utf8_char_width;
 
 /// Lossy UTF-8 string.
 #[unstable(feature = "str_internals", issue = "none")]
@@ -66,14 +68,14 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
 
             if byte < 128 {
             } else {
-                let w = core_str::utf8_char_width(byte);
+                let w = utf8_char_width(byte);
 
                 macro_rules! error {
                     () => {{
                         // SAFETY: We have checked up to `i` that source is valid UTF-8.
                         unsafe {
                             let r = Utf8LossyChunk {
-                                valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
+                                valid: from_utf8_unchecked(&self.source[0..i_]),
                                 broken: &self.source[i_..i],
                             };
                             self.source = &self.source[i..];
@@ -133,7 +135,7 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
 
         let r = Utf8LossyChunk {
             // SAFETY: We have checked that the entire source is valid UTF-8.
-            valid: unsafe { core_str::from_utf8_unchecked(self.source) },
+            valid: unsafe { from_utf8_unchecked(self.source) },
             broken: &[],
         };
         self.source = &[];
diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs
@@ -11,6 +11,7 @@
 mod error;
 mod iter;
 mod traits;
+mod validations;
 
 use self::pattern::Pattern;
 use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher};
@@ -62,10 +63,15 @@ pub use iter::SplitAsciiWhitespace;
 #[unstable(feature = "split_inclusive", issue = "72360")]
 use iter::SplitInclusive;
 
+#[unstable(feature = "str_internals", issue = "none")]
+pub use validations::next_code_point;
+
 use iter::MatchIndicesInternal;
 use iter::SplitInternal;
 use iter::{MatchesInternal, SplitNInternal};
 
+use validations::{run_utf8_validation, truncate_to_char_boundary};
+
 /*
 Section: Creating a string
 */
@@ -257,102 +263,6 @@ pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
     unsafe { &mut *(v as *mut [u8] as *mut str) }
 }
 
-/// Returns the initial codepoint accumulator for the first byte.
-/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
-/// for width 3, and 3 bits for width 4.
-#[inline]
-fn utf8_first_byte(byte: u8, width: u32) -> u32 {
-    (byte & (0x7F >> width)) as u32
-}
-
-/// Returns the value of `ch` updated with continuation byte `byte`.
-#[inline]
-fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
-    (ch << 6) | (byte & CONT_MASK) as u32
-}
-
-/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
-/// bits `10`).
-#[inline]
-fn utf8_is_cont_byte(byte: u8) -> bool {
-    (byte & !CONT_MASK) == TAG_CONT_U8
-}
-
-#[inline]
-fn unwrap_or_0(opt: Option<&u8>) -> u8 {
-    match opt {
-        Some(&byte) => byte,
-        None => 0,
-    }
-}
-
-/// Reads the next code point out of a byte iterator (assuming a
-/// UTF-8-like encoding).
-#[unstable(feature = "str_internals", issue = "none")]
-#[inline]
-pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
-    // Decode UTF-8
-    let x = *bytes.next()?;
-    if x < 128 {
-        return Some(x as u32);
-    }
-
-    // Multibyte case follows
-    // Decode from a byte combination out of: [[[x y] z] w]
-    // NOTE: Performance is sensitive to the exact formulation here
-    let init = utf8_first_byte(x, 2);
-    let y = unwrap_or_0(bytes.next());
-    let mut ch = utf8_acc_cont_byte(init, y);
-    if x >= 0xE0 {
-        // [[x y z] w] case
-        // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
-        let z = unwrap_or_0(bytes.next());
-        let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
-        ch = init << 12 | y_z;
-        if x >= 0xF0 {
-            // [x y z w] case
-            // use only the lower 3 bits of `init`
-            let w = unwrap_or_0(bytes.next());
-            ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
-        }
-    }
-
-    Some(ch)
-}
-
-/// Reads the last code point out of a byte iterator (assuming a
-/// UTF-8-like encoding).
-#[inline]
-fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
-where
-    I: DoubleEndedIterator<Item = &'a u8>,
-{
-    // Decode UTF-8
-    let w = match *bytes.next_back()? {
-        next_byte if next_byte < 128 => return Some(next_byte as u32),
-        back_byte => back_byte,
-    };
-
-    // Multibyte case follows
-    // Decode from a byte combination out of: [x [y [z w]]]
-    let mut ch;
-    let z = unwrap_or_0(bytes.next_back());
-    ch = utf8_first_byte(z, 2);
-    if utf8_is_cont_byte(z) {
-        let y = unwrap_or_0(bytes.next_back());
-        ch = utf8_first_byte(y, 3);
-        if utf8_is_cont_byte(y) {
-            let x = unwrap_or_0(bytes.next_back());
-            ch = utf8_first_byte(x, 4);
-            ch = utf8_acc_cont_byte(ch, y);
-        }
-        ch = utf8_acc_cont_byte(ch, z);
-    }
-    ch = utf8_acc_cont_byte(ch, w);
-
-    Some(ch)
-}
-
 impl_fn_for_zst! {
     /// A nameable, cloneable fn type
     #[derive(Clone)]
@@ -363,184 +273,6 @@ impl_fn_for_zst! {
     };
 }
 
-/*
-Section: UTF-8 validation
-*/
-
-// use truncation to fit u64 into usize
-const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
-
-/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
-#[inline]
-fn contains_nonascii(x: usize) -> bool {
-    (x & NONASCII_MASK) != 0
-}
-
-/// Walks through `v` checking that it's a valid UTF-8 sequence,
-/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
-#[inline(always)]
-fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
-    let mut index = 0;
-    let len = v.len();
-
-    let usize_bytes = mem::size_of::<usize>();
-    let ascii_block_size = 2 * usize_bytes;
-    let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
-    let align = v.as_ptr().align_offset(usize_bytes);
-
-    while index < len {
-        let old_offset = index;
-        macro_rules! err {
-            ($error_len: expr) => {
-                return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len });
-            };
-        }
-
-        macro_rules! next {
-            () => {{
-                index += 1;
-                // we needed data, but there was none: error!
-                if index >= len {
-                    err!(None)
-                }
-                v[index]
-            }};
-        }
-
-        let first = v[index];
-        if first >= 128 {
-            let w = UTF8_CHAR_WIDTH[first as usize];
-            // 2-byte encoding is for codepoints  \u{0080} to  \u{07ff}
-            //        first  C2 80        last DF BF
-            // 3-byte encoding is for codepoints  \u{0800} to  \u{ffff}
-            //        first  E0 A0 80     last EF BF BF
-            //   excluding surrogates codepoints  \u{d800} to  \u{dfff}
-            //               ED A0 80 to       ED BF BF
-            // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
-            //        first  F0 90 80 80  last F4 8F BF BF
-            //
-            // Use the UTF-8 syntax from the RFC
-            //
-            // https://tools.ietf.org/html/rfc3629
-            // UTF8-1      = %x00-7F
-            // UTF8-2      = %xC2-DF UTF8-tail
-            // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
-            //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
-            // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
-            //               %xF4 %x80-8F 2( UTF8-tail )
-            match w {
-                2 => {
-                    if next!() & !CONT_MASK != TAG_CONT_U8 {
-                        err!(Some(1))
-                    }
-                }
-                3 => {
-                    match (first, next!()) {
-                        (0xE0, 0xA0..=0xBF)
-                        | (0xE1..=0xEC, 0x80..=0xBF)
-                        | (0xED, 0x80..=0x9F)
-                        | (0xEE..=0xEF, 0x80..=0xBF) => {}
-                        _ => err!(Some(1)),
-                    }
-                    if next!() & !CONT_MASK != TAG_CONT_U8 {
-                        err!(Some(2))
-                    }
-                }
-                4 => {
-                    match (first, next!()) {
-                        (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
-                        _ => err!(Some(1)),
-                    }
-                    if next!() & !CONT_MASK != TAG_CONT_U8 {
-                        err!(Some(2))
-                    }
-                    if next!() & !CONT_MASK != TAG_CONT_U8 {
-                        err!(Some(3))
-                    }
-                }
-                _ => err!(Some(1)),
-            }
-            index += 1;
-        } else {
-            // Ascii case, try to skip forward quickly.
-            // When the pointer is aligned, read 2 words of data per iteration
-            // until we find a word containing a non-ascii byte.
-            if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 {
-                let ptr = v.as_ptr();
-                while index < blocks_end {
-                    // SAFETY: since `align - index` and `ascii_block_size` are
-                    // multiples of `usize_bytes`, `block = ptr.add(index)` is
-                    // always aligned with a `usize` so it's safe to dereference
-                    // both `block` and `block.offset(1)`.
-                    unsafe {
-                        let block = ptr.add(index) as *const usize;
-                        // break if there is a nonascii byte
-                        let zu = contains_nonascii(*block);
-                        let zv = contains_nonascii(*block.offset(1));
-                        if zu | zv {
-                            break;
-                        }
-                    }
-                    index += ascii_block_size;
-                }
-                // step from the point where the wordwise loop stopped
-                while index < len && v[index] < 128 {
-                    index += 1;
-                }
-            } else {
-                index += 1;
-            }
-        }
-    }
-
-    Ok(())
-}
-
-// https://tools.ietf.org/html/rfc3629
-static UTF8_CHAR_WIDTH: [u8; 256] = [
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, // 0x1F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, // 0x3F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, // 0x5F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, // 0x7F
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, // 0x9F
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, // 0xBF
-    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, // 0xDF
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xEF
-    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xFF
-];
-
-/// Given a first byte, determines how many bytes are in this UTF-8 character.
-#[unstable(feature = "str_internals", issue = "none")]
-#[inline]
-pub fn utf8_char_width(b: u8) -> usize {
-    UTF8_CHAR_WIDTH[b as usize] as usize
-}
-
-/// Mask of the value bits of a continuation byte.
-const CONT_MASK: u8 = 0b0011_1111;
-/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
-const TAG_CONT_U8: u8 = 0b1000_0000;
-
-// truncate `&str` to length at most equal to `max`
-// return `true` if it were truncated, and the new str.
-fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) {
-    if max >= s.len() {
-        (false, s)
-    } else {
-        while !s.is_char_boundary(max) {
-            max -= 1;
-        }
-        (true, &s[..max])
-    }
-}
-
 #[inline(never)]
 #[cold]
 #[track_caller]
diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs