From d2204cec36a1120f977fa11ed162768deb639a5b Mon Sep 17 00:00:00 2001 From: lincot Date: Mon, 6 May 2024 16:50:17 +0300 Subject: [PATCH] speed up `String::push` and `String::insert` --- library/alloc/src/lib.rs | 1 + library/alloc/src/string.rs | 63 ++++++++++++++++------ library/core/src/char/methods.rs | 90 ++++++++++++++++++++++---------- library/core/src/char/mod.rs | 2 +- 4 files changed, 110 insertions(+), 46 deletions(-) diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index 0bb7c432cc35f..2eb50c3c74cf7 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -105,6 +105,7 @@ #![feature(box_uninit_write)] #![feature(bstr)] #![feature(bstr_internals)] +#![feature(char_internals)] #![feature(clone_to_uninit)] #![feature(coerce_unsized)] #![feature(const_eval_select)] diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 0c9535dfaa628..2f7e6ea322ea5 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -1417,9 +1417,14 @@ impl String { #[inline] #[stable(feature = "rust1", since = "1.0.0")] pub fn push(&mut self, ch: char) { - match ch.len_utf8() { - 1 => self.vec.push(ch as u8), - _ => self.vec.extend_from_slice(ch.encode_utf8(&mut [0; 4]).as_bytes()), + let len = self.len(); + let ch_len = ch.len_utf8(); + self.reserve(ch_len); + + // SAFETY: Just reserved capacity for at least the length needed to encode `ch`. + unsafe { + core::char::encode_utf8_raw_unchecked(ch as u32, self.vec.as_mut_ptr().add(self.len())); + self.vec.set_len(len + ch_len); } } @@ -1716,24 +1721,31 @@ impl String { #[rustc_confusables("set")] pub fn insert(&mut self, idx: usize, ch: char) { assert!(self.is_char_boundary(idx)); - let mut bits = [0; 4]; - let bits = ch.encode_utf8(&mut bits).as_bytes(); + let len = self.len(); + let ch_len = ch.len_utf8(); + self.reserve(ch_len); + + // SAFETY: Move the bytes starting from `idx` to their new location `ch_len` + // bytes ahead. This is safe because sufficient capacity was reserved, and `idx` + // is a char boundary. unsafe { - self.insert_bytes(idx, bits); + ptr::copy( + self.vec.as_ptr().add(idx), + self.vec.as_mut_ptr().add(idx + ch_len), + len - idx, + ); } - } - #[cfg(not(no_global_oom_handling))] - unsafe fn insert_bytes(&mut self, idx: usize, bytes: &[u8]) { - let len = self.len(); - let amt = bytes.len(); - self.vec.reserve(amt); + // SAFETY: Encode the character into the vacated region if `idx != len`, + // or into the uninitialized spare capacity otherwise. + unsafe { + core::char::encode_utf8_raw_unchecked(ch as u32, self.vec.as_mut_ptr().add(idx)); + } + // SAFETY: Update the length to include the newly added bytes. unsafe { - ptr::copy(self.vec.as_ptr().add(idx), self.vec.as_mut_ptr().add(idx + amt), len - idx); - ptr::copy_nonoverlapping(bytes.as_ptr(), self.vec.as_mut_ptr().add(idx), amt); - self.vec.set_len(len + amt); + self.vec.set_len(len + ch_len); } } @@ -1763,8 +1775,27 @@ impl String { pub fn insert_str(&mut self, idx: usize, string: &str) { assert!(self.is_char_boundary(idx)); + let len = self.len(); + let amt = string.len(); + self.reserve(amt); + + // SAFETY: Move the bytes starting from `idx` to their new location `amt` bytes + // ahead. This is safe because sufficient capacity was just reserved, and `idx` + // is a char boundary. unsafe { - self.insert_bytes(idx, string.as_bytes()); + ptr::copy(self.vec.as_ptr().add(idx), self.vec.as_mut_ptr().add(idx + amt), len - idx); + } + + // SAFETY: Copy the new string slice into the vacated region if `idx != len`, + // or into the uninitialized spare capacity otherwise. The borrow checker + // ensures that the source and destination do not overlap. + unsafe { + ptr::copy_nonoverlapping(string.as_ptr(), self.vec.as_mut_ptr().add(idx), amt); + } + + // SAFETY: Update the length to include the newly added bytes. + unsafe { + self.vec.set_len(len + amt); } } diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index fb8a740aced13..37a29fd163e8d 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -1795,39 +1795,71 @@ const fn len_utf16(code: u32) -> usize { #[inline] pub const fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] { let len = len_utf8(code); - match (len, &mut *dst) { - (1, [a, ..]) => { - *a = code as u8; - } - (2, [a, b, ..]) => { - *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; - *b = (code & 0x3F) as u8 | TAG_CONT; - } - (3, [a, b, c, ..]) => { - *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; - *b = (code >> 6 & 0x3F) as u8 | TAG_CONT; - *c = (code & 0x3F) as u8 | TAG_CONT; - } - (4, [a, b, c, d, ..]) => { - *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; - *b = (code >> 12 & 0x3F) as u8 | TAG_CONT; - *c = (code >> 6 & 0x3F) as u8 | TAG_CONT; - *d = (code & 0x3F) as u8 | TAG_CONT; - } - _ => { - const_panic!( - "encode_utf8: buffer does not have enough bytes to encode code point", - "encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}", - code: u32 = code, - len: usize = len, - dst_len: usize = dst.len(), - ) - } - }; + if dst.len() < len { + const_panic!( + "encode_utf8: buffer does not have enough bytes to encode code point", + "encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}", + code: u32 = code, + len: usize = len, + dst_len: usize = dst.len(), + ); + } + + // SAFETY: `dst` is checked to be at least the length needed to encode the codepoint. + unsafe { encode_utf8_raw_unchecked(code, dst.as_mut_ptr()) }; + // SAFETY: `<&mut [u8]>::as_mut_ptr` is guaranteed to return a valid pointer and `len` has been tested to be within bounds. unsafe { slice::from_raw_parts_mut(dst.as_mut_ptr(), len) } } +/// Encodes a raw u32 value as UTF-8 to the provided destination buffer. +/// +/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range. +/// (Creating a `char` in the surrogate range is UB.) +/// The result is valid [generalized UTF-8] but not valid UTF-8. +/// +/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8 +/// +/// # Safety +/// +/// The behavior is undefined if the buffer pointed to by `dst` is not +/// large enough to hold the encoded codepoint. A buffer of length four +/// is large enough to encode any `char`. +/// +/// For a safe version of this function, see the [`encode_utf8_raw`] function. +#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] +#[doc(hidden)] +#[inline] +pub const unsafe fn encode_utf8_raw_unchecked(code: u32, dst: *mut u8) { + let len = len_utf8(code); + // SAFETY: The caller must guarantee that the buffer pointed to by `dst` + // is at least `len` bytes long. + unsafe { + match len { + 1 => { + *dst = code as u8; + } + 2 => { + *dst = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; + *dst.add(1) = (code & 0x3F) as u8 | TAG_CONT; + } + 3 => { + *dst = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; + *dst.add(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *dst.add(2) = (code & 0x3F) as u8 | TAG_CONT; + } + 4 => { + *dst = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; + *dst.add(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT; + *dst.add(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *dst.add(3) = (code & 0x3F) as u8 | TAG_CONT; + } + // SAFETY: `char` always takes between 1 and 4 bytes to encode in UTF-8. + _ => crate::hint::unreachable_unchecked(), + } + } +} + /// Encodes a raw `u32` value as UTF-16 into the provided `u16` buffer, /// and then returns the subslice of the buffer that contains the encoded character. /// diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs index 59fd7250e8f8e..a0bd991d3e7c4 100644 --- a/library/core/src/char/mod.rs +++ b/library/core/src/char/mod.rs @@ -38,7 +38,7 @@ pub use self::decode::{DecodeUtf16, DecodeUtf16Error}; #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] pub use self::methods::encode_utf16_raw; // perma-unstable #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] -pub use self::methods::encode_utf8_raw; // perma-unstable +pub use self::methods::{encode_utf8_raw, encode_utf8_raw_unchecked}; // perma-unstable #[rustfmt::skip] use crate::ascii;