Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

speed up String::push and String::insert #124810

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions library/alloc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
#![feature(box_uninit_write)]
#![feature(bstr)]
#![feature(bstr_internals)]
#![feature(char_internals)]
#![feature(char_max_len)]
#![feature(clone_to_uninit)]
#![feature(coerce_unsized)]
Expand Down
65 changes: 47 additions & 18 deletions library/alloc/src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1417,11 +1417,14 @@ impl String {
#[inline]
#[stable(feature = "rust1", since = "1.0.0")]
pub fn push(&mut self, ch: char) {
match ch.len_utf8() {
1 => self.vec.push(ch as u8),
_ => {
self.vec.extend_from_slice(ch.encode_utf8(&mut [0; char::MAX_LEN_UTF8]).as_bytes())
}
let len = self.len();
let ch_len = ch.len_utf8();
self.reserve(ch_len);

// SAFETY: Just reserved capacity for at least the length needed to encode `ch`.
unsafe {
core::char::encode_utf8_raw_unchecked(ch as u32, self.vec.as_mut_ptr().add(self.len()));
self.vec.set_len(len + ch_len);
}
}

Expand Down Expand Up @@ -1718,24 +1721,31 @@ impl String {
#[rustc_confusables("set")]
pub fn insert(&mut self, idx: usize, ch: char) {
assert!(self.is_char_boundary(idx));
let mut bits = [0; char::MAX_LEN_UTF8];
let bits = ch.encode_utf8(&mut bits).as_bytes();

let len = self.len();
let ch_len = ch.len_utf8();
self.reserve(ch_len);

// SAFETY: Move the bytes starting from `idx` to their new location `ch_len`
// bytes ahead. This is safe because sufficient capacity was reserved, and `idx`
// is a char boundary.
unsafe {
self.insert_bytes(idx, bits);
ptr::copy(
self.vec.as_ptr().add(idx),
self.vec.as_mut_ptr().add(idx + ch_len),
len - idx,
);
}
}

#[cfg(not(no_global_oom_handling))]
unsafe fn insert_bytes(&mut self, idx: usize, bytes: &[u8]) {
let len = self.len();
let amt = bytes.len();
self.vec.reserve(amt);
// SAFETY: Encode the character into the vacated region if `idx != len`,
// or into the uninitialized spare capacity otherwise.
unsafe {
core::char::encode_utf8_raw_unchecked(ch as u32, self.vec.as_mut_ptr().add(idx));
}

// SAFETY: Update the length to include the newly added bytes.
unsafe {
ptr::copy(self.vec.as_ptr().add(idx), self.vec.as_mut_ptr().add(idx + amt), len - idx);
ptr::copy_nonoverlapping(bytes.as_ptr(), self.vec.as_mut_ptr().add(idx), amt);
self.vec.set_len(len + amt);
self.vec.set_len(len + ch_len);
}
}

Expand Down Expand Up @@ -1765,8 +1775,27 @@ impl String {
pub fn insert_str(&mut self, idx: usize, string: &str) {
assert!(self.is_char_boundary(idx));

let len = self.len();
let amt = string.len();
self.reserve(amt);

// SAFETY: Move the bytes starting from `idx` to their new location `amt` bytes
// ahead. This is safe because sufficient capacity was just reserved, and `idx`
// is a char boundary.
unsafe {
ptr::copy(self.vec.as_ptr().add(idx), self.vec.as_mut_ptr().add(idx + amt), len - idx);
}

// SAFETY: Copy the new string slice into the vacated region if `idx != len`,
// or into the uninitialized spare capacity otherwise. The borrow checker
// ensures that the source and destination do not overlap.
unsafe {
ptr::copy_nonoverlapping(string.as_ptr(), self.vec.as_mut_ptr().add(idx), amt);
}

// SAFETY: Update the length to include the newly added bytes.
unsafe {
self.insert_bytes(idx, string.as_bytes());
self.vec.set_len(len + amt);
}
}

Expand Down
90 changes: 61 additions & 29 deletions library/core/src/char/methods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1806,39 +1806,71 @@ const fn len_utf16(code: u32) -> usize {
#[inline]
pub const fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
let len = len_utf8(code);
match (len, &mut *dst) {
(1, [a, ..]) => {
*a = code as u8;
}
(2, [a, b, ..]) => {
*a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
*b = (code & 0x3F) as u8 | TAG_CONT;
}
(3, [a, b, c, ..]) => {
*a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
*b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
*c = (code & 0x3F) as u8 | TAG_CONT;
}
(4, [a, b, c, d, ..]) => {
*a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
*b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
*c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
*d = (code & 0x3F) as u8 | TAG_CONT;
}
_ => {
const_panic!(
"encode_utf8: buffer does not have enough bytes to encode code point",
"encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}",
code: u32 = code,
len: usize = len,
dst_len: usize = dst.len(),
)
}
};
if dst.len() < len {
const_panic!(
"encode_utf8: buffer does not have enough bytes to encode code point",
"encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}",
code: u32 = code,
len: usize = len,
dst_len: usize = dst.len(),
);
}

// SAFETY: `dst` is checked to be at least the length needed to encode the codepoint.
unsafe { encode_utf8_raw_unchecked(code, dst.as_mut_ptr()) };

// SAFETY: `<&mut [u8]>::as_mut_ptr` is guaranteed to return a valid pointer and `len` has been tested to be within bounds.
unsafe { slice::from_raw_parts_mut(dst.as_mut_ptr(), len) }
}

/// Encodes a raw `u32` value as UTF-8 into the byte buffer pointed to by `dst`.
///
/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
/// (Creating a `char` in the surrogate range is UB.)
/// The result is valid [generalized UTF-8] but not valid UTF-8.
///
/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
///
/// # Safety
///
/// The behavior is undefined if the buffer pointed to by `dst` is not
/// large enough to hold the encoded codepoint. A buffer of length four
/// is large enough to encode any `char`.
///
/// For a safe version of this function, see the [`encode_utf8_raw`] function.
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
#[doc(hidden)]
#[inline]
pub const unsafe fn encode_utf8_raw_unchecked(code: u32, dst: *mut u8) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dst could probably still be a &mut [u8] in this signature, call .as_mut_ptr() within this function. Then you can add a debug_assert!(dst.len() >= len).

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe &mut [u8] cannot point to uninitialized memory according to safety guidelines, which is why I used &mut [MaybeUninit<u8>] first. If a slice is necessary, we can revert to using &mut [MaybeUninit<u8>] and .as_mut_ptr().

let len = len_utf8(code);
// SAFETY: The caller must guarantee that the buffer pointed to by `dst`
// is at least `len` bytes long.
unsafe {
match len {
1 => {
*dst = code as u8;
}
2 => {
*dst = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
*dst.add(1) = (code & 0x3F) as u8 | TAG_CONT;
}
3 => {
*dst = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
*dst.add(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
*dst.add(2) = (code & 0x3F) as u8 | TAG_CONT;
}
4 => {
*dst = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
*dst.add(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT;
*dst.add(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
*dst.add(3) = (code & 0x3F) as u8 | TAG_CONT;
}
// SAFETY: `char` always takes between 1 and 4 bytes to encode in UTF-8.
_ => crate::hint::unreachable_unchecked(),
}
}
}

/// Encodes a raw `u32` value as native endian UTF-16 into the provided `u16` buffer,
/// and then returns the subslice of the buffer that contains the encoded character.
///
Expand Down
2 changes: 1 addition & 1 deletion library/core/src/char/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ pub use self::decode::{DecodeUtf16, DecodeUtf16Error};
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
pub use self::methods::encode_utf16_raw; // perma-unstable
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
pub use self::methods::encode_utf8_raw; // perma-unstable
pub use self::methods::{encode_utf8_raw, encode_utf8_raw_unchecked}; // perma-unstable

#[rustfmt::skip]
use crate::ascii;
Expand Down
11 changes: 11 additions & 0 deletions tests/codegen/string-push.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
//! Check that `String::push` is optimized enough not to call `memcpy`.
//@ compile-flags: -O
#![crate_type = "lib"]

// CHECK-LABEL: @string_push_does_not_call_memcpy
#[no_mangle]
pub fn string_push_does_not_call_memcpy(s: &mut String, ch: char) {
// CHECK-NOT: call void @llvm.memcpy
s.push(ch);
}
Loading