Skip to content

Commit c0e0d6d

Browse files
committed
speed up String::push and String::insert
1 parent 1f67a7a commit c0e0d6d

File tree

4 files changed

+90
-38
lines changed

4 files changed

+90
-38
lines changed

library/alloc/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@
105105
#![feature(async_fn_traits)]
106106
#![feature(async_iterator)]
107107
#![feature(box_uninit_write)]
108+
#![feature(char_internals)]
108109
#![feature(clone_to_uninit)]
109110
#![feature(coerce_unsized)]
110111
#![feature(const_align_of_val)]

library/alloc/src/string.rs

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1417,9 +1417,14 @@ impl String {
14171417
#[inline]
14181418
#[stable(feature = "rust1", since = "1.0.0")]
14191419
pub fn push(&mut self, ch: char) {
1420-
match ch.len_utf8() {
1421-
1 => self.vec.push(ch as u8),
1422-
_ => self.vec.extend_from_slice(ch.encode_utf8(&mut [0; 4]).as_bytes()),
1420+
let len = self.len();
1421+
let ch_len = ch.len_utf8();
1422+
self.reserve(ch_len);
1423+
1424+
// SAFETY: Just reserved capacity for at least the length needed to encode `ch`.
1425+
unsafe {
1426+
core::char::encode_utf8_raw_unchecked(ch as u32, self.as_mut_ptr().add(self.len()));
1427+
self.vec.set_len(len + ch_len);
14231428
}
14241429
}
14251430

@@ -1716,24 +1721,30 @@ impl String {
17161721
#[rustc_confusables("set")]
17171722
pub fn insert(&mut self, idx: usize, ch: char) {
17181723
assert!(self.is_char_boundary(idx));
1719-
let mut bits = [0; 4];
1720-
let bits = ch.encode_utf8(&mut bits).as_bytes();
17211724

1725+
let len = self.len();
1726+
let ch_len = ch.len_utf8();
1727+
self.reserve(ch_len);
1728+
1729+
// SAFETY: Shift data `ch_len` bytes to the right,
1730+
// capacity was just reserved for at least that many bytes.
17221731
unsafe {
1723-
self.insert_bytes(idx, bits);
1732+
ptr::copy(
1733+
self.vec.as_ptr().add(idx),
1734+
self.vec.as_mut_ptr().add(idx + ch_len),
1735+
len - idx,
1736+
);
17241737
}
1725-
}
17261738

1727-
#[cfg(not(no_global_oom_handling))]
1728-
unsafe fn insert_bytes(&mut self, idx: usize, bytes: &[u8]) {
1729-
let len = self.len();
1730-
let amt = bytes.len();
1731-
self.vec.reserve(amt);
1739+
// SAFETY: Encode the character into the space left after the shift
1740+
// if `idx != len`, or into the uninitialized spare capacity otherwise.
1741+
unsafe {
1742+
core::char::encode_utf8_raw_unchecked(ch as u32, self.vec.as_mut_ptr().add(idx));
1743+
}
17321744

1745+
// SAFETY: `ch_len` initialized bytes have been added.
17331746
unsafe {
1734-
ptr::copy(self.vec.as_ptr().add(idx), self.vec.as_mut_ptr().add(idx + amt), len - idx);
1735-
ptr::copy_nonoverlapping(bytes.as_ptr(), self.vec.as_mut_ptr().add(idx), amt);
1736-
self.vec.set_len(len + amt);
1747+
self.vec.set_len(len + ch_len);
17371748
}
17381749
}
17391750

@@ -1763,8 +1774,14 @@ impl String {
17631774
pub fn insert_str(&mut self, idx: usize, string: &str) {
17641775
assert!(self.is_char_boundary(idx));
17651776

1777+
let len = self.len();
1778+
let amt = string.len();
1779+
self.reserve(amt);
1780+
17661781
unsafe {
1767-
self.insert_bytes(idx, string.as_bytes());
1782+
ptr::copy(self.vec.as_ptr().add(idx), self.vec.as_mut_ptr().add(idx + amt), len - idx);
1783+
ptr::copy_nonoverlapping(string.as_ptr(), self.vec.as_mut_ptr().add(idx), amt);
1784+
self.vec.set_len(len + amt);
17681785
}
17691786
}
17701787

library/core/src/char/methods.rs

Lines changed: 55 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1789,32 +1789,66 @@ pub const fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
17891789
);
17901790
}
17911791
let len = len_utf8(code);
1792-
match (len, &mut *dst) {
1793-
(1, [a, ..]) => {
1794-
*a = code as u8;
1795-
}
1796-
(2, [a, b, ..]) => {
1797-
*a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
1798-
*b = (code & 0x3F) as u8 | TAG_CONT;
1799-
}
1800-
(3, [a, b, c, ..]) => {
1801-
*a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
1802-
*b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
1803-
*c = (code & 0x3F) as u8 | TAG_CONT;
1804-
}
1805-
(4, [a, b, c, d, ..]) => {
1806-
*a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
1807-
*b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
1808-
*c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
1809-
*d = (code & 0x3F) as u8 | TAG_CONT;
1810-
}
1792+
if dst.len() < len {
18111793
// FIXME(const-hack): We would prefer to have streamlined panics when formatters become const-friendly.
1812-
_ => const_eval_select((code, len, dst.len()), panic_at_const, panic_at_rt),
1813-
};
1794+
const_eval_select((code, len, dst.len()), panic_at_const, panic_at_rt);
1795+
}
1796+
1797+
// SAFETY: `dst` is checked to be at least the length needed to encode the codepoint.
1798+
unsafe { encode_utf8_raw_unchecked(code, dst.as_mut_ptr()) };
1799+
18141800
// SAFETY: `<&mut [u8]>::as_mut_ptr` is guaranteed to return a valid pointer and `len` has been tested to be within bounds.
18151801
unsafe { slice::from_raw_parts_mut(dst.as_mut_ptr(), len) }
18161802
}
18171803

1804+
/// Encodes a raw u32 value as UTF-8 to the provided destination buffer.
1805+
///
1806+
/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
1807+
/// (Creating a `char` in the surrogate range is UB.)
1808+
/// The result is valid [generalized UTF-8] but not valid UTF-8.
1809+
///
1810+
/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
1811+
///
1812+
/// # Safety
1813+
///
1814+
/// The behavior is undefined if the buffer pointed to by `dst` is not
1815+
/// large enough to hold the encoded codepoint. A buffer of length four
1816+
/// is large enough to encode any `char`.
1817+
///
1818+
/// For a safe version of this function, see the [`encode_utf8_raw`] function.
1819+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
1820+
#[doc(hidden)]
1821+
#[inline]
1822+
pub const unsafe fn encode_utf8_raw_unchecked(code: u32, dst: *mut u8) {
1823+
let len = len_utf8(code);
1824+
// SAFETY: The caller must guarantee that the buffer pointed to by `dst`
1825+
// is at least `len` bytes long.
1826+
unsafe {
1827+
match len {
1828+
1 => {
1829+
*dst = code as u8;
1830+
}
1831+
2 => {
1832+
*dst = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
1833+
*dst.add(1) = (code & 0x3F) as u8 | TAG_CONT;
1834+
}
1835+
3 => {
1836+
*dst = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
1837+
*dst.add(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
1838+
*dst.add(2) = (code & 0x3F) as u8 | TAG_CONT;
1839+
}
1840+
4 => {
1841+
*dst = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
1842+
*dst.add(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT;
1843+
*dst.add(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
1844+
*dst.add(3) = (code & 0x3F) as u8 | TAG_CONT;
1845+
}
1846+
// SAFETY: `char` always takes between 1 and 4 bytes to encode in UTF-8.
1847+
_ => crate::hint::unreachable_unchecked(),
1848+
}
1849+
}
1850+
}
1851+
18181852
/// Encodes a raw `u32` value as UTF-16 into the provided `u16` buffer,
18191853
/// and then returns the subslice of the buffer that contains the encoded character.
18201854
///

library/core/src/char/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ pub use self::decode::{DecodeUtf16, DecodeUtf16Error};
3838
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
3939
pub use self::methods::encode_utf16_raw; // perma-unstable
4040
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
41-
pub use self::methods::encode_utf8_raw; // perma-unstable
41+
pub use self::methods::{encode_utf8_raw, encode_utf8_raw_unchecked}; // perma-unstable
4242

4343
#[rustfmt::skip]
4444
use crate::ascii;

0 commit comments

Comments
 (0)