diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs index edc8d99f2f990..7826151ab6bda 100644 --- a/library/alloc/src/slice.rs +++ b/library/alloc/src/slice.rs @@ -81,6 +81,11 @@ use crate::borrow::ToOwned; use crate::boxed::Box; use crate::vec::Vec; +// Using a module here allows having the no_global_oom_handling +// in only one place +#[cfg(not(no_global_oom_handling))] +mod byte_slice_make_case; + // HACK(japaric): With cfg(test) `impl [T]` is not available, these three // functions are actually methods that are in `impl [T]` but not in // `core::slice::SliceExt` - we need to supply these functions for the diff --git a/library/alloc/src/slice/byte_slice_make_case.rs b/library/alloc/src/slice/byte_slice_make_case.rs new file mode 100644 index 0000000000000..09bb9842c48b5 --- /dev/null +++ b/library/alloc/src/slice/byte_slice_make_case.rs @@ -0,0 +1,162 @@ +use crate::collections::VecDeque; + +#[cfg(not(test))] +impl [u8] { + #[rustc_allow_incoherent_impl] + #[unstable(issue = "none", feature = "std_internals")] + #[allow(dead_code)] + /// Safety: + /// - Must be valid UTF-8 + pub unsafe fn make_utf8_uppercase(&mut self) -> Result<usize, VecDeque<u8>> { + let mut queue = VecDeque::new(); + + let mut read_offset = 0; + let mut write_offset = 0; + + while let Some((codepoint, width)) = + unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) } + { + read_offset += width; + // Queue must be flushed before encode_to_slice_or_else_to_queue is + // called to ensure proper order of bytes + dump_queue(&mut queue, &mut self[..read_offset], &mut write_offset); + let lowercase_char = unsafe { char::from_u32_unchecked(codepoint) }; + for c in lowercase_char.to_uppercase() { + encode_to_slice_or_else_to_queue( + c, + &mut queue, + &mut self[..read_offset], + &mut write_offset, + ); + } + } + assert_eq!(read_offset, self.len()); + if write_offset < read_offset { Ok(write_offset) } else { Err(queue) } + } + + 
#[rustc_allow_incoherent_impl] + #[unstable(issue = "none", feature = "std_internals")] + #[allow(dead_code)] + /// Safety: + /// - Must be valid UTF-8 + pub unsafe fn make_utf8_lowercase(&mut self) -> Result<usize, VecDeque<u8>> { + let mut queue = VecDeque::new(); + + let mut read_offset = 0; + let mut write_offset = 0; + + let mut final_sigma_automata = FinalSigmaAutomata::new(); + while let Some((codepoint, width)) = + unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) } + { + read_offset += width; + // Queue must be flushed before encode_to_slice_or_else_to_queue is + // called to ensure proper order of bytes + dump_queue(&mut queue, &mut self[..read_offset], &mut write_offset); + let uppercase_char = unsafe { char::from_u32_unchecked(codepoint) }; + if uppercase_char == 'Σ' { + // Σ maps to σ, except at the end of a word where it maps to ς. + // See core::str::to_lowercase + let rest = unsafe { core::str::from_utf8_unchecked(&self[read_offset..]) }; + let is_word_final = + final_sigma_automata.is_accepting() && !case_ignorable_then_cased(rest.chars()); + let sigma_lowercase = if is_word_final { 'ς' } else { 'σ' }; + encode_to_slice_or_else_to_queue( + sigma_lowercase, + &mut queue, + &mut self[..read_offset], + &mut write_offset, + ); + } else { + for c in uppercase_char.to_lowercase() { + encode_to_slice_or_else_to_queue( + c, + &mut queue, + &mut self[..read_offset], + &mut write_offset, + ); + } + } + final_sigma_automata.step(uppercase_char); + } + assert_eq!(read_offset, self.len()); + return if write_offset < read_offset { Ok(write_offset) } else { Err(queue) }; + + // For now this is copy pasted from core::str, FIXME: DRY + fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool { + use core::unicode::{Case_Ignorable, Cased}; + match iter.skip_while(|&c| Case_Ignorable(c)).next() { + Some(c) => Cased(c), + None => false, + } + } + } +} + +fn encode_to_slice_or_else_to_queue( + c: char, + queue: &mut 
VecDeque<u8>, + slice: &mut [u8], + write_offset: &mut usize, +) { + let mut buffer = [0; 4]; + let len = c.encode_utf8(&mut buffer).len(); + let writable_slice = &mut slice[*write_offset..]; + let direct_copy_length = core::cmp::min(len, writable_slice.len()); + writable_slice[..direct_copy_length].copy_from_slice(&buffer[..direct_copy_length]); + *write_offset += direct_copy_length; + queue.extend(&buffer[direct_copy_length..len]); +} + +fn dump_queue(queue: &mut VecDeque<u8>, slice: &mut [u8], write_offset: &mut usize) { + while *write_offset < slice.len() { + match queue.pop_front() { + Some(b) => { + slice[*write_offset] = b; + *write_offset += 1; + } + None => break, + } + } +} + +#[derive(Clone)] +enum FinalSigmaAutomata { + Init, + Accepted, +} + +impl FinalSigmaAutomata { + fn new() -> Self { + Self::Init + } + + fn is_accepting(&self) -> bool { + match self { + FinalSigmaAutomata::Accepted => true, + FinalSigmaAutomata::Init => false, + } + } + + fn step(&mut self, c: char) { + use core::unicode::{Case_Ignorable, Cased}; + + use FinalSigmaAutomata::*; + *self = match self { + Init => { + if Cased(c) { + Accepted + } else { + Init + } + } + Accepted => { + if Cased(c) || Case_Ignorable(c) { + Accepted + } else { + Init + } + } + } + } +} diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 0c9535dfaa628..726e3d87d9f09 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -1127,6 +1127,32 @@ impl String { self.vec.extend_from_slice(string.as_bytes()) } + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "string_make_uplowercase", issue = "135885")] + #[allow(missing_docs)] + pub fn make_uppercase(&mut self) { + let mut v = core::mem::take(self).vec; + let res = unsafe { v.make_utf8_uppercase() }; + match res { + Ok(n) => v.truncate(n), + Err(queue) => v.extend(queue), + } + *self = unsafe { Self::from_utf8_unchecked(v) } + } + + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = 
"string_make_uplowercase", issue = "135885")] + #[allow(missing_docs)] + pub fn make_lowercase(&mut self) { + let mut v = core::mem::take(self).vec; + let res = unsafe { v.make_utf8_lowercase() }; + match res { + Ok(n) => v.truncate(n), + Err(queue) => v.extend(queue), + } + *self = unsafe { Self::from_utf8_unchecked(v) } + } + /// Copies elements from `src` range to the end of the string. /// /// # Panics diff --git a/library/alloc/tests/lib.rs b/library/alloc/tests/lib.rs index 393bdfe48b741..aa66111b86328 100644 --- a/library/alloc/tests/lib.rs +++ b/library/alloc/tests/lib.rs @@ -43,6 +43,7 @@ #![allow(internal_features)] #![deny(fuzzy_provenance_casts)] #![deny(unsafe_op_in_unsafe_fn)] +#![feature(string_make_uplowercase)] extern crate test; diff --git a/library/alloc/tests/string.rs b/library/alloc/tests/string.rs index 1c8bff1564db2..b8a89e1a504b7 100644 --- a/library/alloc/tests/string.rs +++ b/library/alloc/tests/string.rs @@ -903,3 +903,83 @@ fn test_str_concat() { let s: String = format!("{a}{b}"); assert_eq!(s.as_bytes()[9], 'd' as u8); } + +#[test] +fn make_uppercase() { + fn test(s: &str) { + let ground_truth = s.to_uppercase(); + let mut tested = s.to_owned(); + tested.make_uppercase(); + assert!( + tested == ground_truth, + r#"When uppercased "{s}" gave "{tested}" while "{ground_truth}" was expected"# + ); + } + test(""); + test("abcde"); + // 4 to 9 bytes + test("ǰΐ"); + // 10*3 to 10*2 bytes + test("ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ"); + test("aéDžßfiᾀ"); +} + +#[test] +fn make_lowercase() { + fn test(s: &str) { + let ground_truth = s.to_lowercase(); + let mut tested = s.to_owned(); + tested.make_lowercase(); + assert!( + tested == ground_truth, + r#"When lowercased "{s}" gave "{tested}" while "{ground_truth}" was expected"# + ); + } + test(""); + test("AÉDžaé "); + + // https://github.com/rust-lang/rust/issues/26035 + test("ΑΣ"); + test("Α'Σ"); + test("Α''Σ"); + + test("ΑΣ Α"); + test("Α'Σ Α"); + test("Α''Σ Α"); + + test("ΑΣ' Α"); + test("ΑΣ'' Α"); + + test("Α'Σ' Α"); 
+ test("Α''Σ'' Α"); + + test("Α Σ"); + test("Α 'Σ"); + test("Α ''Σ"); + + test("Σ"); + test("'Σ"); + test("''Σ"); + + test("ΑΣΑ"); + test("ΑΣ'Α"); + test("ΑΣ''Α"); + + // https://github.com/rust-lang/rust/issues/124714 + // input lengths around the boundary of the chunk size used by the ascii prefix optimization + test("abcdefghijklmnoΣ"); + test("abcdefghijklmnopΣ"); + test("abcdefghijklmnopqΣ"); + + // a really long string that has its lowercase form + // even longer. this tests that implementations don't assume + // an incorrect upper bound on allocations + let upper = str::repeat("İ", 512); + test(&upper); + + // a really long ascii-only string. + // This tests that the ascii hot-path + // functions correctly + let upper = str::repeat("A", 511); + test(&upper); +} diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 8a473b398bb5f..74d4109977162 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -57,7 +57,7 @@ pub use lossy::{Utf8Chunk, Utf8Chunks}; #[stable(feature = "rust1", since = "1.0.0")] pub use traits::FromStr; #[unstable(feature = "str_internals", issue = "none")] -pub use validations::{next_code_point, utf8_char_width}; +pub use validations::{next_code_point, next_code_point_with_width, utf8_char_width}; #[inline(never)] #[cold] diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index 0f724dd961329..9dea1b7b0de86 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -26,18 +26,21 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool { } /// Reads the next code point out of a byte iterator (assuming a -/// UTF-8-like encoding). +/// UTF-8-like encoding) and returns it along with its width. 
/// /// # Safety /// /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string #[unstable(feature = "str_internals", issue = "none")] #[inline] -pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> { +#[allow(dead_code)] +pub unsafe fn next_code_point_with_width<'a, I: Iterator<Item = &'a u8>>( + bytes: &mut I, +) -> Option<(u32, usize)> { // Decode UTF-8 let x = *bytes.next()?; if x < 128 { - return Some(x as u32); + return Some((x as u32, 1)); } // Multibyte case follows @@ -47,6 +50,7 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. let y = unsafe { *bytes.next().unwrap_unchecked() }; + let mut width = 2; let mut ch = utf8_acc_cont_byte(init, y); if x >= 0xE0 { // [[x y z] w] case @@ -54,6 +58,7 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. let z = unsafe { *bytes.next().unwrap_unchecked() }; + width = 3; let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); ch = init << 12 | y_z; if x >= 0xF0 { @@ -62,11 +67,25 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. let w = unsafe { *bytes.next().unwrap_unchecked() }; + width = 4; ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); } } - Some(ch) + Some((ch, width)) +} + +/// Reads the next code point out of a byte iterator (assuming a +/// UTF-8-like encoding). 
+/// +/// # Safety +/// +/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string +#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> { + // SAFETY: same call condition + Some(unsafe { next_code_point_with_width(bytes) }?.0) } /// Reads the last code point out of a byte iterator (assuming a