From c5b0e91a6d4c305bac75326b7889006d7d5a4c3c Mon Sep 17 00:00:00 2001 From: Arthur Carcano Date: Wed, 22 Jan 2025 17:25:40 +0100 Subject: [PATCH 1/4] First prototype of make_uppercase --- library/alloc/src/slice.rs | 40 +++++++++++++++++++++++++++++ library/alloc/src/string.rs | 12 +++++++++ library/alloc/tests/lib.rs | 1 + library/alloc/tests/string.rs | 20 +++++++++++++++ library/core/src/str/mod.rs | 2 +- library/core/src/str/validations.rs | 27 ++++++++++++++++--- 6 files changed, 97 insertions(+), 5 deletions(-) diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs index edc8d99f2f990..2e1e3f0f989a0 100644 --- a/library/alloc/src/slice.rs +++ b/library/alloc/src/slice.rs @@ -79,6 +79,7 @@ use crate::alloc::Global; #[cfg(not(no_global_oom_handling))] use crate::borrow::ToOwned; use crate::boxed::Box; +use crate::collections::VecDeque; use crate::vec::Vec; // HACK(japaric): With cfg(test) `impl [T]` is not available, these three @@ -665,6 +666,45 @@ impl [T] { } } +#[cfg(not(test))] +impl [u8] { + #[rustc_allow_incoherent_impl] + #[unstable(issue = "none", feature = "std_internals")] + #[allow(dead_code)] + /// Safety: + /// - Must be UTF-8 + pub unsafe fn make_utf8_uppercase(&mut self) -> Result> { + let mut queue = VecDeque::new(); + + let mut read_offset = 0; + let mut write_offset = 0; + + let mut buffer = [0; 4]; + while let Some((codepoint, width)) = + unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) } + { + read_offset += width; + let lowercase_char = unsafe { char::from_u32_unchecked(codepoint) }; + for c in lowercase_char.to_uppercase() { + let l = c.len_utf8(); + c.encode_utf8(&mut buffer); + queue.extend(&buffer[..l]); + } + while write_offset < read_offset { + match queue.pop_front() { + Some(b) => { + self[write_offset] = b; + write_offset += 1; + } + None => break, + } + } + } + assert_eq!(read_offset, self.len()); + if write_offset < read_offset { Ok(write_offset) } else { Err(queue) } + } +} + 
#[cfg(not(test))] impl [u8] { /// Returns a vector containing a copy of this slice where each byte diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 0c9535dfaa628..99026353455b1 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -1127,6 +1127,18 @@ impl String { self.vec.extend_from_slice(string.as_bytes()) } + #[unstable(feature = "string_make_uplowercase", issue = "135885")] + #[allow(missing_docs)] + pub fn make_uppercase(&mut self) { + let mut v = core::mem::take(self).vec; + let res = unsafe { v.make_utf8_uppercase() }; + match res { + Ok(n) => v.truncate(n), + Err(queue) => v.extend(queue), + } + *self = unsafe { Self::from_utf8_unchecked(v) } + } + /// Copies elements from `src` range to the end of the string. /// /// # Panics diff --git a/library/alloc/tests/lib.rs b/library/alloc/tests/lib.rs index 393bdfe48b741..aa66111b86328 100644 --- a/library/alloc/tests/lib.rs +++ b/library/alloc/tests/lib.rs @@ -43,6 +43,7 @@ #![allow(internal_features)] #![deny(fuzzy_provenance_casts)] #![deny(unsafe_op_in_unsafe_fn)] +#![feature(string_make_uplowercase)] extern crate test; diff --git a/library/alloc/tests/string.rs b/library/alloc/tests/string.rs index 1c8bff1564db2..c5c188fbb9240 100644 --- a/library/alloc/tests/string.rs +++ b/library/alloc/tests/string.rs @@ -903,3 +903,23 @@ fn test_str_concat() { let s: String = format!("{a}{b}"); assert_eq!(s.as_bytes()[9], 'd' as u8); } + +#[test] +fn make_uppercase() { + fn test(s: &str) { + let ground_truth = s.to_uppercase(); + let mut tested = s.to_owned(); + tested.make_uppercase(); + assert!( + tested == ground_truth, + r#"When uppercased "{s}" gave "{tested}" while "{ground_truth}" was expected"# + ); + } + test(""); + test("abcde"); + // 4 to 9 bytes + test("ǰΐ"); + // 10*3 to 10*2 bytes + test("ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ"); + test("aéDžßfiᾀ"); +} diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 8a473b398bb5f..74d4109977162 100644 --- 
a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -57,7 +57,7 @@ pub use lossy::{Utf8Chunk, Utf8Chunks}; #[stable(feature = "rust1", since = "1.0.0")] pub use traits::FromStr; #[unstable(feature = "str_internals", issue = "none")] -pub use validations::{next_code_point, utf8_char_width}; +pub use validations::{next_code_point, next_code_point_with_width, utf8_char_width}; #[inline(never)] #[cold] diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index 0f724dd961329..9dea1b7b0de86 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -26,18 +26,21 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool { } /// Reads the next code point out of a byte iterator (assuming a -/// UTF-8-like encoding). +/// UTF-8-like encoding) and returns it along with its width. /// /// # Safety /// /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string #[unstable(feature = "str_internals", issue = "none")] #[inline] -pub unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> Option { +#[allow(dead_code)] +pub unsafe fn next_code_point_with_width<'a, I: Iterator>( + bytes: &mut I, +) -> Option<(u32, usize)> { // Decode UTF-8 let x = *bytes.next()?; if x < 128 { - return Some(x as u32); + return Some((x as u32, 1)); } // Multibyte case follows @@ -47,6 +50,7 @@ pub unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. let y = unsafe { *bytes.next().unwrap_unchecked() }; + let mut width = 2; let mut ch = utf8_acc_cont_byte(init, y); if x >= 0xE0 { // [[x y z] w] case @@ -54,6 +58,7 @@ pub unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. 
let z = unsafe { *bytes.next().unwrap_unchecked() }; + width = 3; let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); ch = init << 12 | y_z; if x >= 0xF0 { @@ -62,11 +67,25 @@ pub unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. let w = unsafe { *bytes.next().unwrap_unchecked() }; + width = 4; ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); } } - Some(ch) + Some((ch, width)) +} + +/// Reads the next code point out of a byte iterator (assuming a +/// UTF-8-like encoding). +/// +/// # Safety +/// +/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string +#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> Option { + // SAFETY: same call condition + Some(unsafe { next_code_point_with_width(bytes) }?.0) } /// Reads the last code point out of a byte iterator (assuming a From fffbb33cad812a9677483ee0c7ae2636e80ea862 Mon Sep 17 00:00:00 2001 From: Arthur Carcano Date: Mon, 17 Feb 2025 19:06:14 +0100 Subject: [PATCH 2/4] First prototype of make_lowercase --- library/alloc/src/slice.rs | 100 ++++++++++++++++++++++++++++++++++ library/alloc/src/string.rs | 12 ++++ library/alloc/tests/string.rs | 60 ++++++++++++++++++++ 3 files changed, 172 insertions(+) diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs index 2e1e3f0f989a0..0c23d7cc0b071 100644 --- a/library/alloc/src/slice.rs +++ b/library/alloc/src/slice.rs @@ -703,6 +703,106 @@ impl [u8] { assert_eq!(read_offset, self.len()); if write_offset < read_offset { Ok(write_offset) } else { Err(queue) } } + + #[rustc_allow_incoherent_impl] + #[unstable(issue = "none", feature = "std_internals")] + #[allow(dead_code)] + /// Safety: + /// - Must be UTF-8 + pub unsafe fn make_utf8_lowercase(&mut self) -> Result> { + let mut queue = VecDeque::new(); + + let mut read_offset = 0; + let mut write_offset = 0; + + 
let mut buffer = [0; 4]; + let mut final_sigma_automata = FinalSigmaAutomata::new(); + while let Some((codepoint, width)) = + unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) } + { + read_offset += width; + let uppercase_char = unsafe { char::from_u32_unchecked(codepoint) }; + if uppercase_char == 'Σ' { + // Σ maps to σ, except at the end of a word where it maps to ς. + // See core::str::to_lowercase + let rest = unsafe { core::str::from_utf8_unchecked(&self[read_offset..]) }; + let is_word_final = + final_sigma_automata.is_accepting() && !case_ignorable_then_cased(rest.chars()); + let sigma_lowercase = if is_word_final { 'ς' } else { 'σ' }; + let l = sigma_lowercase.len_utf8(); + sigma_lowercase.encode_utf8(&mut buffer); + queue.extend(&buffer[..l]); + } else { + for c in uppercase_char.to_lowercase() { + let l = c.len_utf8(); + c.encode_utf8(&mut buffer); + queue.extend(&buffer[..l]); + } + } + final_sigma_automata.step(uppercase_char); + while write_offset < read_offset { + match queue.pop_front() { + Some(b) => { + self[write_offset] = b; + write_offset += 1; + } + None => break, + } + } + } + assert_eq!(read_offset, self.len()); + return if write_offset < read_offset { Ok(write_offset) } else { Err(queue) }; + + // For now this is copy pasted from core::str, FIXME: DRY + fn case_ignorable_then_cased>(iter: I) -> bool { + use core::unicode::{Case_Ignorable, Cased}; + match iter.skip_while(|&c| Case_Ignorable(c)).next() { + Some(c) => Cased(c), + None => false, + } + } + } +} + +#[derive(Clone)] +enum FinalSigmaAutomata { + Init, + Accepted, +} + +impl FinalSigmaAutomata { + fn new() -> Self { + Self::Init + } + + fn is_accepting(&self) -> bool { + match self { + FinalSigmaAutomata::Accepted => true, + FinalSigmaAutomata::Init => false, + } + } + + fn step(&mut self, c: char) { + use core::unicode::{Case_Ignorable, Cased}; + + use FinalSigmaAutomata::*; + *self = match self { + Init => { + if Cased(c) { + Accepted + } else { + Init 
+ } + } + Accepted => { + if Cased(c) || Case_Ignorable(c) { + Accepted + } else { + Init + } + } + } + } } #[cfg(not(test))] diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 99026353455b1..889bfba6e0474 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -1139,6 +1139,18 @@ impl String { *self = unsafe { Self::from_utf8_unchecked(v) } } + #[unstable(feature = "string_make_uplowercase", issue = "135885")] + #[allow(missing_docs)] + pub fn make_lowercase(&mut self) { + let mut v = core::mem::take(self).vec; + let res = unsafe { v.make_utf8_lowercase() }; + match res { + Ok(n) => v.truncate(n), + Err(queue) => v.extend(queue), + } + *self = unsafe { Self::from_utf8_unchecked(v) } + } + /// Copies elements from `src` range to the end of the string. /// /// # Panics diff --git a/library/alloc/tests/string.rs b/library/alloc/tests/string.rs index c5c188fbb9240..b8a89e1a504b7 100644 --- a/library/alloc/tests/string.rs +++ b/library/alloc/tests/string.rs @@ -923,3 +923,63 @@ fn make_uppercase() { test("ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ"); test("aéDžßfiᾀ"); } + +#[test] +fn make_lowercase() { + fn test(s: &str) { + let ground_truth = s.to_lowercase(); + let mut tested = s.to_owned(); + tested.make_lowercase(); + assert!( + tested == ground_truth, + r#"When lowercased "{s}" gave "{tested}" while "{ground_truth}" was expected"# + ); + } + test(""); + test("AÉDžaé "); + + // https://github.com/rust-lang/rust/issues/26035 + test("ΑΣ"); + test("Α'Σ"); + test("Α''Σ"); + + test("ΑΣ Α"); + test("Α'Σ Α"); + test("Α''Σ Α"); + + test("ΑΣ' Α"); + test("ΑΣ'' Α"); + + test("Α'Σ' Α"); + test("Α''Σ'' Α"); + + test("Α Σ"); + test("Α 'Σ"); + test("Α ''Σ"); + + test("Σ"); + test("'Σ"); + test("''Σ"); + + test("ΑΣΑ"); + test("ΑΣ'Α"); + test("ΑΣ''Α"); + + // https://github.com/rust-lang/rust/issues/124714 + // input lengths around the boundary of the chunk size used by the ascii prefix optimization + test("abcdefghijklmnoΣ"); + test("abcdefghijklmnopΣ"); + 
test("abcdefghijklmnopqΣ"); + + // a really long string that has its lowercase form + // even longer. this tests that implementations don't assume + // an incorrect upper bound on allocations + let upper = str::repeat("İ", 512); + test(&upper); + + // a really long ascii-only string. + // This tests that the ascii hot-path + // functions correctly + let upper = str::repeat("A", 511); + test(&upper); +} From a00b4efb2cbeade9fb06e5be90ed726326a92b65 Mon Sep 17 00:00:00 2001 From: Arthur Carcano Date: Tue, 18 Feb 2025 11:07:25 +0100 Subject: [PATCH 3/4] Bypass queue when possible in slice::make_*case --- library/alloc/src/slice.rs | 84 ++++++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 31 deletions(-) diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs index 0c23d7cc0b071..40287e9fa1040 100644 --- a/library/alloc/src/slice.rs +++ b/library/alloc/src/slice.rs @@ -672,32 +672,28 @@ impl [u8] { #[unstable(issue = "none", feature = "std_internals")] #[allow(dead_code)] /// Safety: - /// - Must be UTF-8 + /// - Must be valid UTF-8 pub unsafe fn make_utf8_uppercase(&mut self) -> Result> { let mut queue = VecDeque::new(); let mut read_offset = 0; let mut write_offset = 0; - let mut buffer = [0; 4]; while let Some((codepoint, width)) = unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) } { read_offset += width; + // Queue must be flushed before encode_to_slice_or_else_to_queue is + // called to ensure proper order of bytes + dump_queue(&mut queue, &mut self[..read_offset], &mut write_offset); let lowercase_char = unsafe { char::from_u32_unchecked(codepoint) }; for c in lowercase_char.to_uppercase() { - let l = c.len_utf8(); - c.encode_utf8(&mut buffer); - queue.extend(&buffer[..l]); - } - while write_offset < read_offset { - match queue.pop_front() { - Some(b) => { - self[write_offset] = b; - write_offset += 1; - } - None => break, - } + encode_to_slice_or_else_to_queue( + c, + &mut queue, + &mut
self[..read_offset], + &mut write_offset, + ); } } assert_eq!(read_offset, self.len()); @@ -708,19 +704,21 @@ impl [u8] { #[unstable(issue = "none", feature = "std_internals")] #[allow(dead_code)] /// Safety: - /// - Must be UTF-8 + /// - Must be valid UTF-8 pub unsafe fn make_utf8_lowercase(&mut self) -> Result> { let mut queue = VecDeque::new(); let mut read_offset = 0; let mut write_offset = 0; - let mut buffer = [0; 4]; let mut final_sigma_automata = FinalSigmaAutomata::new(); while let Some((codepoint, width)) = unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) } { read_offset += width; + // Queue must be flushed before encode_to_slice_or_else_to_queue is + // called to ensure proper order of bytes + dump_queue(&mut queue, &mut self[..read_offset], &mut write_offset); let uppercase_char = unsafe { char::from_u32_unchecked(codepoint) }; if uppercase_char == 'Σ' { // Σ maps to σ, except at the end of a word where it maps to ς. @@ -729,26 +727,23 @@ impl [u8] { let is_word_final = final_sigma_automata.is_accepting() && !case_ignorable_then_cased(rest.chars()); let sigma_lowercase = if is_word_final { 'ς' } else { 'σ' }; - let l = sigma_lowercase.len_utf8(); - sigma_lowercase.encode_utf8(&mut buffer); - queue.extend(&buffer[..l]); + encode_to_slice_or_else_to_queue( + sigma_lowercase, + &mut queue, + &mut self[..read_offset], + &mut write_offset, + ); } else { for c in uppercase_char.to_lowercase() { - let l = c.len_utf8(); - c.encode_utf8(&mut buffer); - queue.extend(&buffer[..l]); + encode_to_slice_or_else_to_queue( + c, + &mut queue, + &mut self[..read_offset], + &mut write_offset, + ); } } final_sigma_automata.step(uppercase_char); - while write_offset < read_offset { - match queue.pop_front() { - Some(b) => { - self[write_offset] = b; - write_offset += 1; - } - None => break, - } - } } assert_eq!(read_offset, self.len()); return if write_offset < read_offset { Ok(write_offset) } else { Err(queue) }; @@ -764,6 +759,33 @@ impl [u8] 
{ } } +fn encode_to_slice_or_else_to_queue( + c: char, + queue: &mut VecDeque, + slice: &mut [u8], + write_offset: &mut usize, +) { + let mut buffer = [0; 4]; + let len = c.encode_utf8(&mut buffer).len(); + let writable_slice = &mut slice[*write_offset..]; + let direct_copy_length = core::cmp::min(len, writable_slice.len()); + writable_slice[..direct_copy_length].copy_from_slice(&buffer[..direct_copy_length]); + *write_offset += direct_copy_length; + queue.extend(&buffer[direct_copy_length..len]); +} + +fn dump_queue(queue: &mut VecDeque, slice: &mut [u8], write_offset: &mut usize) { + while *write_offset < slice.len() { + match queue.pop_front() { + Some(b) => { + slice[*write_offset] = b; + *write_offset += 1; + } + None => break, + } + } +} + #[derive(Clone)] enum FinalSigmaAutomata { Init, From 6f1f32ed69b0896b4aaacec4b1a4df11ca3b9440 Mon Sep 17 00:00:00 2001 From: Arthur Carcano Date: Tue, 18 Feb 2025 12:17:10 +0100 Subject: [PATCH 4/4] Add needed no_global_oom_handling cfg to make_case methods --- library/alloc/src/slice.rs | 167 +----------------- .../alloc/src/slice/byte_slice_make_case.rs | 162 +++++++++++++++++ library/alloc/src/string.rs | 2 + 3 files changed, 169 insertions(+), 162 deletions(-) create mode 100644 library/alloc/src/slice/byte_slice_make_case.rs diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs index 40287e9fa1040..7826151ab6bda 100644 --- a/library/alloc/src/slice.rs +++ b/library/alloc/src/slice.rs @@ -79,9 +79,13 @@ use crate::alloc::Global; #[cfg(not(no_global_oom_handling))] use crate::borrow::ToOwned; use crate::boxed::Box; -use crate::collections::VecDeque; use crate::vec::Vec; +// Using a module here allows having the no_global_oom_handling +// in only one place +#[cfg(not(no_global_oom_handling))] +mod byte_slice_make_case; + // HACK(japaric): With cfg(test) `impl [T]` is not available, these three // functions are actually methods that are in `impl [T]` but not in // `core::slice::SliceExt` - we need to supply 
these functions for the @@ -666,167 +670,6 @@ impl [T] { } } -#[cfg(not(test))] -impl [u8] { - #[rustc_allow_incoherent_impl] - #[unstable(issue = "none", feature = "std_internals")] - #[allow(dead_code)] - /// Safety: - /// - Must be valid UTF-8 - pub unsafe fn make_utf8_uppercase(&mut self) -> Result> { - let mut queue = VecDeque::new(); - - let mut read_offset = 0; - let mut write_offset = 0; - - while let Some((codepoint, width)) = - unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) } - { - read_offset += width; - // Queue must be flushed before encode_to_slice_or_else_to_queue is - // called to ensure proper order of bytes - dump_queue(&mut queue, &mut self[..read_offset], &mut write_offset); - let lowercase_char = unsafe { char::from_u32_unchecked(codepoint) }; - for c in lowercase_char.to_uppercase() { - encode_to_slice_or_else_to_queue( - c, - &mut queue, - &mut self[..read_offset], - &mut write_offset, - ); - } - } - assert_eq!(read_offset, self.len()); - if write_offset < read_offset { Ok(write_offset) } else { Err(queue) } - } - - #[rustc_allow_incoherent_impl] - #[unstable(issue = "none", feature = "std_internals")] - #[allow(dead_code)] - /// Safety: - /// - Must be valid UTF-8 - pub unsafe fn make_utf8_lowercase(&mut self) -> Result> { - let mut queue = VecDeque::new(); - - let mut read_offset = 0; - let mut write_offset = 0; - - let mut final_sigma_automata = FinalSigmaAutomata::new(); - while let Some((codepoint, width)) = - unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) } - { - read_offset += width; - // Queue must be flushed before encode_to_slice_or_else_to_queue is - // called to ensure proper order of bytes - dump_queue(&mut queue, &mut self[..read_offset], &mut write_offset); - let uppercase_char = unsafe { char::from_u32_unchecked(codepoint) }; - if uppercase_char == 'Σ' { - // Σ maps to σ, except at the end of a word where it maps to ς. 
- // See core::str::to_lowercase - let rest = unsafe { core::str::from_utf8_unchecked(&self[read_offset..]) }; - let is_word_final = - final_sigma_automata.is_accepting() && !case_ignorable_then_cased(rest.chars()); - let sigma_lowercase = if is_word_final { 'ς' } else { 'σ' }; - encode_to_slice_or_else_to_queue( - sigma_lowercase, - &mut queue, - &mut self[..read_offset], - &mut write_offset, - ); - } else { - for c in uppercase_char.to_lowercase() { - encode_to_slice_or_else_to_queue( - c, - &mut queue, - &mut self[..read_offset], - &mut write_offset, - ); - } - } - final_sigma_automata.step(uppercase_char); - } - assert_eq!(read_offset, self.len()); - return if write_offset < read_offset { Ok(write_offset) } else { Err(queue) }; - - // For now this is copy pasted from core::str, FIXME: DRY - fn case_ignorable_then_cased>(iter: I) -> bool { - use core::unicode::{Case_Ignorable, Cased}; - match iter.skip_while(|&c| Case_Ignorable(c)).next() { - Some(c) => Cased(c), - None => false, - } - } - } -} - -fn encode_to_slice_or_else_to_queue( - c: char, - queue: &mut VecDeque, - slice: &mut [u8], - write_offset: &mut usize, -) { - let mut buffer = [0; 4]; - let len = c.encode_utf8(&mut buffer).len(); - let writable_slice = &mut slice[*write_offset..]; - let direct_copy_length = core::cmp::min(len, writable_slice.len()); - writable_slice[..direct_copy_length].copy_from_slice(&buffer[..direct_copy_length]); - *write_offset += direct_copy_length; - queue.extend(&buffer[direct_copy_length..len]); -} - -fn dump_queue(queue: &mut VecDeque, slice: &mut [u8], write_offset: &mut usize) { - while *write_offset < slice.len() { - match queue.pop_front() { - Some(b) => { - slice[*write_offset] = b; - *write_offset += 1; - } - None => break, - } - } -} - -#[derive(Clone)] -enum FinalSigmaAutomata { - Init, - Accepted, -} - -impl FinalSigmaAutomata { - fn new() -> Self { - Self::Init - } - - fn is_accepting(&self) -> bool { - match self { - FinalSigmaAutomata::Accepted => true, - 
FinalSigmaAutomata::Init => false, - } - } - - fn step(&mut self, c: char) { - use core::unicode::{Case_Ignorable, Cased}; - - use FinalSigmaAutomata::*; - *self = match self { - Init => { - if Cased(c) { - Accepted - } else { - Init - } - } - Accepted => { - if Cased(c) || Case_Ignorable(c) { - Accepted - } else { - Init - } - } - } - } -} - #[cfg(not(test))] impl [u8] { /// Returns a vector containing a copy of this slice where each byte diff --git a/library/alloc/src/slice/byte_slice_make_case.rs b/library/alloc/src/slice/byte_slice_make_case.rs new file mode 100644 index 0000000000000..09bb9842c48b5 --- /dev/null +++ b/library/alloc/src/slice/byte_slice_make_case.rs @@ -0,0 +1,162 @@ +use crate::collections::VecDeque; + +#[cfg(not(test))] +impl [u8] { + #[rustc_allow_incoherent_impl] + #[unstable(issue = "none", feature = "std_internals")] + #[allow(dead_code)] + /// Safety: + /// - Must be valid UTF-8 + pub unsafe fn make_utf8_uppercase(&mut self) -> Result> { + let mut queue = VecDeque::new(); + + let mut read_offset = 0; + let mut write_offset = 0; + + while let Some((codepoint, width)) = + unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) } + { + read_offset += width; + // Queue must be flushed before encode_to_slice_or_else_to_queue is + // called to ensure proper order of bytes + dump_queue(&mut queue, &mut self[..read_offset], &mut write_offset); + let lowercase_char = unsafe { char::from_u32_unchecked(codepoint) }; + for c in lowercase_char.to_uppercase() { + encode_to_slice_or_else_to_queue( + c, + &mut queue, + &mut self[..read_offset], + &mut write_offset, + ); + } + } + assert_eq!(read_offset, self.len()); + if write_offset < read_offset { Ok(write_offset) } else { Err(queue) } + } + + #[rustc_allow_incoherent_impl] + #[unstable(issue = "none", feature = "std_internals")] + #[allow(dead_code)] + /// Safety: + /// - Must be valid UTF-8 + pub unsafe fn make_utf8_lowercase(&mut self) -> Result> { + let mut queue = 
VecDeque::new(); + + let mut read_offset = 0; + let mut write_offset = 0; + + let mut final_sigma_automata = FinalSigmaAutomata::new(); + while let Some((codepoint, width)) = + unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) } + { + read_offset += width; + // Queue must be flushed before encode_to_slice_or_else_to_queue is + // called to ensure proper order of bytes + dump_queue(&mut queue, &mut self[..read_offset], &mut write_offset); + let uppercase_char = unsafe { char::from_u32_unchecked(codepoint) }; + if uppercase_char == 'Σ' { + // Σ maps to σ, except at the end of a word where it maps to ς. + // See core::str::to_lowercase + let rest = unsafe { core::str::from_utf8_unchecked(&self[read_offset..]) }; + let is_word_final = + final_sigma_automata.is_accepting() && !case_ignorable_then_cased(rest.chars()); + let sigma_lowercase = if is_word_final { 'ς' } else { 'σ' }; + encode_to_slice_or_else_to_queue( + sigma_lowercase, + &mut queue, + &mut self[..read_offset], + &mut write_offset, + ); + } else { + for c in uppercase_char.to_lowercase() { + encode_to_slice_or_else_to_queue( + c, + &mut queue, + &mut self[..read_offset], + &mut write_offset, + ); + } + } + final_sigma_automata.step(uppercase_char); + } + assert_eq!(read_offset, self.len()); + return if write_offset < read_offset { Ok(write_offset) } else { Err(queue) }; + + // For now this is copy pasted from core::str, FIXME: DRY + fn case_ignorable_then_cased>(iter: I) -> bool { + use core::unicode::{Case_Ignorable, Cased}; + match iter.skip_while(|&c| Case_Ignorable(c)).next() { + Some(c) => Cased(c), + None => false, + } + } + } +} + +fn encode_to_slice_or_else_to_queue( + c: char, + queue: &mut VecDeque, + slice: &mut [u8], + write_offset: &mut usize, +) { + let mut buffer = [0; 4]; + let len = c.encode_utf8(&mut buffer).len(); + let writable_slice = &mut slice[*write_offset..]; + let direct_copy_length = core::cmp::min(len, writable_slice.len()); + 
writable_slice[..direct_copy_length].copy_from_slice(&buffer[..direct_copy_length]); + *write_offset += direct_copy_length; + queue.extend(&buffer[direct_copy_length..len]); +} + +fn dump_queue(queue: &mut VecDeque, slice: &mut [u8], write_offset: &mut usize) { + while *write_offset < slice.len() { + match queue.pop_front() { + Some(b) => { + slice[*write_offset] = b; + *write_offset += 1; + } + None => break, + } + } +} + +#[derive(Clone)] +enum FinalSigmaAutomata { + Init, + Accepted, +} + +impl FinalSigmaAutomata { + fn new() -> Self { + Self::Init + } + + fn is_accepting(&self) -> bool { + match self { + FinalSigmaAutomata::Accepted => true, + FinalSigmaAutomata::Init => false, + } + } + + fn step(&mut self, c: char) { + use core::unicode::{Case_Ignorable, Cased}; + + use FinalSigmaAutomata::*; + *self = match self { + Init => { + if Cased(c) { + Accepted + } else { + Init + } + } + Accepted => { + if Cased(c) || Case_Ignorable(c) { + Accepted + } else { + Init + } + } + } + } +} diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 889bfba6e0474..726e3d87d9f09 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -1127,6 +1127,7 @@ impl String { self.vec.extend_from_slice(string.as_bytes()) } + #[cfg(not(no_global_oom_handling))] #[unstable(feature = "string_make_uplowercase", issue = "135885")] #[allow(missing_docs)] pub fn make_uppercase(&mut self) { @@ -1139,6 +1140,7 @@ impl String { *self = unsafe { Self::from_utf8_unchecked(v) } } + #[cfg(not(no_global_oom_handling))] #[unstable(feature = "string_make_uplowercase", issue = "135885")] #[allow(missing_docs)] pub fn make_lowercase(&mut self) {