rust-lang · krtab · Jan 22, 2025 · Feb 17, 2025 · Feb 18, 2025 · Feb 18, 2025
diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs
@@ -81,6 +81,11 @@ use crate::borrow::ToOwned;
 use crate::boxed::Box;
 use crate::vec::Vec;
 
+// Putting these helpers in their own module lets a single
+// `#[cfg(not(no_global_oom_handling))]` attribute gate all of them.
+#[cfg(not(no_global_oom_handling))]
+mod byte_slice_make_case;
+
 // HACK(japaric): With cfg(test) `impl [T]` is not available, these three
 // functions are actually methods that are in `impl [T]` but not in
 // `core::slice::SliceExt` - we need to supply these functions for the

diff --git a/library/alloc/src/slice/byte_slice_make_case.rs b/library/alloc/src/slice/byte_slice_make_case.rs
@@ -0,0 +1,162 @@
+use crate::collections::VecDeque;
+
+#[cfg(not(test))]
+impl [u8] {
+    #[rustc_allow_incoherent_impl]
+    #[unstable(issue = "none", feature = "std_internals")]
+    #[allow(dead_code)]
+    /// Uppercases in place. `Ok(n)`: output is `self[..n]`. `Err(q)`: output is `self` then `q`.
+    /// Safety: `self` must be valid UTF-8.
+    pub unsafe fn make_utf8_uppercase(&mut self) -> Result<usize, VecDeque<u8>> {
+        let mut queue = VecDeque::new();
+
+        let mut read_offset = 0;
+        let mut write_offset = 0;
+
+        while let Some((codepoint, width)) =
+            unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) }
+        {
+            read_offset += width;
+            // Queue must be flushed before encode_to_slice_or_else_to_queue is
+            // called to ensure proper order of bytes
+            dump_queue(&mut queue, &mut self[..read_offset], &mut write_offset);
+            let lowercase_char = unsafe { char::from_u32_unchecked(codepoint) };
+            for c in lowercase_char.to_uppercase() {
+                encode_to_slice_or_else_to_queue(
+                    c,
+                    &mut queue,
+                    &mut self[..read_offset],
+                    &mut write_offset,
+                );
+            }
+        }
+        assert_eq!(read_offset, self.len());
+        if write_offset < read_offset { Ok(write_offset) } else { Err(queue) }
+    }
+
+    #[rustc_allow_incoherent_impl]
+    #[unstable(issue = "none", feature = "std_internals")]
+    #[allow(dead_code)]
+    /// Lowercases in place. `Ok(n)`: output is `self[..n]`. `Err(q)`: output is `self` then `q`.
+    /// Safety: `self` must be valid UTF-8.
+    pub unsafe fn make_utf8_lowercase(&mut self) -> Result<usize, VecDeque<u8>> {
+        let mut queue = VecDeque::new();
+
+        let mut read_offset = 0;
+        let mut write_offset = 0;
+
+        let mut final_sigma_automata = FinalSigmaAutomata::new();
+        while let Some((codepoint, width)) =
+            unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) }
+        {
+            read_offset += width;
+            // Queue must be flushed before encode_to_slice_or_else_to_queue is
+            // called to ensure proper order of bytes
+            dump_queue(&mut queue, &mut self[..read_offset], &mut write_offset);
+            let uppercase_char = unsafe { char::from_u32_unchecked(codepoint) };
+            if uppercase_char == 'Σ' {
+                // Σ maps to σ, except at the end of a word where it maps to ς.
+                // See core::str::to_lowercase
+                let rest = unsafe { core::str::from_utf8_unchecked(&self[read_offset..]) };
+                let is_word_final =
+                    final_sigma_automata.is_accepting() && !case_ignorable_then_cased(rest.chars());
+                let sigma_lowercase = if is_word_final { 'ς' } else { 'σ' };
+                encode_to_slice_or_else_to_queue(
+                    sigma_lowercase,
+                    &mut queue,
+                    &mut self[..read_offset],
+                    &mut write_offset,
+                );
+            } else {
+                for c in uppercase_char.to_lowercase() {
+                    encode_to_slice_or_else_to_queue(
+                        c,
+                        &mut queue,
+                        &mut self[..read_offset],
+                        &mut write_offset,
+                    );
+                }
+            }
+            final_sigma_automata.step(uppercase_char);
+        }
+        assert_eq!(read_offset, self.len());
+        return if write_offset < read_offset { Ok(write_offset) } else { Err(queue) };
+
+        // For now this is copy pasted from core::str, FIXME: DRY
+        fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
+            use core::unicode::{Case_Ignorable, Cased};
+            match iter.skip_while(|&c| Case_Ignorable(c)).next() {
+                Some(c) => Cased(c),
+                None => false,
+            }
+        }
+    }
+}
+
+/// Encodes `c` as UTF-8 into `slice` at `*write_offset`; bytes that do not fit go to `queue`.
+fn encode_to_slice_or_else_to_queue(
+    c: char,
+    queue: &mut VecDeque<u8>,
+    slice: &mut [u8],
+    write_offset: &mut usize,
+) {
+    let mut scratch = [0; 4];
+    let encoded = c.encode_utf8(&mut scratch).as_bytes();
+    let (fits, spills) = encoded.split_at(encoded.len().min(slice[*write_offset..].len()));
+    slice[*write_offset..][..fits.len()].copy_from_slice(fits);
+    *write_offset += fits.len();
+    queue.extend(spills);
+}
+
+/// Moves bytes from the front of `queue` into `slice`, starting at `*write_offset`.
+///
+/// Stops as soon as `slice` is full or `queue` is empty, and advances
+/// `*write_offset` by the number of bytes moved.
+fn dump_queue(queue: &mut VecDeque<u8>, slice: &mut [u8], write_offset: &mut usize) {
+    for slot in slice.iter_mut().skip(*write_offset) {
+        let Some(byte) = queue.pop_front() else { break };
+        *slot = byte;
+        *write_offset += 1;
+    }
+}
+
+/// Tracks, while the input is read left to right, whether a `Σ` read next could be
+/// word-final as far as the preceding text is concerned: the last character that
+/// is not `Case_Ignorable` must be `Cased`.
+///
+/// This mirrors the backward scan that `str::to_lowercase` runs on the text before
+/// a `Σ`, so that both make the same choice between `ς` and `σ`.
+#[derive(Clone)]
+enum FinalSigmaAutomata {
+    /// Nothing read yet, or the last non-ignorable character was not cased.
+    Init,
+    /// The last non-ignorable character was cased.
+    Accepted,
+}
+
+impl FinalSigmaAutomata {
+    /// Creates the automaton in its non-accepting start state.
+    fn new() -> Self {
+        Self::Init
+    }
+
+    /// Returns `true` when the characters fed so far allow a final sigma, i.e.
+    /// when the last one that is not case-ignorable was cased.
+    fn is_accepting(&self) -> bool {
+        matches!(self, Self::Accepted)
+    }
+
+    /// Feeds the next character of the input to the automaton.
+    fn step(&mut self, c: char) {
+        use core::unicode::{Case_Ignorable, Cased};
+
+        // A case-ignorable character never changes the state, even if it is also
+        // cased (e.g. U+02B0 `ʰ`). Entering `Accepted` on such a character from
+        // `Init` would turn "ʰΣ" into "ʰς" although `str::to_lowercase`, which
+        // skips case-ignorable characters before testing `Cased`, yields "ʰσ".
+        if Case_Ignorable(c) {
+            return;
+        }
+        *self = if Cased(c) { Self::Accepted } else { Self::Init };
+    }
+}
diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs
@@ -1127,6 +1127,32 @@ impl String {
         self.vec.extend_from_slice(string.as_bytes())
     }
 
+    /// Converts this string to its uppercase equivalent in place.
+    #[cfg(not(no_global_oom_handling))]
+    #[unstable(feature = "string_make_uplowercase", issue = "135885")]
+    pub fn make_uppercase(&mut self) {
+        let mut bytes = core::mem::take(self).vec;
+        // SAFETY: `bytes` is valid UTF-8 on entry (it was a `String`) and again after the match.
+        match unsafe { bytes.make_utf8_uppercase() } {
+            Ok(new_len) => bytes.truncate(new_len),
+            Err(overflow) => bytes.extend(overflow),
+        }
+        *self = unsafe { Self::from_utf8_unchecked(bytes) }
+    }
+
+    /// Converts this string to its lowercase equivalent in place.
+    #[cfg(not(no_global_oom_handling))]
+    #[unstable(feature = "string_make_uplowercase", issue = "135885")]
+    pub fn make_lowercase(&mut self) {
+        let mut bytes = core::mem::take(self).vec;
+        // SAFETY: `bytes` is valid UTF-8 on entry (it was a `String`) and again after the match.
+        match unsafe { bytes.make_utf8_lowercase() } {
+            Ok(new_len) => bytes.truncate(new_len),
+            Err(overflow) => bytes.extend(overflow),
+        }
+        *self = unsafe { Self::from_utf8_unchecked(bytes) }
+    }
+
     /// Copies elements from `src` range to the end of the string.
     ///
     /// # Panics

diff --git a/library/alloc/tests/lib.rs b/library/alloc/tests/lib.rs
@@ -43,6 +43,7 @@
 #![allow(internal_features)]
 #![deny(fuzzy_provenance_casts)]
 #![deny(unsafe_op_in_unsafe_fn)]
+#![feature(string_make_uplowercase)]
 
 extern crate test;
 

diff --git a/library/alloc/tests/string.rs b/library/alloc/tests/string.rs
@@ -903,3 +903,83 @@ fn test_str_concat() {
     let s: String = format!("{a}{b}");
     assert_eq!(s.as_bytes()[9], 'd' as u8);
 }
+
+#[test]
+fn make_uppercase() {
+    fn test(s: &str) {
+        let ground_truth = s.to_uppercase();
+        let mut tested = s.to_owned();
+        tested.make_uppercase();
+        assert!(
+            tested == ground_truth,
+            r#"When uppercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#
+        );
+    }
+    test("");
+    test("abcde");
+    // grows from 4 to 9 bytes when uppercased
+    test("ǰΐ");
+    // shrinks from 10*3 to 10*2 bytes when uppercased
+    test("ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ");
+    test("aéǅßﬁᾀ");
+}
+
+#[test]
+fn make_lowercase() {
+    fn test(s: &str) {
+        let ground_truth = s.to_lowercase();
+        let mut tested = s.to_owned();
+        tested.make_lowercase();
+        assert!(
+            tested == ground_truth,
+            r#"When lowercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#
+        );
+    }
+    test("");
+    test("AÉǅaé ");
+
+    // https://github.com/rust-lang/rust/issues/26035
+    test("ΑΣ");
+    test("Α'Σ");
+    test("Α''Σ");
+
+    test("ΑΣ Α");
+    test("Α'Σ Α");
+    test("Α''Σ Α");
+
+    test("ΑΣ' Α");
+    test("ΑΣ'' Α");
+
+    test("Α'Σ' Α");
+    test("Α''Σ'' Α");
+
+    test("Α Σ");
+    test("Α 'Σ");
+    test("Α ''Σ");
+
+    test("Σ");
+    test("'Σ");
+    test("''Σ");
+
+    test("ΑΣΑ");
+    test("ΑΣ'Α");
+    test("ΑΣ''Α");
+
+    // https://github.com/rust-lang/rust/issues/124714
+    // input lengths around the boundary of the chunk size used by the ascii prefix optimization
+    test("abcdefghijklmnoΣ");
+    test("abcdefghijklmnopΣ");
+    test("abcdefghijklmnopqΣ");
+
+    // a really long string whose lowercase form is
+    // even longer. This tests that implementations don't assume
+    // an incorrect upper bound on allocations
+    let upper = str::repeat("İ", 512);
+    test(&upper);
+
+    // a really long ascii-only string.
+    // This tests that the ascii hot-path
+    // functions correctly
+    let upper = str::repeat("A", 511);
+    test(&upper);
+}
diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs
@@ -57,7 +57,7 @@ pub use lossy::{Utf8Chunk, Utf8Chunks};
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use traits::FromStr;
 #[unstable(feature = "str_internals", issue = "none")]
-pub use validations::{next_code_point, utf8_char_width};
+pub use validations::{next_code_point, next_code_point_with_width, utf8_char_width};
 
 #[inline(never)]
 #[cold]

diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs
@@ -26,18 +26,21 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
 }
 
 /// Reads the next code point out of a byte iterator (assuming a
-/// UTF-8-like encoding).
+/// UTF-8-like encoding) and returns it along with its width.
 ///
 /// # Safety
 ///
 /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
 #[unstable(feature = "str_internals", issue = "none")]
 #[inline]
-pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
+#[allow(dead_code)]
+pub unsafe fn next_code_point_with_width<'a, I: Iterator<Item = &'a u8>>(
+    bytes: &mut I,
+) -> Option<(u32, usize)> {
     // Decode UTF-8
     let x = *bytes.next()?;
     if x < 128 {
-        return Some(x as u32);
+        return Some((x as u32, 1));
     }
 
     // Multibyte case follows
@@ -47,13 +50,15 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
     // SAFETY: `bytes` produces an UTF-8-like string,
     // so the iterator must produce a value here.
     let y = unsafe { *bytes.next().unwrap_unchecked() };
+    let mut width = 2;
     let mut ch = utf8_acc_cont_byte(init, y);
     if x >= 0xE0 {
         // [[x y z] w] case
         // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
         // SAFETY: `bytes` produces an UTF-8-like string,
         // so the iterator must produce a value here.
         let z = unsafe { *bytes.next().unwrap_unchecked() };
+        width = 3;
         let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
         ch = init << 12 | y_z;
         if x >= 0xF0 {
@@ -62,11 +67,25 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
             // SAFETY: `bytes` produces an UTF-8-like string,
             // so the iterator must produce a value here.
             let w = unsafe { *bytes.next().unwrap_unchecked() };
+            width = 4;
             ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
         }
     }
 
-    Some(ch)
+    Some((ch, width))
+}
+
+/// Reads the next code point out of a byte iterator (assuming a UTF-8-like
+/// encoding), dropping the width that `next_code_point_with_width` also returns.
+///
+/// # Safety
+///
+/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
+#[unstable(feature = "str_internals", issue = "none")]
+#[inline]
+pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
+    // SAFETY: the caller guarantees exactly what the callee requires.
+    unsafe { next_code_point_with_width(bytes) }.map(|(code_point, _width)| code_point)
 }
 
 /// Reads the last code point out of a byte iterator (assuming a