Skip to content

Commit 9bbd949

Browse files
committed
First prototype of make_uppercase
1 parent 252b07b commit 9bbd949

File tree

6 files changed

+97
-5
lines changed

6 files changed

+97
-5
lines changed

library/alloc/src/slice.rs

+42
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ use crate::alloc::Global;
7979
#[cfg(not(no_global_oom_handling))]
8080
use crate::borrow::ToOwned;
8181
use crate::boxed::Box;
82+
use crate::collections::VecDeque;
8283
use crate::vec::Vec;
8384

8485
// HACK(japaric): With cfg(test) `impl [T]` is not available, these three
@@ -665,6 +666,47 @@ impl<T> [T] {
665666
}
666667
}
667668

669+
// Empty crate-private function: it takes no arguments, returns nothing and has
// an empty body. `String::make_uppercase` calls it before uppercasing.
// NOTE(review): looks like a prototype placeholder — confirm whether it is
// still needed.
pub(crate) fn blblb() {}
670+
671+
#[cfg(not(test))]
672+
impl [u8] {
673+
#[rustc_allow_incoherent_impl]
674+
#[unstable(issue = "none", feature = "std_internals")]
675+
#[allow(dead_code)]
676+
/// Safety:
677+
/// - Must be UTF-8
678+
pub unsafe fn make_utf8_uppercase(&mut self) -> Result<usize, VecDeque<u8>> {
679+
let mut queue = VecDeque::new();
680+
681+
let mut read_offset = 0;
682+
let mut write_offset = 0;
683+
684+
let mut buffer = [0; 4];
685+
while let Some((codepoint, width)) =
686+
unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) }
687+
{
688+
read_offset += width;
689+
let lowercase_char = unsafe { char::from_u32_unchecked(codepoint) };
690+
for c in lowercase_char.to_uppercase() {
691+
let l = c.len_utf8();
692+
c.encode_utf8(&mut buffer);
693+
queue.extend(&buffer[..l]);
694+
}
695+
while write_offset < read_offset {
696+
match queue.pop_front() {
697+
Some(b) => {
698+
self[write_offset] = b;
699+
write_offset += 1;
700+
}
701+
None => break,
702+
}
703+
}
704+
}
705+
assert_eq!(read_offset, self.len());
706+
if write_offset < read_offset { Ok(write_offset) } else { Err(queue) }
707+
}
708+
}
709+
668710
#[cfg(not(test))]
669711
impl [u8] {
670712
/// Returns a vector containing a copy of this slice where each byte

library/alloc/src/string.rs

+13
Original file line numberDiff line numberDiff line change
@@ -1127,6 +1127,19 @@ impl String {
11271127
self.vec.extend_from_slice(string.as_bytes())
11281128
}
11291129

1130+
#[unstable(feature = "string_make_uplowercase", issue = "135885")]
1131+
#[allow(missing_docs)]
1132+
pub fn make_uppercase(&mut self) {
1133+
super::slice::blblb();
1134+
let mut v = core::mem::take(self).vec;
1135+
let res = unsafe { v.make_utf8_uppercase() };
1136+
match res {
1137+
Ok(n) => v.truncate(n),
1138+
Err(queue) => v.extend(queue),
1139+
}
1140+
*self = unsafe { Self::from_utf8_unchecked(v) }
1141+
}
1142+
11301143
/// Copies elements from `src` range to the end of the string.
11311144
///
11321145
/// # Panics

library/alloc/tests/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
#![allow(internal_features)]
4444
#![deny(fuzzy_provenance_casts)]
4545
#![deny(unsafe_op_in_unsafe_fn)]
46+
#![feature(string_make_uplowercase)]
4647

4748
extern crate test;
4849

library/alloc/tests/string.rs

+17
Original file line numberDiff line numberDiff line change
@@ -903,3 +903,20 @@ fn test_str_concat() {
903903
let s: String = format!("{a}{b}");
904904
assert_eq!(s.as_bytes()[9], 'd' as u8);
905905
}
906+
907+
#[test]
fn make_uppercase() {
    // Every input is uppercased in place and compared with the result of
    // `str::to_uppercase` on the same text.
    let inputs = [
        "",
        "abcde",
        // 4 to 9 bytes
        "ǰΐ",
        // 10*3 to 10*2 bytes
        "ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ",
        "aéDžßfiᾀ",
    ];
    for input in inputs {
        let expected = input.to_uppercase();
        let mut actual = String::from(input);
        actual.make_uppercase();
        assert_eq!(actual, expected);
    }
}

library/core/src/str/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ pub use lossy::{Utf8Chunk, Utf8Chunks};
5757
#[stable(feature = "rust1", since = "1.0.0")]
5858
pub use traits::FromStr;
5959
#[unstable(feature = "str_internals", issue = "none")]
60-
pub use validations::{next_code_point, utf8_char_width};
60+
pub use validations::{next_code_point, next_code_point_with_width, utf8_char_width};
6161

6262
#[inline(never)]
6363
#[cold]

library/core/src/str/validations.rs

+23-4
Original file line numberDiff line numberDiff line change
@@ -26,18 +26,21 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
2626
}
2727

2828
/// Reads the next code point out of a byte iterator (assuming a
29-
/// UTF-8-like encoding).
29+
/// UTF-8-like encoding) and returns it along with its width.
3030
///
3131
/// # Safety
3232
///
3333
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
3434
#[unstable(feature = "str_internals", issue = "none")]
3535
#[inline]
36-
pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
36+
#[allow(dead_code)]
37+
pub unsafe fn next_code_point_with_width<'a, I: Iterator<Item = &'a u8>>(
38+
bytes: &mut I,
39+
) -> Option<(u32, usize)> {
3740
// Decode UTF-8
3841
let x = *bytes.next()?;
3942
if x < 128 {
40-
return Some(x as u32);
43+
return Some((x as u32, 1));
4144
}
4245

4346
// Multibyte case follows
@@ -47,13 +50,15 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
4750
// SAFETY: `bytes` produces an UTF-8-like string,
4851
// so the iterator must produce a value here.
4952
let y = unsafe { *bytes.next().unwrap_unchecked() };
53+
let mut width = 2;
5054
let mut ch = utf8_acc_cont_byte(init, y);
5155
if x >= 0xE0 {
5256
// [[x y z] w] case
5357
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
5458
// SAFETY: `bytes` produces an UTF-8-like string,
5559
// so the iterator must produce a value here.
5660
let z = unsafe { *bytes.next().unwrap_unchecked() };
61+
width = 3;
5762
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
5863
ch = init << 12 | y_z;
5964
if x >= 0xF0 {
@@ -62,11 +67,25 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
6267
// SAFETY: `bytes` produces an UTF-8-like string,
6368
// so the iterator must produce a value here.
6469
let w = unsafe { *bytes.next().unwrap_unchecked() };
70+
width = 4;
6571
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
6672
}
6773
}
6874

69-
Some(ch)
75+
Some((ch, width))
76+
}
77+
78+
/// Reads the next code point out of a byte iterator (assuming a
79+
/// UTF-8-like encoding).
80+
///
81+
/// # Safety
82+
///
83+
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
84+
#[unstable(feature = "str_internals", issue = "none")]
85+
#[inline]
86+
pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
87+
// SAFETY: same call condition
88+
Some(unsafe { next_code_point_with_width(bytes) }?.0)
7089
}
7190

7291
/// Reads the last code point out of a byte iterator (assuming a

0 commit comments

Comments
 (0)