Skip to content

Commit fffbb33

Browse files
committed
First prototype of make_lowercase
1 parent c5b0e91 commit fffbb33

File tree

3 files changed

+172
-0
lines changed

3 files changed

+172
-0
lines changed

library/alloc/src/slice.rs

+100
Original file line numberDiff line numberDiff line change
@@ -703,6 +703,106 @@ impl [u8] {
703703
assert_eq!(read_offset, self.len());
704704
if write_offset < read_offset { Ok(write_offset) } else { Err(queue) }
705705
}
706+
707+
#[rustc_allow_incoherent_impl]
708+
#[unstable(issue = "none", feature = "std_internals")]
709+
#[allow(dead_code)]
710+
/// Safety:
711+
/// - Must be UTF-8
712+
pub unsafe fn make_utf8_lowercase(&mut self) -> Result<usize, VecDeque<u8>> {
713+
let mut queue = VecDeque::new();
714+
715+
let mut read_offset = 0;
716+
let mut write_offset = 0;
717+
718+
let mut buffer = [0; 4];
719+
let mut final_sigma_automata = FinalSigmaAutomata::new();
720+
while let Some((codepoint, width)) =
721+
unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) }
722+
{
723+
read_offset += width;
724+
let uppercase_char = unsafe { char::from_u32_unchecked(codepoint) };
725+
if uppercase_char == 'Σ' {
726+
// Σ maps to σ, except at the end of a word where it maps to ς.
727+
// See core::str::to_lowercase
728+
let rest = unsafe { core::str::from_utf8_unchecked(&self[read_offset..]) };
729+
let is_word_final =
730+
final_sigma_automata.is_accepting() && !case_ignorable_then_cased(rest.chars());
731+
let sigma_lowercase = if is_word_final { 'ς' } else { 'σ' };
732+
let l = sigma_lowercase.len_utf8();
733+
sigma_lowercase.encode_utf8(&mut buffer);
734+
queue.extend(&buffer[..l]);
735+
} else {
736+
for c in uppercase_char.to_lowercase() {
737+
let l = c.len_utf8();
738+
c.encode_utf8(&mut buffer);
739+
queue.extend(&buffer[..l]);
740+
}
741+
}
742+
final_sigma_automata.step(uppercase_char);
743+
while write_offset < read_offset {
744+
match queue.pop_front() {
745+
Some(b) => {
746+
self[write_offset] = b;
747+
write_offset += 1;
748+
}
749+
None => break,
750+
}
751+
}
752+
}
753+
assert_eq!(read_offset, self.len());
754+
return if write_offset < read_offset { Ok(write_offset) } else { Err(queue) };
755+
756+
// For now this is copy pasted from core::str, FIXME: DRY
757+
fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
758+
use core::unicode::{Case_Ignorable, Cased};
759+
match iter.skip_while(|&c| Case_Ignorable(c)).next() {
760+
Some(c) => Cased(c),
761+
None => false,
762+
}
763+
}
764+
}
765+
}
766+
767+
/// State of the small automaton that tracks whether the characters consumed
/// so far allow a following `Σ` to be treated as word-final.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum FinalSigmaAutomata {
    /// Start state: a `Σ` at this position is not preceded by a cased letter.
    Init,
    /// A cased letter has been consumed and nothing since then has ended the
    /// word, so a `Σ` at this position may be word-final.
    Accepted,
}
772+
773+
impl FinalSigmaAutomata {
774+
fn new() -> Self {
775+
Self::Init
776+
}
777+
778+
fn is_accepting(&self) -> bool {
779+
match self {
780+
FinalSigmaAutomata::Accepted => true,
781+
FinalSigmaAutomata::Init => false,
782+
}
783+
}
784+
785+
fn step(&mut self, c: char) {
786+
use core::unicode::{Case_Ignorable, Cased};
787+
788+
use FinalSigmaAutomata::*;
789+
*self = match self {
790+
Init => {
791+
if Cased(c) {
792+
Accepted
793+
} else {
794+
Init
795+
}
796+
}
797+
Accepted => {
798+
if Cased(c) || Case_Ignorable(c) {
799+
Accepted
800+
} else {
801+
Init
802+
}
803+
}
804+
}
805+
}
706806
}
707807

708808
#[cfg(not(test))]

library/alloc/src/string.rs

+12
Original file line numberDiff line numberDiff line change
@@ -1139,6 +1139,18 @@ impl String {
11391139
*self = unsafe { Self::from_utf8_unchecked(v) }
11401140
}
11411141

1142+
#[unstable(feature = "string_make_uplowercase", issue = "135885")]
1143+
#[allow(missing_docs)]
1144+
pub fn make_lowercase(&mut self) {
1145+
let mut v = core::mem::take(self).vec;
1146+
let res = unsafe { v.make_utf8_lowercase() };
1147+
match res {
1148+
Ok(n) => v.truncate(n),
1149+
Err(queue) => v.extend(queue),
1150+
}
1151+
*self = unsafe { Self::from_utf8_unchecked(v) }
1152+
}
1153+
11421154
/// Copies elements from `src` range to the end of the string.
11431155
///
11441156
/// # Panics

library/alloc/tests/string.rs

+60
Original file line numberDiff line numberDiff line change
@@ -923,3 +923,63 @@ fn make_uppercase() {
923923
test("ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ");
924924
test("aéDžßfiᾀ");
925925
}
926+
927+
#[test]
fn make_lowercase() {
    /// Asserts that lowercasing `s` in place agrees with `str::to_lowercase`.
    fn test(s: &str) {
        let ground_truth = s.to_lowercase();
        let mut tested = String::from(s);
        tested.make_lowercase();
        assert!(
            tested == ground_truth,
            r#"When lowercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#
        );
    }

    let cases = [
        "",
        "AÉDžaé ",
        // https://github.com/rust-lang/rust/issues/26035
        "ΑΣ",
        "Α'Σ",
        "Α''Σ",
        "ΑΣ Α",
        "Α'Σ Α",
        "Α''Σ Α",
        "ΑΣ' Α",
        "ΑΣ'' Α",
        "Α'Σ' Α",
        "Α''Σ'' Α",
        "Α Σ",
        "Α 'Σ",
        "Α ''Σ",
        "Σ",
        "'Σ",
        "''Σ",
        "ΑΣΑ",
        "ΑΣ'Α",
        "ΑΣ''Α",
        // https://github.com/rust-lang/rust/issues/124714
        // input lengths around the boundary of the chunk size used by the ascii prefix optimization
        "abcdefghijklmnoΣ",
        "abcdefghijklmnopΣ",
        "abcdefghijklmnopqΣ",
    ];
    for case in cases {
        test(case);
    }

    // A really long string whose lowercase form is even longer. This tests
    // that implementations don't assume an incorrect upper bound on
    // allocations.
    test(&"İ".repeat(512));

    // A really long ASCII-only string. This tests that the ASCII hot path
    // functions correctly.
    test(&"A".repeat(511));
}

0 commit comments

Comments
 (0)