-
Notifications
You must be signed in to change notification settings - Fork 13.3k
/
Copy pathbyte_slice_make_case.rs
162 lines (148 loc) · 5.39 KB
/
byte_slice_make_case.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
use crate::collections::VecDeque;
#[cfg(not(test))]
impl [u8] {
    /// Converts the UTF-8 text stored in `self` to uppercase, reusing the
    /// slice as the output buffer.
    ///
    /// Code points are decoded from front to back. The uppercase form of each
    /// one is written into the part of the slice that has already been
    /// decoded; bytes that do not fit there are parked in a queue and moved
    /// into the slice once further decoding has made room for them.
    ///
    /// Returns `Ok(len)` when the converted text is strictly shorter than the
    /// slice: it then occupies `self[..len]` and the bytes after it are not
    /// part of the result. Otherwise returns `Err(queue)`: the whole slice
    /// holds converted bytes and `queue` holds, in order, the bytes that did
    /// not fit (none if the length is unchanged).
    ///
    /// # Safety
    ///
    /// - Must be valid UTF-8
    ///
    /// # Panics
    ///
    /// Panics if decoding stops before the end of the slice is reached.
    #[rustc_allow_incoherent_impl]
    #[unstable(issue = "none", feature = "std_internals")]
    #[allow(dead_code)]
    pub unsafe fn make_utf8_uppercase(&mut self) -> Result<usize, VecDeque<u8>> {
        let mut overflow = VecDeque::new();
        let (mut consumed, mut written) = (0, 0);
        loop {
            // SAFETY: the caller guarantees that `self` is valid UTF-8, and
            // only bytes before `consumed` have been overwritten so far, so
            // the unread tail is still the caller's original text.
            let decoded =
                unsafe { core::str::next_code_point_with_width(&mut self[consumed..].iter()) };
            let Some((code, width)) = decoded else { break };
            consumed += width;
            // Pending bytes must reach the slice before anything newer is
            // encoded, otherwise the output would be reordered.
            dump_queue(&mut overflow, &mut self[..consumed], &mut written);
            // SAFETY: `code` was decoded from text the caller guarantees to
            // be valid UTF-8, so it is a valid `char`.
            let source_char = unsafe { char::from_u32_unchecked(code) };
            for upper in source_char.to_uppercase() {
                encode_to_slice_or_else_to_queue(
                    upper,
                    &mut overflow,
                    &mut self[..consumed],
                    &mut written,
                );
            }
        }
        assert_eq!(consumed, self.len());
        if written < consumed { Ok(written) } else { Err(overflow) }
    }

    /// Converts the UTF-8 text stored in `self` to lowercase, reusing the
    /// slice as the output buffer.
    ///
    /// Works like [`make_utf8_uppercase`](Self::make_utf8_uppercase), with one
    /// special case: `Σ` becomes `ς` when the automaton fed with the preceding
    /// characters is accepting and the following text is not "case-ignorable
    /// characters, then a cased character"; in every other position it
    /// becomes `σ`.
    ///
    /// Returns `Ok(len)` when the converted text is strictly shorter than the
    /// slice: it then occupies `self[..len]` and the bytes after it are not
    /// part of the result. Otherwise returns `Err(queue)`: the whole slice
    /// holds converted bytes and `queue` holds, in order, the bytes that did
    /// not fit (none if the length is unchanged).
    ///
    /// # Safety
    ///
    /// - Must be valid UTF-8
    ///
    /// # Panics
    ///
    /// Panics if decoding stops before the end of the slice is reached.
    #[rustc_allow_incoherent_impl]
    #[unstable(issue = "none", feature = "std_internals")]
    #[allow(dead_code)]
    pub unsafe fn make_utf8_lowercase(&mut self) -> Result<usize, VecDeque<u8>> {
        // For now this is copy pasted from core::str, FIXME: DRY
        /// Returns `true` when the first character of `iter` that is not
        /// case-ignorable is cased; `false` if there is no such character.
        fn case_ignorable_then_cased<I: Iterator<Item = char>>(mut iter: I) -> bool {
            use core::unicode::{Case_Ignorable, Cased};
            iter.find(|&c| !Case_Ignorable(c)).map_or(false, |c| Cased(c))
        }

        let mut overflow = VecDeque::new();
        let (mut consumed, mut written) = (0, 0);
        // The text before the current position gets overwritten, so what
        // preceded a `Σ` is tracked incrementally instead of being re-read.
        let mut preceding = FinalSigmaAutomata::new();
        loop {
            // SAFETY: the caller guarantees that `self` is valid UTF-8, and
            // only bytes before `consumed` have been overwritten so far, so
            // the unread tail is still the caller's original text.
            let decoded =
                unsafe { core::str::next_code_point_with_width(&mut self[consumed..].iter()) };
            let Some((code, width)) = decoded else { break };
            consumed += width;
            // Pending bytes must reach the slice before anything newer is
            // encoded, otherwise the output would be reordered.
            dump_queue(&mut overflow, &mut self[..consumed], &mut written);
            // SAFETY: `code` was decoded from text the caller guarantees to
            // be valid UTF-8, so it is a valid `char`.
            let source_char = unsafe { char::from_u32_unchecked(code) };
            if source_char != 'Σ' {
                for lower in source_char.to_lowercase() {
                    encode_to_slice_or_else_to_queue(
                        lower,
                        &mut overflow,
                        &mut self[..consumed],
                        &mut written,
                    );
                }
            } else {
                // Σ maps to σ, except at the end of a word where it maps to ς.
                // See core::str::to_lowercase
                // SAFETY: `consumed` sits right after a decoded code point and
                // the bytes from there on are untouched, valid UTF-8.
                let tail = unsafe { core::str::from_utf8_unchecked(&self[consumed..]) };
                let word_final =
                    preceding.is_accepting() && !case_ignorable_then_cased(tail.chars());
                let sigma = if word_final { 'ς' } else { 'σ' };
                encode_to_slice_or_else_to_queue(
                    sigma,
                    &mut overflow,
                    &mut self[..consumed],
                    &mut written,
                );
            }
            preceding.step(source_char);
        }
        assert_eq!(consumed, self.len());
        if written < consumed { Ok(written) } else { Err(overflow) }
    }
}
/// Appends the UTF-8 encoding of `c` to the output.
///
/// As many bytes as fit are copied into `slice` starting at `*write_offset`,
/// and `*write_offset` is advanced by that amount. Any bytes that do not fit
/// are pushed to the back of `queue` in order.
///
/// The function itself does not look at what is already in `queue`; callers
/// that need ordered output must make sure the queue is empty whenever
/// `slice` still has room.
///
/// # Panics
///
/// Panics if `*write_offset` is greater than `slice.len()`.
fn encode_to_slice_or_else_to_queue(
    c: char,
    queue: &mut VecDeque<u8>,
    slice: &mut [u8],
    write_offset: &mut usize,
) {
    let mut scratch = [0u8; 4];
    let encoded = c.encode_utf8(&mut scratch).as_bytes();
    let free = &mut slice[*write_offset..];
    // Split the encoding into the part that fits in place and the overflow.
    let fits = encoded.len().min(free.len());
    let (in_place, spilled) = encoded.split_at(fits);
    free[..fits].copy_from_slice(in_place);
    *write_offset += fits;
    queue.extend(spilled);
}
/// Moves bytes from the front of `queue` into `slice`, starting at
/// `*write_offset`, until either the slice is full or the queue is empty.
///
/// `*write_offset` is advanced by the number of bytes moved. Nothing happens
/// when `*write_offset` is already at or past the end of `slice`.
fn dump_queue(queue: &mut VecDeque<u8>, slice: &mut [u8], write_offset: &mut usize) {
    for slot in slice.iter_mut().skip(*write_offset) {
        let Some(byte) = queue.pop_front() else { break };
        *slot = byte;
        *write_offset += 1;
    }
}
/// State of the small automaton that tracks, while text is scanned front to
/// back, whether the characters seen so far allow a following `Σ` to be
/// treated as word-final.
///
/// The enum carries no data, so it is `Copy` and comparable; `Debug` is
/// derived to make the state printable in assertions and diagnostics.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum FinalSigmaAutomata {
    /// Non-accepting state; also the starting state.
    Init,
    /// Accepting state.
    Accepted,
}
impl FinalSigmaAutomata {
fn new() -> Self {
Self::Init
}
fn is_accepting(&self) -> bool {
match self {
FinalSigmaAutomata::Accepted => true,
FinalSigmaAutomata::Init => false,
}
}
fn step(&mut self, c: char) {
use core::unicode::{Case_Ignorable, Cased};
use FinalSigmaAutomata::*;
*self = match self {
Init => {
if Cased(c) {
Accepted
} else {
Init
}
}
Accepted => {
if Cased(c) || Case_Ignorable(c) {
Accepted
} else {
Init
}
}
}
}
}