Skip to content

Modify `next_code_point()` to accept an `Iterator<Item = u8>` instead of an `Iterator<Item = &u8>`; existing callers now adapt their iterators with `.copied()` #96019

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions library/alloc/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#![feature(panic_update_hook)]
#![feature(slice_flatten)]
#![feature(thin_box)]
#![feature(str_internals)]

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
Expand Down
69 changes: 69 additions & 0 deletions library/alloc/tests/str.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use core::str::next_code_point;
use std::assert_matches::assert_matches;
use std::borrow::Cow;
use std::cmp::Ordering::{Equal, Greater, Less};
Expand Down Expand Up @@ -2367,3 +2368,71 @@ fn ceil_char_boundary() {
fn ceil_char_boundary_above_len_panic() {
let _ = "x".ceil_char_boundary(2);
}

/// Asserts that `iter` decodes to exactly the code points of `expected`.
///
/// Every `char` of `expected` must be produced, in order, by successive calls to
/// `next_code_point`, and one further call must return `None`.
///
/// Callers must pass an iterator whose bytes form a valid UTF-8 encoding, because that is
/// the contract of `next_code_point`.
fn check_decoded_string<I: Iterator<Item = u8>>(mut iter: &mut I, expected: &str) {
    expected.chars().for_each(|want| {
        // SAFETY: callers of this helper supply a byte stream that is valid UTF-8.
        let decoded = unsafe { next_code_point(&mut iter) };
        assert_eq!(Some(want as u32), decoded);
    });

    // SAFETY: as above; an exhausted iterator simply makes the call return `None`.
    let trailing = unsafe { next_code_point(&mut iter) };
    assert_eq!(None, trailing);
}

/// Decodes byte streams supplied by value (`Item = u8`) from a copied slice iterator and
/// from an owned array iterator.
#[test]
pub fn dirt_simple_code_point() {
    // Pure ASCII: every byte is its own code point.
    let mut ascii = b"banana".iter().copied();
    check_decoded_string(&mut ascii, "banana");

    // ASCII mixed with three-byte sequences (the UTF-8 encoding of "大賢者").
    let mixed = [
        b'd', b'a', b'i', b' ', 229, 164, 167, 232, 179, 162, 232, 128, 133, b' ', b'k', b'e',
        b'n', b'j', b'a',
    ];
    let mut owned = mixed.into_iter();
    check_decoded_string(&mut owned, "dai 大賢者 kenja");
}

/// Byte iterator that unpacks a buffer of densely packed 6-bit values.
///
/// Values are read most-significant-bit first and each one is offset by `b' '` before being
/// yielded, so every item lies in `0x20..=0x5F`.
struct Shenanigans {
    /// The packed 6-bit values.
    compressed: Vec<u8>,
    /// Bit offset into `compressed` of the next value to unpack.
    cursor: usize,
}

// This provides an example of a u8 iterator which cannot use `Item = &u8`:
// the bytes it yields are computed on the fly and never exist in memory.
// A real-world case is a string stored in progmem on an AVR, which cannot
// be incorporated into a unit test.
impl Iterator for Shenanigans {
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        let bit = self.cursor % 8;
        let first = self.cursor / 8;
        let last = (self.cursor + 6 - 1) / 8;

        // Stop as soon as the value would run past the end of the buffer.
        // `first <= last`, so a valid `last` makes indexing `first` safe too.
        let tail = *self.compressed.get(last)?;
        let head = self.compressed[first];

        let sextet = if first == last {
            // The whole value sits inside one byte (`bit` is at most 2 here).
            (head >> (2 - bit)) & 0x3f
        } else {
            // The value straddles two bytes (`bit` is at least 3 here): take the
            // low bits of `head` and the high bits of `tail`.
            ((head << (bit - 2)) | (tail >> (10 - bit))) & 0x3f
        };

        self.cursor += 6;
        Some(sextet + b' ')
    }
}

/// Decodes code points from `Shenanigans`, an iterator that can only yield `u8` by value,
/// and checks that decoding stops exactly where the expected text ends.
#[test]
pub fn fancy_code_point() {
    let mut iter =
        Shenanigans { compressed: vec![142, 139, 236, 228, 10, 238, 166, 122, 52], cursor: 0 };
    for expected in "CHOLY KNIGHT".chars() {
        // SAFETY: `Shenanigans` only yields a 6-bit value plus `b' '`, i.e. ASCII bytes,
        // which are valid UTF-8.
        assert_eq!(Some(expected as u32), unsafe { next_code_point(&mut iter) });
    }
    // The nine packed bytes hold exactly twelve 6-bit values, so the source must now be
    // exhausted; without this check, trailing garbage would go unnoticed.
    // SAFETY: as above.
    assert_eq!(None, unsafe { next_code_point(&mut iter) });
}
9 changes: 7 additions & 2 deletions library/core/src/str/iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ impl<'a> Iterator for Chars<'a> {
fn next(&mut self) -> Option<char> {
// SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and
// the resulting `ch` is a valid Unicode Scalar Value.
unsafe { next_code_point(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) }
unsafe {
next_code_point(&mut (&mut self.iter).copied()).map(|ch| char::from_u32_unchecked(ch))
}
}

#[inline]
Expand Down Expand Up @@ -81,7 +83,10 @@ impl<'a> DoubleEndedIterator for Chars<'a> {
fn next_back(&mut self) -> Option<char> {
// SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and
// the resulting `ch` is a valid Unicode Scalar Value.
unsafe { next_code_point_reverse(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) }
unsafe {
next_code_point_reverse(&mut (&mut self.iter).copied())
.map(|ch| char::from_u32_unchecked(ch))
}
}
}

Expand Down
22 changes: 11 additions & 11 deletions library/core/src/str/validations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
pub unsafe fn next_code_point<I: Iterator<Item = u8>>(bytes: &mut I) -> Option<u32> {
// Decode UTF-8
let x = *bytes.next()?;
let x = bytes.next()?;
if x < 128 {
return Some(x as u32);
}
Expand All @@ -46,22 +46,22 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
let init = utf8_first_byte(x, 2);
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let y = unsafe { *bytes.next().unwrap_unchecked() };
let y = unsafe { bytes.next().unwrap_unchecked() };
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let z = unsafe { *bytes.next().unwrap_unchecked() };
let z = unsafe { bytes.next().unwrap_unchecked() };
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let w = unsafe { *bytes.next().unwrap_unchecked() };
let w = unsafe { bytes.next().unwrap_unchecked() };
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
}
Expand All @@ -76,12 +76,12 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
///
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
#[inline]
pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
pub(super) unsafe fn next_code_point_reverse<I>(bytes: &mut I) -> Option<u32>
where
I: DoubleEndedIterator<Item = &'a u8>,
I: DoubleEndedIterator<Item = u8>,
{
// Decode UTF-8
let w = match *bytes.next_back()? {
let w = match bytes.next_back()? {
next_byte if next_byte < 128 => return Some(next_byte as u32),
back_byte => back_byte,
};
Expand All @@ -91,17 +91,17 @@ where
let mut ch;
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let z = unsafe { *bytes.next_back().unwrap_unchecked() };
let z = unsafe { bytes.next_back().unwrap_unchecked() };
ch = utf8_first_byte(z, 2);
if utf8_is_cont_byte(z) {
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let y = unsafe { *bytes.next_back().unwrap_unchecked() };
let y = unsafe { bytes.next_back().unwrap_unchecked() };
ch = utf8_first_byte(y, 3);
if utf8_is_cont_byte(y) {
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let x = unsafe { *bytes.next_back().unwrap_unchecked() };
let x = unsafe { bytes.next_back().unwrap_unchecked() };
ch = utf8_first_byte(x, 4);
ch = utf8_acc_cont_byte(ch, y);
}
Expand Down
2 changes: 1 addition & 1 deletion library/std/src/sys_common/wtf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -847,7 +847,7 @@ impl<'a> Iterator for Wtf8CodePoints<'a> {
#[inline]
fn next(&mut self) -> Option<CodePoint> {
// SAFETY: `self.bytes` has been created from a WTF-8 string
unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) }
unsafe { next_code_point(&mut (&mut self.bytes).copied()).map(|c| CodePoint { value: c }) }
}

#[inline]
Expand Down