From 3c6b68f8bb1660c13d2a6662e4b9d86107e71bc9 Mon Sep 17 00:00:00 2001 From: Robert Forsman Date: Wed, 13 Apr 2022 14:36:27 -0400 Subject: [PATCH] modify next_code_point() to accept an Iterator<u8> instead of Iterator<&u8>. Old code that calls it invokes .copied() --- library/alloc/tests/lib.rs | 1 + library/alloc/tests/str.rs | 69 +++++++++++++++++++++++++++++ library/core/src/str/iter.rs | 9 +++- library/core/src/str/validations.rs | 22 ++++----- library/std/src/sys_common/wtf8.rs | 2 +- 5 files changed, 89 insertions(+), 14 deletions(-) diff --git a/library/alloc/tests/lib.rs b/library/alloc/tests/lib.rs index 16d3b36859570..f43aadeea1c70 100644 --- a/library/alloc/tests/lib.rs +++ b/library/alloc/tests/lib.rs @@ -40,6 +40,7 @@ #![feature(panic_update_hook)] #![feature(slice_flatten)] #![feature(thin_box)] +#![feature(str_internals)] use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index 273b39aa45a48..15b2ef3c4d2dc 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1,3 +1,4 @@ +use core::str::next_code_point; use std::assert_matches::assert_matches; use std::borrow::Cow; use std::cmp::Ordering::{Equal, Greater, Less}; @@ -2367,3 +2368,71 @@ fn ceil_char_boundary() { fn ceil_char_boundary_above_len_panic() { let _ = "x".ceil_char_boundary(2); } + +fn check_decoded_string<I: Iterator<Item = u8>>(mut iter: &mut I, expected: &str) { + for char in expected.chars() { + assert_eq!(Some(char as u32), unsafe { next_code_point(&mut iter) }); + } + assert_eq!(None, unsafe { next_code_point(&mut iter) }); +} + +#[test] +pub fn dirt_simple_code_point() { + unsafe { + let src = b"banana"; + let mut iter = src.iter().copied(); + for char in "banana".chars() { + assert_eq!(Some(char as u32), next_code_point(&mut iter)); + } + + let tmp = next_code_point(&mut iter); + assert_eq!(None, tmp); + } + { + let src = [ + b'd', b'a', b'i', b' ', 229, 164, 167, 232, 179, 162, 232, 128, 133,
b' ', b'k', b'e', + b'n', b'j', b'a', + ]; + + let mut iter = src.into_iter(); + + check_decoded_string(&mut iter, "dai 大賢者 kenja"); + } +} + +struct Shenanigans { + compressed: Vec<u8>, + cursor: usize, +} + +// This provides an example of a u8 iterator which can not use Item=&u8. +// A real-world case is a string stored in progmem on an AVR, which can +// not be incorporated into a unit test +impl Iterator for Shenanigans { + type Item = u8; + fn next(&mut self) -> Option<<Self as Iterator>::Item> { + let end = self.cursor + 6; + let i1 = self.cursor / 8; + let i2 = (end - 1) / 8; + if i2 >= self.compressed.len() { + return None; + } + let base64 = if i1 == i2 { + self.compressed[i1] >> (2 - self.cursor % 8) & 0x3f + } else { + 0x3f & ((self.compressed[i1] << (self.cursor % 8 - 2)) + | (self.compressed[i2] >> (10 - self.cursor % 8))) + }; + self.cursor += 6; + Some(base64 + b' ') + } +} + +#[test] +pub fn fancy_code_point() { + let mut iter = + Shenanigans { compressed: vec![142, 139, 236, 228, 10, 238, 166, 122, 52], cursor: 0 }; + for char in "CHOLY KNIGHT".chars() { + assert_eq!(Some(char as u32), unsafe { next_code_point(&mut iter) }); + } +} diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index e529bccbc7999..84e5a78136dda 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -41,7 +41,9 @@ impl<'a> Iterator for Chars<'a> { fn next(&mut self) -> Option<char> { // SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and // the resulting `ch` is a valid Unicode Scalar Value. - unsafe { next_code_point(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) } + unsafe { + next_code_point(&mut (&mut self.iter).copied()).map(|ch| char::from_u32_unchecked(ch)) + } } #[inline] @@ -81,7 +83,10 @@ impl<'a> DoubleEndedIterator for Chars<'a> { fn next_back(&mut self) -> Option<char> { // SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and // the resulting `ch` is a valid Unicode Scalar Value.
- unsafe { next_code_point_reverse(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) } + unsafe { + next_code_point_reverse(&mut (&mut self.iter).copied()) + .map(|ch| char::from_u32_unchecked(ch)) + } } } diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index 0d3dc856be577..957738a9c29b6 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -33,9 +33,9 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool { /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string #[unstable(feature = "str_internals", issue = "none")] #[inline] -pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> { +pub unsafe fn next_code_point<I: Iterator<Item = u8>>(bytes: &mut I) -> Option<u32> { // Decode UTF-8 - let x = *bytes.next()?; + let x = bytes.next()?; if x < 128 { return Some(x as u32); } @@ -46,14 +46,14 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> let init = utf8_first_byte(x, 2); // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. - let y = unsafe { *bytes.next().unwrap_unchecked() }; + let y = unsafe { bytes.next().unwrap_unchecked() }; let mut ch = utf8_acc_cont_byte(init, y); if x >= 0xE0 { // [[x y z] w] case // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. - let z = unsafe { *bytes.next().unwrap_unchecked() }; + let z = unsafe { bytes.next().unwrap_unchecked() }; let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); ch = init << 12 | y_z; if x >= 0xF0 { @@ -61,7 +61,7 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> // use only the lower 3 bits of `init` // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here.
- let w = unsafe { *bytes.next().unwrap_unchecked() }; + let w = unsafe { bytes.next().unwrap_unchecked() }; ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); } } @@ -76,12 +76,12 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> /// /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string #[inline] -pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32> +pub(super) unsafe fn next_code_point_reverse<I>(bytes: &mut I) -> Option<u32> where - I: DoubleEndedIterator<Item = &'a u8>, + I: DoubleEndedIterator<Item = u8>, { // Decode UTF-8 - let w = match *bytes.next_back()? { + let w = match bytes.next_back()? { next_byte if next_byte < 128 => return Some(next_byte as u32), back_byte => back_byte, }; @@ -91,17 +91,17 @@ where let mut ch; // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. - let z = unsafe { *bytes.next_back().unwrap_unchecked() }; + let z = unsafe { bytes.next_back().unwrap_unchecked() }; ch = utf8_first_byte(z, 2); if utf8_is_cont_byte(z) { // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. - let y = unsafe { *bytes.next_back().unwrap_unchecked() }; + let y = unsafe { bytes.next_back().unwrap_unchecked() }; ch = utf8_first_byte(y, 3); if utf8_is_cont_byte(y) { // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here.
- let x = unsafe { *bytes.next_back().unwrap_unchecked() }; + let x = unsafe { bytes.next_back().unwrap_unchecked() }; ch = utf8_first_byte(x, 4); ch = utf8_acc_cont_byte(ch, y); } diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs index 10ef6662115c1..58bd33647a956 100644 --- a/library/std/src/sys_common/wtf8.rs +++ b/library/std/src/sys_common/wtf8.rs @@ -847,7 +847,7 @@ impl<'a> Iterator for Wtf8CodePoints<'a> { #[inline] fn next(&mut self) -> Option<CodePoint> { // SAFETY: `self.bytes` has been created from a WTF-8 string - unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) } + unsafe { next_code_point(&mut (&mut self.bytes).copied()).map(|c| CodePoint { value: c }) } } #[inline]