rust-lang · mutantbob · Apr 13, 2022
diff --git a/library/alloc/tests/lib.rs b/library/alloc/tests/lib.rs
@@ -40,6 +40,7 @@
 #![feature(panic_update_hook)]
 #![feature(slice_flatten)]
 #![feature(thin_box)]
+#![feature(str_internals)]
 
 use std::collections::hash_map::DefaultHasher;
 use std::hash::{Hash, Hasher};

diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs
@@ -1,3 +1,4 @@
+use core::str::next_code_point;
 use std::assert_matches::assert_matches;
 use std::borrow::Cow;
 use std::cmp::Ordering::{Equal, Greater, Less};
@@ -2367,3 +2368,71 @@ fn ceil_char_boundary() {
 fn ceil_char_boundary_above_len_panic() {
     let _ = "x".ceil_char_boundary(2);
 }
+
+/// Asserts `iter` (which must yield valid UTF-8) decodes to exactly `expected`, then ends.
+fn check_decoded_string<I: Iterator<Item = u8>>(iter: &mut I, expected: &str) {
+    let mut decode = || unsafe { next_code_point(&mut *iter) };
+    expected.chars().for_each(|ch| assert_eq!(Some(ch as u32), decode()));
+    assert_eq!(None, decode());
+}
+
+#[test]
+pub fn dirt_simple_code_point() {
+    // Pure ASCII: each byte is its own code point. Decoding goes through
+    // `check_decoded_string`, which keeps the `unsafe` calls in one place
+    // and also verifies that the iterator is exhausted afterwards.
+    {
+        let src = b"banana";
+        let mut iter = src.iter().copied();
+        check_decoded_string(&mut iter, "banana");
+    }
+
+    // ASCII mixed with three-byte sequences; the array is consumed by
+    // value, so the iterator yields `u8` rather than `&u8`.
+    {
+        let src = [
+            b'd', b'a', b'i', b' ', 229, 164, 167, 232, 179, 162, 232, 128, 133, b' ', b'k', b'e',
+            b'n', b'j', b'a',
+        ];
+
+        let mut iter = src.into_iter();
+        check_decoded_string(&mut iter, "dai 大賢者 kenja");
+    }
+}
+
+struct Shenanigans {
+    compressed: Vec<u8>, // packed 6-bit symbols, most significant bit first
+    cursor: usize,       // bit offset of the next symbol in `compressed`
+}
+
+// An example of a `u8` iterator that cannot use `Item = &u8`, because each
+// byte is computed on the fly. A real-world case is a string stored in
+// progmem on an AVR, which cannot be incorporated into a unit test.
+impl Iterator for Shenanigans {
+    type Item = u8;
+    fn next(&mut self) -> Option<Self::Item> {
+        let offset = self.cursor % 8;
+        let first = self.cursor / 8;
+        let last = (self.cursor + 5) / 8;
+        // `last >= first`, so one check covers both indexing operations below.
+        let lo = *self.compressed.get(last)?;
+        let symbol = if first == last {
+            // All six bits sit in one byte; this only happens for `offset <= 2`.
+            (lo >> (2 - offset)) & 0x3f
+        } else {
+            // The symbol straddles two bytes (`offset >= 3`): splice both parts.
+            0x3f & ((self.compressed[first] << (offset - 2)) | (lo >> (10 - offset)))
+        };
+        self.cursor += 6;
+        Some(symbol + b' ')
+    }
+}
+
+#[test]
+pub fn fancy_code_point() {
+    // Nine packed bytes hold twelve 6-bit symbols, one per expected character.
+    let mut iter =
+        Shenanigans { compressed: vec![142, 139, 236, 228, 10, 238, 166, 122, 52], cursor: 0 };
+    // The helper also checks that the iterator is exhausted after the last character.
+    check_decoded_string(&mut iter, "CHOLY KNIGHT");
+}
diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs
@@ -41,7 +41,9 @@ impl<'a> Iterator for Chars<'a> {
     fn next(&mut self) -> Option<char> {
         // SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and
         // the resulting `ch` is a valid Unicode Scalar Value.
-        unsafe { next_code_point(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) }
+        unsafe {
+            next_code_point(&mut (&mut self.iter).copied()).map(|ch| char::from_u32_unchecked(ch))
+        }
     }
 
     #[inline]
@@ -81,7 +83,10 @@ impl<'a> DoubleEndedIterator for Chars<'a> {
     fn next_back(&mut self) -> Option<char> {
         // SAFETY: `str` invariant says `self.iter` is a valid UTF-8 string and
         // the resulting `ch` is a valid Unicode Scalar Value.
-        unsafe { next_code_point_reverse(&mut self.iter).map(|ch| char::from_u32_unchecked(ch)) }
+        unsafe {
+            next_code_point_reverse(&mut (&mut self.iter).copied())
+                .map(|ch| char::from_u32_unchecked(ch))
+        }
     }
 }
 

diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs
@@ -33,9 +33,9 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
 /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
 #[unstable(feature = "str_internals", issue = "none")]
 #[inline]
-pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
+pub unsafe fn next_code_point<I: Iterator<Item = u8>>(bytes: &mut I) -> Option<u32> {
     // Decode UTF-8
-    let x = *bytes.next()?;
+    let x = bytes.next()?;
     if x < 128 {
         return Some(x as u32);
     }
@@ -46,22 +46,22 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
     let init = utf8_first_byte(x, 2);
     // SAFETY: `bytes` produces an UTF-8-like string,
     // so the iterator must produce a value here.
-    let y = unsafe { *bytes.next().unwrap_unchecked() };
+    let y = unsafe { bytes.next().unwrap_unchecked() };
     let mut ch = utf8_acc_cont_byte(init, y);
     if x >= 0xE0 {
         // [[x y z] w] case
         // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
         // SAFETY: `bytes` produces an UTF-8-like string,
         // so the iterator must produce a value here.
-        let z = unsafe { *bytes.next().unwrap_unchecked() };
+        let z = unsafe { bytes.next().unwrap_unchecked() };
         let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
         ch = init << 12 | y_z;
         if x >= 0xF0 {
             // [x y z w] case
             // use only the lower 3 bits of `init`
             // SAFETY: `bytes` produces an UTF-8-like string,
             // so the iterator must produce a value here.
-            let w = unsafe { *bytes.next().unwrap_unchecked() };
+            let w = unsafe { bytes.next().unwrap_unchecked() };
             ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
         }
     }
@@ -76,12 +76,12 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
 ///
 /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
 #[inline]
-pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
+pub(super) unsafe fn next_code_point_reverse<I>(bytes: &mut I) -> Option<u32>
 where
-    I: DoubleEndedIterator<Item = &'a u8>,
+    I: DoubleEndedIterator<Item = u8>,
 {
     // Decode UTF-8
-    let w = match *bytes.next_back()? {
+    let w = match bytes.next_back()? {
         next_byte if next_byte < 128 => return Some(next_byte as u32),
         back_byte => back_byte,
     };
@@ -91,17 +91,17 @@ where
     let mut ch;
     // SAFETY: `bytes` produces an UTF-8-like string,
     // so the iterator must produce a value here.
-    let z = unsafe { *bytes.next_back().unwrap_unchecked() };
+    let z = unsafe { bytes.next_back().unwrap_unchecked() };
     ch = utf8_first_byte(z, 2);
     if utf8_is_cont_byte(z) {
         // SAFETY: `bytes` produces an UTF-8-like string,
         // so the iterator must produce a value here.
-        let y = unsafe { *bytes.next_back().unwrap_unchecked() };
+        let y = unsafe { bytes.next_back().unwrap_unchecked() };
         ch = utf8_first_byte(y, 3);
         if utf8_is_cont_byte(y) {
             // SAFETY: `bytes` produces an UTF-8-like string,
             // so the iterator must produce a value here.
-            let x = unsafe { *bytes.next_back().unwrap_unchecked() };
+            let x = unsafe { bytes.next_back().unwrap_unchecked() };
             ch = utf8_first_byte(x, 4);
             ch = utf8_acc_cont_byte(ch, y);
         }

diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs
@@ -847,7 +847,7 @@ impl<'a> Iterator for Wtf8CodePoints<'a> {
     #[inline]
     fn next(&mut self) -> Option<CodePoint> {
         // SAFETY: `self.bytes` has been created from a WTF-8 string
-        unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) }
+        unsafe { next_code_point(&mut (&mut self.bytes).copied()).map(|c| CodePoint { value: c }) }
     }
 
     #[inline]