@@ -26,18 +26,21 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
26
26
}
27
27
28
28
/// Reads the next code point out of a byte iterator (assuming a
29
- /// UTF-8-like encoding).
29
+ /// UTF-8-like encoding) and returns it along with its width in bytes.
30
30
///
31
31
/// # Safety
32
32
///
33
33
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
34
34
#[ unstable( feature = "str_internals" , issue = "none" ) ]
35
35
#[ inline]
36
- pub unsafe fn next_code_point < ' a , I : Iterator < Item = & ' a u8 > > ( bytes : & mut I ) -> Option < u32 > {
36
+ #[ allow( dead_code) ]
37
+ pub unsafe fn next_code_point_with_width < ' a , I : Iterator < Item = & ' a u8 > > (
38
+ bytes : & mut I ,
39
+ ) -> Option < ( u32 , usize ) > {
37
40
// Decode UTF-8
38
41
let x = * bytes. next ( ) ?;
39
42
if x < 128 {
40
- return Some ( x as u32 ) ;
43
+ return Some ( ( x as u32 , 1 ) ) ;
41
44
}
42
45
43
46
// Multibyte case follows
@@ -47,13 +50,15 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
47
50
// SAFETY: `bytes` produces an UTF-8-like string,
48
51
// so the iterator must produce a value here.
49
52
let y = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
53
+ let mut width = 2 ;
50
54
let mut ch = utf8_acc_cont_byte ( init, y) ;
51
55
if x >= 0xE0 {
52
56
// [[x y z] w] case
53
57
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
54
58
// SAFETY: `bytes` produces an UTF-8-like string,
55
59
// so the iterator must produce a value here.
56
60
let z = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
61
+ width = 3 ;
57
62
let y_z = utf8_acc_cont_byte ( ( y & CONT_MASK ) as u32 , z) ;
58
63
ch = init << 12 | y_z;
59
64
if x >= 0xF0 {
@@ -62,11 +67,25 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
62
67
// SAFETY: `bytes` produces an UTF-8-like string,
63
68
// so the iterator must produce a value here.
64
69
let w = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
70
+ width = 4 ;
65
71
ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ( y_z, w) ;
66
72
}
67
73
}
68
74
69
- Some ( ch)
75
+ Some ( ( ch, width) )
76
+ }
77
+
78
+ /// Reads the next code point out of a byte iterator (assuming a
79
+ /// UTF-8-like encoding).
80
+ ///
81
+ /// # Safety
82
+ ///
83
+ /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
84
+ #[ unstable( feature = "str_internals" , issue = "none" ) ]
85
+ #[ inline]
86
+ pub unsafe fn next_code_point < ' a , I : Iterator < Item = & ' a u8 > > ( bytes : & mut I ) -> Option < u32 > {
87
+ // SAFETY: same call condition
88
+ Some ( unsafe { next_code_point_with_width ( bytes) } ?. 0 )
70
89
}
71
90
72
91
/// Reads the last code point out of a byte iterator (assuming a
0 commit comments