@@ -1795,39 +1795,71 @@ const fn len_utf16(code: u32) -> usize {
1795
1795
#[ inline]
1796
1796
pub const fn encode_utf8_raw ( code : u32 , dst : & mut [ u8 ] ) -> & mut [ u8 ] {
1797
1797
let len = len_utf8 ( code) ;
1798
- match ( len, & mut * dst) {
1799
- ( 1 , [ a, ..] ) => {
1800
- * a = code as u8 ;
1801
- }
1802
- ( 2 , [ a, b, ..] ) => {
1803
- * a = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
1804
- * b = ( code & 0x3F ) as u8 | TAG_CONT ;
1805
- }
1806
- ( 3 , [ a, b, c, ..] ) => {
1807
- * a = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
1808
- * b = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1809
- * c = ( code & 0x3F ) as u8 | TAG_CONT ;
1810
- }
1811
- ( 4 , [ a, b, c, d, ..] ) => {
1812
- * a = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
1813
- * b = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
1814
- * c = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1815
- * d = ( code & 0x3F ) as u8 | TAG_CONT ;
1816
- }
1817
- _ => {
1818
- const_panic ! (
1819
- "encode_utf8: buffer does not have enough bytes to encode code point" ,
1820
- "encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}" ,
1821
- code: u32 = code,
1822
- len: usize = len,
1823
- dst_len: usize = dst. len( ) ,
1824
- )
1825
- }
1826
- } ;
1798
+ if dst. len ( ) < len {
1799
+ const_panic ! (
1800
+ "encode_utf8: buffer does not have enough bytes to encode code point" ,
1801
+ "encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}" ,
1802
+ code: u32 = code,
1803
+ len: usize = len,
1804
+ dst_len: usize = dst. len( ) ,
1805
+ ) ;
1806
+ }
1807
+
1808
+ // SAFETY: `dst` is checked to be at least the length needed to encode the codepoint.
1809
+ unsafe { encode_utf8_raw_unchecked ( code, dst. as_mut_ptr ( ) ) } ;
1810
+
1827
1811
// SAFETY: `<&mut [u8]>::as_mut_ptr` is guaranteed to return a valid pointer and `len` has been tested to be within bounds.
1828
1812
unsafe { slice:: from_raw_parts_mut ( dst. as_mut_ptr ( ) , len) }
1829
1813
}
1830
1814
1815
+ /// Encodes a raw u32 value as UTF-8 to the provided destination buffer.
1816
+ ///
1817
+ /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
1818
+ /// (Creating a `char` in the surrogate range is UB.)
1819
+ /// The result is valid [generalized UTF-8] but not valid UTF-8.
1820
+ ///
1821
+ /// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
1822
+ ///
1823
+ /// # Safety
1824
+ ///
1825
+ /// The behavior is undefined if the buffer pointed to by `dst` is not
1826
+ /// large enough to hold the encoded codepoint. A buffer of length four
1827
+ /// is large enough to encode any `char`.
1828
+ ///
1829
+ /// For a safe version of this function, see the [`encode_utf8_raw`] function.
1830
+ #[ unstable( feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" ) ]
1831
+ #[ doc( hidden) ]
1832
+ #[ inline]
1833
+ pub const unsafe fn encode_utf8_raw_unchecked ( code : u32 , dst : * mut u8 ) {
1834
+ let len = len_utf8 ( code) ;
1835
+ // SAFETY: The caller must guarantee that the buffer pointed to by `dst`
1836
+ // is at least `len` bytes long.
1837
+ unsafe {
1838
+ match len {
1839
+ 1 => {
1840
+ * dst = code as u8 ;
1841
+ }
1842
+ 2 => {
1843
+ * dst = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
1844
+ * dst. add ( 1 ) = ( code & 0x3F ) as u8 | TAG_CONT ;
1845
+ }
1846
+ 3 => {
1847
+ * dst = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
1848
+ * dst. add ( 1 ) = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1849
+ * dst. add ( 2 ) = ( code & 0x3F ) as u8 | TAG_CONT ;
1850
+ }
1851
+ 4 => {
1852
+ * dst = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
1853
+ * dst. add ( 1 ) = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
1854
+ * dst. add ( 2 ) = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1855
+ * dst. add ( 3 ) = ( code & 0x3F ) as u8 | TAG_CONT ;
1856
+ }
1857
+ // SAFETY: `char` always takes between 1 and 4 bytes to encode in UTF-8.
1858
+ _ => crate :: hint:: unreachable_unchecked ( ) ,
1859
+ }
1860
+ }
1861
+ }
1862
+
1831
1863
/// Encodes a raw `u32` value as UTF-16 into the provided `u16` buffer,
1832
1864
/// and then returns the subslice of the buffer that contains the encoded character.
1833
1865
///
0 commit comments