@@ -1789,32 +1789,66 @@ pub const fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
17891789 ) ;
17901790 }
17911791 let len = len_utf8 ( code) ;
1792- match ( len, & mut * dst) {
1793- ( 1 , [ a, ..] ) => {
1794- * a = code as u8 ;
1795- }
1796- ( 2 , [ a, b, ..] ) => {
1797- * a = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
1798- * b = ( code & 0x3F ) as u8 | TAG_CONT ;
1799- }
1800- ( 3 , [ a, b, c, ..] ) => {
1801- * a = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
1802- * b = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1803- * c = ( code & 0x3F ) as u8 | TAG_CONT ;
1804- }
1805- ( 4 , [ a, b, c, d, ..] ) => {
1806- * a = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
1807- * b = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
1808- * c = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1809- * d = ( code & 0x3F ) as u8 | TAG_CONT ;
1810- }
1792+ if dst. len ( ) < len {
18111793 // FIXME(const-hack): We would prefer to have streamlined panics when formatters become const-friendly.
1812- _ => const_eval_select ( ( code, len, dst. len ( ) ) , panic_at_const, panic_at_rt) ,
1813- } ;
1794+ const_eval_select ( ( code, len, dst. len ( ) ) , panic_at_const, panic_at_rt) ;
1795+ }
1796+
1797+ // SAFETY: `dst` is checked to be at least the length needed to encode the codepoint.
1798+ unsafe { encode_utf8_raw_unchecked ( code, dst. as_mut_ptr ( ) ) } ;
1799+
18141800 // SAFETY: `<&mut [u8]>::as_mut_ptr` is guaranteed to return a valid pointer and `len` has been tested to be within bounds.
18151801 unsafe { slice:: from_raw_parts_mut ( dst. as_mut_ptr ( ) , len) }
18161802}
18171803
1804+ /// Encodes a raw u32 value as UTF-8 to the provided destination buffer.
1805+ ///
1806+ /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
1807+ /// (Creating a `char` in the surrogate range is UB.)
1808+ /// The result is valid [generalized UTF-8] but not valid UTF-8.
1809+ ///
1810+ /// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
1811+ ///
1812+ /// # Safety
1813+ ///
1814+ /// The behavior is undefined if the buffer pointed to by `dst` is not
1815+ /// large enough to hold the encoded codepoint. A buffer of length four
1816+ /// is large enough to encode any `char`.
1817+ ///
1818+ /// For a safe version of this function, see the [`encode_utf8_raw`] function.
1819+ #[ unstable( feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" ) ]
1820+ #[ doc( hidden) ]
1821+ #[ inline]
1822+ pub const unsafe fn encode_utf8_raw_unchecked ( code : u32 , dst : * mut u8 ) {
1823+ let len = len_utf8 ( code) ;
1824+ // SAFETY: The caller must guarantee that the buffer pointed to by `dst`
1825+ // is at least `len` bytes long.
1826+ unsafe {
1827+ match len {
1828+ 1 => {
1829+ * dst = code as u8 ;
1830+ }
1831+ 2 => {
1832+ * dst = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
1833+ * dst. add ( 1 ) = ( code & 0x3F ) as u8 | TAG_CONT ;
1834+ }
1835+ 3 => {
1836+ * dst = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
1837+ * dst. add ( 1 ) = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1838+ * dst. add ( 2 ) = ( code & 0x3F ) as u8 | TAG_CONT ;
1839+ }
1840+ 4 => {
1841+ * dst = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
1842+ * dst. add ( 1 ) = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
1843+ * dst. add ( 2 ) = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1844+ * dst. add ( 3 ) = ( code & 0x3F ) as u8 | TAG_CONT ;
1845+ }
1846+ // SAFETY: `char` always takes between 1 and 4 bytes to encode in UTF-8.
1847+ _ => crate :: hint:: unreachable_unchecked ( ) ,
1848+ }
1849+ }
1850+ }
1851+
18181852/// Encodes a raw `u32` value as UTF-16 into the provided `u16` buffer,
18191853/// and then returns the subslice of the buffer that contains the encoded character.
18201854///
0 commit comments