|
5 | 5 | use WP_HTML_Span; |
6 | 6 | use WP_HTML_Text_Replacement; |
7 | 7 |
|
8 | | -use function WordPress\Encoding\utf8_codepoint_at; |
| 8 | +use function WordPress\Encoding\compat\_wp_scan_utf8; |
| 9 | +use function WordPress\Encoding\utf8_ord; |
9 | 10 |
|
10 | 11 | /** |
11 | 12 | * XML API: XMLProcessor class |
|
17 | 18 | * It implements a subset of the XML 1.0 specification (https://www.w3.org/TR/xml/) |
18 | 19 | * and supports XML documents with the following characteristics: |
19 | 20 | * |
20 | | - * * XML 1.0 |
21 | | - * * Well-formed |
22 | | - * * UTF-8 encoded |
23 | | - * * Not standalone (so can use external entities) |
24 | | - * * No external DTD subset expansion (external entities may exist but are not fetched). |
| 21 | + * – XML 1.0 |
| 22 | + * – Well-formed |
| 23 | + * – UTF-8 encoded |
| 24 | + * – Not standalone (so can use external entities) |
| 25 | + * – No external DTD subset expansion (external entities may exist but are not fetched). |
| 26 | + * |
| 27 | + * XML 1.1 is explicitly not a design goal here. Version 1.1 is a |
| 28 | + * more complex specification and is not as widely supported. |
25 | 29 | * |
26 | 30 | * ### Possible future direction for this module |
27 | 31 | * |
|
35 | 39 | * numbers, indexes, and other debugging info. |
36 | 40 | * |
37 | 41 | * |
38 | | - * @TODO: Support XML 1.1. |
39 | | - * |
40 | | - * @TODO: Evaluate the performance of utf8_codepoint_at() against using the mbstring |
41 | | - * extension. If mbstring is faster, then use it whenever it's available with |
42 | | - * utf8_codepoint_at() as a fallback. |
43 | | - * |
44 | 42 | * @package WordPress |
45 | 43 | * @subpackage HTML-API |
46 | 44 | * @since WP_VERSION |
@@ -1192,8 +1190,8 @@ protected function parse_next_token() { |
1192 | 1190 | /** |
1193 | 1191 | * Compute fully qualified attributes and assert: |
1194 | 1192 | * |
1195 | | - * * All attributes have valid namespaces. |
1196 | | - * * No two attributes have the same (local name, namespace) pair. |
| 1193 | + * – All attributes have valid namespaces. |
| 1194 | + * – No two attributes have the same (local name, namespace) pair. |
1197 | 1195 | * |
1198 | 1196 | * @see https://www.w3.org/TR/2006/REC-xml-names11-20060816/#uniqAttrs |
1199 | 1197 | */ |
@@ -1684,8 +1682,8 @@ private function parse_next_tag() { |
1684 | 1682 | * names. |
1685 | 1683 | * |
1686 | 1684 | * Reference: |
1687 | | - * * https://www.w3.org/TR/xml/#NT-STag |
1688 | | - * * https://www.w3.org/TR/xml/#NT-Name |
| 1685 | + * – https://www.w3.org/TR/xml/#NT-STag |
| 1686 | + * – https://www.w3.org/TR/xml/#NT-Name |
1689 | 1687 | */ |
1690 | 1688 | $tag_name_length = $this->parse_name( $at + 1 ); |
1691 | 1689 | if ( false === $tag_name_length ) { |
@@ -2762,48 +2760,100 @@ private function skip_parameter_entity_reference( &$offset ) { |
2762 | 2760 | * @return int |
2763 | 2761 | */ |
2764 | 2762 | private function parse_name( $offset ) { |
2765 | | - static $i = 0; |
2766 | 2763 | $name_byte_length = 0; |
| 2764 | + $at = $offset; |
| 2765 | + |
| 2766 | + // Fast path: consume at most one leading ASCII NameStartChar byte. |
| 2767 | + $name_byte_length += strspn( |
| 2768 | + $this->xml, |
| 2769 | + ':ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz', |
| 2770 | + $offset + $name_byte_length, |
| 2771 | + 1 |
| 2772 | + ); |
| 2773 | + |
2767 | 2774 | while ( true ) { |
2768 | 2775 | /** |
2769 | 2776 | * Parse the next unicode codepoint. |
2770 | 2777 | * |
2771 | | - * We use a custom UTF-8 decoder here. No other method |
2772 | | - * is reliable and available enough to depend on it in |
2773 | | - * WordPress core: |
| 2778 | + * We use the `_wp_scan_utf8` UTF-8 decoder introduced in WordPress 6.9. No other method |
| 2779 | + * is reliable and available enough to depend on it in WordPress core: |
2774 | 2780 | * |
2775 | | - * * mb_ord() – is not available on all hosts. |
2776 | | - * * iconv_substr() – is not available on all hosts. |
2777 | | - * * preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input |
| 2781 | + * – mb_ord() – is available on 99.5%+ of hosts, but not on all hosts. |
| 2782 | + * – iconv_substr() – is not available on all hosts. |
| 2783 | + * – preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input |
2778 | 2784 | * contains an incomplete UTF-8 byte sequence – even |
2779 | 2785 | * when that sequence comes after a valid match. This |
2780 | 2786 | * failure mode cannot be reproduced with just any string. |
2781 | 2787 | * The runtime must be in a specific state. It's unclear |
2782 | 2788 | * how to reliably reproduce this failure mode in a |
2783 | 2789 | * unit test. |
2784 | 2790 | * |
2785 | | - * Performance-wise, character-by-character processing via utf8_codepoint_at |
2786 | | - * is still much faster than relying on preg_match(). The mbstring extension |
2787 | | - * is likely faster. It would be interesting to evaluate the performance |
2788 | | - * and prefer mbstring whenever it's available. |
| 2791 | + * Performance-wise, character-by-character processing via _wp_scan_utf8 |
| 2792 | + * is pretty slow. The ASCII fast path below enables skipping most of the |
| 2793 | + * UTF-8 decoder calls. |
| 2794 | + * |
| 2795 | + * If the UTF-8 decoder performance ever becomes a bottleneck, there are a |
| 2796 | + * few ways to significantly improve it: |
| 2797 | + * |
| 2798 | + * – Call a native grapheme_ function when available. |
| 2799 | + * – Introduce a custom UTF-8 decoder optimized for codepoint-by-codepoint processing. |
| 2800 | + * It could be the streaming version of the UTF-8 decoder, such as `_wp_iterate_utf8`, |
| 2801 | + * that avoids the repeated strspn() calls. Alternatively, the older `utf8_codepoint_at` |
| 2802 | + * function could be restored if its codepoint-by-codepoint decoding performance is |
| 2803 | + * better than that of `_wp_scan_utf8`. |
| 2804 | + */ |
| 2805 | + |
| 2806 | + /** |
| 2807 | + * The ASCII speedup includes all ASCII NameStartChar, which are also valid |
| 2808 | + * NameChar, making it possible to quickly scan past these bytes without |
| 2809 | + * further processing. |
| 2810 | + */ |
| 2811 | + $name_byte_length += strspn( $this->xml, ":ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz-.0123456789\u{B7}", $offset + $name_byte_length ); |
| 2812 | + |
| 2813 | + /* |
| 2814 | + * Quickly check if the next byte is an ASCII byte that is not allowed in XML |
| 2815 | + * NameStartChar. If so, we can break out of the loop without calling the UTF-8 decoder. |
| 2816 | + * |
| 2817 | + * Even though this does not seem to be different from the ASCII fast path in the |
| 2818 | + * _wp_scan_utf8 function, skipping that function call still provides a ~50% speed |
| 2819 | + * improvement. |
2789 | 2820 | */ |
2790 | | - $codepoint = utf8_codepoint_at( |
| 2821 | + $is_non_name_ascii_byte = strspn( |
2791 | 2822 | $this->xml, |
| 2823 | + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" . |
| 2824 | + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" . |
| 2825 | + " !\"#$%&'()*+,./;<=>?@[\\]^`{|}~\x7f", |
2792 | 2826 | $offset + $name_byte_length, |
2793 | | - $bytes_parsed |
2794 | | - ); |
2795 | | - if ( |
2796 | | - // Byte sequence is not a valid UTF-8 codepoint. |
2797 | | - ( 0xFFFD === $codepoint && 0 === $bytes_parsed ) || |
2798 | | - // No codepoint at the given offset. |
2799 | | - null === $codepoint || |
2800 | | - // The codepoint is not a valid part of an XML NameChar or NameStartChar. |
2801 | | - ! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length ) |
2802 | | - ) { |
| 2827 | + 1 |
| 2828 | + ) > 0; |
| 2829 | + if ( $is_non_name_ascii_byte ) { |
| 2830 | + break; |
| 2831 | + } |
| 2832 | + |
| 2833 | + // EOF. |
| 2834 | + if ( $offset + $name_byte_length >= strlen( $this->xml ) ) { |
| 2835 | + break; |
| 2836 | + } |
| 2837 | + |
| 2838 | + // The next byte sequence is, very likely, a UTF-8 codepoint. Let's |
| 2839 | + // try to decode it. |
| 2840 | + $at = $offset + $name_byte_length; |
| 2841 | + $new_at = $at; |
| 2842 | + $invalid_length = 0; |
| 2843 | + if ( 1 !== _wp_scan_utf8( $this->xml, $new_at, $invalid_length, null, 1 ) ) { |
| 2844 | + // EOF or invalid utf-8 byte sequence. |
| 2845 | + break; |
| 2846 | + } |
| 2847 | + |
| 2848 | + $codepoint_byte_length = $new_at - $at; |
| 2849 | + $codepoint = utf8_ord( substr( $this->xml, $at, $codepoint_byte_length ) ); |
| 2850 | + |
| 2851 | + // The codepoint is not a valid part of an XML NameChar or NameStartChar. |
| 2852 | + if ( ! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length ) ) { |
2803 | 2853 | break; |
2804 | 2854 | } |
2805 | | - $codepoint = null; |
2806 | | - $name_byte_length += $bytes_parsed; |
| 2855 | + $name_byte_length += $codepoint_byte_length; |
| 2856 | + $at = $new_at; |
2807 | 2857 | } |
2808 | 2858 |
|
2809 | 2859 | return $name_byte_length; |
|
0 commit comments