|
5 | 5 | use WP_HTML_Span; |
6 | 6 | use WP_HTML_Text_Replacement; |
7 | 7 |
|
8 | | -use function WordPress\Encoding\utf8_codepoint_at; |
| 8 | +use function WordPress\Encoding\compat\_wp_scan_utf8; |
| 9 | +use function WordPress\Encoding\utf8_ord; |
9 | 10 |
|
10 | 11 | /** |
11 | 12 | * XML API: XMLProcessor class |
|
17 | 18 | * It implements a subset of the XML 1.0 specification (https://www.w3.org/TR/xml/) |
18 | 19 | * and supports XML documents with the following characteristics: |
19 | 20 | * |
20 | | - * * XML 1.0 |
21 | | - * * Well-formed |
22 | | - * * UTF-8 encoded |
23 | | - * * Not standalone (so can use external entities) |
24 | | - * * No DTD, DOCTYPE, ATTLIST, ENTITY, or conditional sections (will fail on them) |
| 21 | + * – XML 1.0 |
| 22 | + * – Well-formed |
| 23 | + * – UTF-8 encoded |
| 24 | + * – Not standalone (so can use external entities) |
| 25 | + * – No DTD, DOCTYPE, ATTLIST, ENTITY, or conditional sections (will fail on them) |
| 26 | + * |
| 27 | + * XML 1.1 is explicitly not a design goal here. Version 1.1 is |
| 28 | + * a more complex specification and is not as widely supported. |
25 | 29 | * |
26 | 30 | * ### Possible future direction for this module |
27 | 31 | * |
|
41 | 45 | * * <!NOTATION, see https://www.w3.org/TR/xml/#sec-entity-decl |
42 | 46 | * * Conditional sections, see https://www.w3.org/TR/xml/#sec-condition-sect |
43 | 47 | * |
44 | | - * @TODO: Support XML 1.1. |
45 | | - * |
46 | | - * @TODO: Evaluate the performance of utf8_codepoint_at() against using the mbstring |
47 | | - * extension. If mbstring is faster, then use it whenever it's available with |
48 | | - * utf8_codepoint_at() as a fallback. |
49 | | - * |
50 | 48 | * @package WordPress |
51 | 49 | * @subpackage HTML-API |
52 | 50 | * @since WP_VERSION |
@@ -1198,8 +1196,8 @@ protected function parse_next_token() { |
1198 | 1196 | /** |
1199 | 1197 | * Compute fully qualified attributes and assert: |
1200 | 1198 | * |
1201 | | - * * All attributes have valid namespaces. |
1202 | | - * * No two attributes have the same (local name, namespace) pair. |
| 1199 | + * – All attributes have valid namespaces. |
| 1200 | + * – No two attributes have the same (local name, namespace) pair. |
1203 | 1201 | * |
1204 | 1202 | * @see https://www.w3.org/TR/2006/REC-xml-names11-20060816/#uniqAttrs |
1205 | 1203 | */ |
@@ -1690,8 +1688,8 @@ private function parse_next_tag() { |
1690 | 1688 | * names. |
1691 | 1689 | * |
1692 | 1690 | * Reference: |
1693 | | - * * https://www.w3.org/TR/xml/#NT-STag |
1694 | | - * * https://www.w3.org/TR/xml/#NT-Name |
| 1691 | + * – https://www.w3.org/TR/xml/#NT-STag |
| 1692 | + * – https://www.w3.org/TR/xml/#NT-Name |
1695 | 1693 | */ |
1696 | 1694 | $tag_name_length = $this->parse_name( $at + 1 ); |
1697 | 1695 | if ( false === $tag_name_length ) { |
@@ -2328,48 +2326,100 @@ private function skip_whitespace() { |
2328 | 2326 | * @return int |
2329 | 2327 | */ |
2330 | 2328 | private function parse_name( $offset ) { |
2331 | | - static $i = 0; |
2332 | 2329 | $name_byte_length = 0; |
| 2330 | + $at = $offset; |
| 2331 | + |
| 2332 | + // Fast path: consume any ASCII NameStartChar bytes. |
| 2333 | + $name_byte_length += strspn( |
| 2334 | + $this->xml, |
| 2335 | + ':ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz', |
| 2336 | + $offset + $name_byte_length, |
| 2337 | + 1 |
| 2338 | + ); |
| 2339 | + |
2333 | 2340 | while ( true ) { |
2334 | 2341 | /** |
2335 | 2342 | * Parse the next unicode codepoint. |
2336 | 2343 | * |
2337 | | - * We use a custom UTF-8 decoder here. No other method |
2338 | | - * is reliable and available enough to depend on it in |
2339 | | - * WordPress core: |
| 2344 | + * We use the `_wp_scan_utf8` UTF-8 decoder introduced in WordPress 6.9. No other method |
| 2345 | + * is reliable and available enough to depend on it in WordPress core: |
2340 | 2346 | * |
2341 | | - * * mb_ord() – is not available on all hosts. |
2342 | | - * * iconv_substr() – is not available on all hosts. |
2343 | | - * * preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input |
| 2347 | + * – mb_ord() – is available on 99.5% or more of hosts, but not on all hosts. |
| 2348 | + * – iconv_substr() – is not available on all hosts. |
| 2349 | + * – preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input |
2344 | 2350 | * contains an incomplete UTF-8 byte sequence – even |
2345 | 2351 | * when that sequence comes after a valid match. This |
2346 | 2352 | * failure mode cannot be reproduced with just any string. |
2347 | 2353 | * The runtime must be in a specific state. It's unclear |
2348 | 2354 | * how to reliably reproduce this failure mode in a |
2349 | 2355 | * unit test. |
2350 | 2356 | * |
2351 | | - * Performance-wise, character-by-character processing via utf8_codepoint_at |
2352 | | - * is still much faster than relying on preg_match(). The mbstring extension |
2353 | | - * is likely faster. It would be interesting to evaluate the performance |
2354 | | - * and prefer mbstring whenever it's available. |
| 2357 | + * Performance-wise, character-by-character processing via _wp_scan_utf8 |
| 2358 | + * is pretty slow. The ASCII fast path below enables skipping most of the |
| 2359 | + * UTF-8 decoder calls. |
| 2360 | + * |
| 2361 | + * If the UTF-8 decoder performance ever becomes a bottleneck, there are a |
| 2362 | + * few ways to significantly improve it: |
| 2363 | + * |
| 2364 | + * – Call a native grapheme_ function when available. |
| 2365 | + * – Introduce a custom UTF-8 decoder optimized for codepoint-by-codepoint processing. |
| 2366 | + * It could be the streaming version of the UTF-8 decoder, such as `_wp_iterate_utf8`, |
| 2367 | + * that avoids the repeated strspn() calls. Alternatively, the older `utf8_codepoint_at` |
| 2368 | + * function could be restored if its codepoint-by-codepoint decoding performance is |
| 2369 | + * better than that of _wp_scan_utf8. |
| 2370 | + */ |
| 2371 | + |
| 2372 | + /** |
| 2373 | + * The ASCII speedup includes all ASCII NameStartChar, which are also valid |
| 2374 | + * NameChar, making it possible to quickly scan past these bytes without |
| 2375 | + * further processing. |
| 2376 | + */ |
| 2377 | + $name_byte_length += strspn( $this->xml, ":ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz-.0123456789\u{B7}", $offset + $name_byte_length ); |
| 2378 | + |
| 2379 | + /* |
| 2380 | + * Quickly check if the next byte is an ASCII byte that is not allowed in XML |
| 2381 | + * NameStartChar. If so, we can break out of the loop without calling the UTF-8 decoder. |
| 2382 | + * |
| 2383 | + * Even though this does not seem to be different from the ASCII fast path in the |
| 2384 | + * _wp_scan_utf8 function, skipping that function call still provides a ~50% speed |
| 2385 | + * improvement. |
2355 | 2386 | */ |
2356 | | - $codepoint = utf8_codepoint_at( |
| 2387 | + $is_non_name_ascii_byte = strspn( |
2357 | 2388 | $this->xml, |
| 2389 | + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" . |
| 2390 | + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" . |
| 2391 | + " !\"#$%&'()*+,./;<=>?@[\\]^`{|}~\x7f", |
2358 | 2392 | $offset + $name_byte_length, |
2359 | | - $bytes_parsed |
2360 | | - ); |
2361 | | - if ( |
2362 | | - // Byte sequence is not a valid UTF-8 codepoint. |
2363 | | - ( 0xFFFD === $codepoint && 0 === $bytes_parsed ) || |
2364 | | - // No codepoint at the given offset. |
2365 | | - null === $codepoint || |
2366 | | - // The codepoint is not a valid part of an XML NameChar or NameStartChar. |
2367 | | - ! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length ) |
2368 | | - ) { |
| 2393 | + 1 |
| 2394 | + ) > 0; |
| 2395 | + if ( $is_non_name_ascii_byte ) { |
| 2396 | + break; |
| 2397 | + } |
| 2398 | + |
| 2399 | + // EOF. |
| 2400 | + if ( $offset + $name_byte_length >= strlen( $this->xml ) ) { |
| 2401 | + break; |
| 2402 | + } |
| 2403 | + |
| 2404 | + // The next byte sequence is, very likely, a UTF-8 codepoint. Let's |
| 2405 | + // try to decode it. |
| 2406 | + $at = $offset + $name_byte_length; |
| 2407 | + $new_at = $at; |
| 2408 | + $invalid_length = 0; |
| 2409 | + if ( 1 !== _wp_scan_utf8( $this->xml, $new_at, $invalid_length, null, 1 ) ) { |
| 2410 | + // EOF or invalid utf-8 byte sequence. |
| 2411 | + break; |
| 2412 | + } |
| 2413 | + |
| 2414 | + $codepoint_byte_length = $new_at - $at; |
| 2415 | + $codepoint = utf8_ord( substr( $this->xml, $at, $codepoint_byte_length ) ); |
| 2416 | + |
| 2417 | + // The codepoint is not a valid part of an XML NameChar or NameStartChar. |
| 2418 | + if ( ! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length ) ) { |
2369 | 2419 | break; |
2370 | 2420 | } |
2371 | | - $codepoint = null; |
2372 | | - $name_byte_length += $bytes_parsed; |
| 2421 | + $name_byte_length += $codepoint_byte_length; |
| 2422 | + $at = $new_at; |
2373 | 2423 | } |
2374 | 2424 |
|
2375 | 2425 | return $name_byte_length; |
|
0 commit comments