Skip to content

Commit 7759434

Browse files
authored
Merge branch 'trunk' into xml-skip-dtd
2 parents 9ffdf1a + 7241121 commit 7759434

File tree

2 files changed

+92
-42
lines changed

2 files changed

+92
-42
lines changed

components/DataLiberation/URL/class-cssprocessor.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22

33
namespace WordPress\DataLiberation\URL;
44

5-
use function WordPress\Encoding\utf8_codepoint_at;
65
use function WordPress\Encoding\codepoint_to_utf8_bytes;
76
use function WordPress\Encoding\compat\_wp_scan_utf8;
7+
use function WordPress\Encoding\utf8_ord;
88
use function WordPress\Encoding\wp_scrub_utf8;
99

1010
/**
@@ -1506,7 +1506,7 @@ private function consume_ident_start_codepoint( $at ): int {
15061506
}
15071507

15081508
$codepoint_byte_length = $new_at - $at;
1509-
$codepoint = utf8_codepoint_at( $this->css, $at );
1509+
$codepoint = utf8_ord( substr( $this->css, $at, $codepoint_byte_length ) );
15101510
if ( null !== $codepoint && $codepoint >= 0x80 ) {
15111511
return $codepoint_byte_length;
15121512
}

components/XML/class-xmlprocessor.php

Lines changed: 90 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
use WP_HTML_Span;
66
use WP_HTML_Text_Replacement;
77

8-
use function WordPress\Encoding\utf8_codepoint_at;
8+
use function WordPress\Encoding\compat\_wp_scan_utf8;
9+
use function WordPress\Encoding\utf8_ord;
910

1011
/**
1112
* XML API: XMLProcessor class
@@ -17,11 +18,14 @@
1718
* It implements a subset of the XML 1.0 specification (https://www.w3.org/TR/xml/)
1819
* and supports XML documents with the following characteristics:
1920
*
20-
* * XML 1.0
21-
* * Well-formed
22-
* * UTF-8 encoded
23-
* * Not standalone (so can use external entities)
24-
* * No external DTD subset expansion (external entities may exist but are not fetched).
21+
* – XML 1.0
22+
* – Well-formed
23+
* – UTF-8 encoded
24+
* – Not standalone (so can use external entities)
25+
* – No external DTD subset expansion (external entities may exist but are not fetched).
26+
*
27+
* XML 1.1 is explicitly not a design goal here. Version 1.1 is
28+
* a more complex specification and is not as widely supported.
2529
*
2630
* ### Possible future direction for this module
2731
*
@@ -35,12 +39,6 @@
3539
* numbers, indexes, and other debugging info.
3640
*
3741
*
38-
* @TODO: Support XML 1.1.
39-
*
40-
* @TODO: Evaluate the performance of utf8_codepoint_at() against using the mbstring
41-
* extension. If mbstring is faster, then use it whenever it's available with
42-
* utf8_codepoint_at() as a fallback.
43-
*
4442
* @package WordPress
4543
* @subpackage HTML-API
4644
* @since WP_VERSION
@@ -1192,8 +1190,8 @@ protected function parse_next_token() {
11921190
/**
11931191
* Compute fully qualified attributes and assert:
11941192
*
1195-
* * All attributes have valid namespaces.
1196-
* * No two attributes have the same (local name, namespace) pair.
1193+
* All attributes have valid namespaces.
1194+
* No two attributes have the same (local name, namespace) pair.
11971195
*
11981196
* @see https://www.w3.org/TR/2006/REC-xml-names11-20060816/#uniqAttrs
11991197
*/
@@ -1684,8 +1682,8 @@ private function parse_next_tag() {
16841682
* names.
16851683
*
16861684
* Reference:
1687-
* * https://www.w3.org/TR/xml/#NT-STag
1688-
* * https://www.w3.org/TR/xml/#NT-Name
1685+
* https://www.w3.org/TR/xml/#NT-STag
1686+
* https://www.w3.org/TR/xml/#NT-Name
16891687
*/
16901688
$tag_name_length = $this->parse_name( $at + 1 );
16911689
if ( false === $tag_name_length ) {
@@ -2762,48 +2760,100 @@ private function skip_parameter_entity_reference( &$offset ) {
27622760
* @return int
27632761
*/
27642762
private function parse_name( $offset ) {
2765-
static $i = 0;
27662763
$name_byte_length = 0;
2764+
$at = $offset;
2765+
2766+
// Fast path: consume any ASCII NameStartChar bytes.
2767+
$name_byte_length += strspn(
2768+
$this->xml,
2769+
':ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz',
2770+
$offset + $name_byte_length,
2771+
1
2772+
);
2773+
27672774
while ( true ) {
27682775
/**
27692776
* Parse the next unicode codepoint.
27702777
*
2771-
* We use a custom UTF-8 decoder here. No other method
2772-
* is reliable and available enough to depend on it in
2773-
* WordPress core:
2778+
* We use the `_wp_scan_utf8` UTF-8 decoder introduced in WordPress 6.9. No other method
2779+
* is reliable and available enough to depend on it in WordPress core:
27742780
*
2775-
* * mb_ord() – is not available on all hosts.
2776-
* * iconv_substr() – is not available on all hosts.
2777-
* * preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input
2781+
* mb_ord() – is available on 99.5%+ of hosts, but not on all hosts.
2782+
* iconv_substr() – is not available on all hosts.
2783+
* preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input
27782784
* contains an incomplete UTF-8 byte sequence – even
27792785
* when that sequence comes after a valid match. This
27802786
* failure mode cannot be reproduced with just any string.
27812787
* The runtime must be in a specific state. It's unclear
27822788
* how to reliably reproduce this failure mode in a
27832789
* unit test.
27842790
*
2785-
* Performance-wise, character-by-character processing via utf8_codepoint_at
2786-
* is still much faster than relying on preg_match(). The mbstring extension
2787-
* is likely faster. It would be interesting to evaluate the performance
2788-
* and prefer mbstring whenever it's available.
2791+
* Performance-wise, character-by-character processing via _wp_scan_utf8
2792+
* is pretty slow. The ASCII fast path below enables skipping most of the
2793+
* UTF-8 decoder calls.
2794+
*
2795+
* If the UTF-8 decoder performance ever becomes a bottleneck, there are a
2796+
* few ways to significantly improve it:
2797+
*
2798+
* – Call a native grapheme_ function when available.
2799+
* – Introduce a custom UTF-8 decoder optimized for codepoint-by-codepoint processing.
2800+
* It could be the streaming version of the UTF-8 decoder, such as `_wp_iterate_utf8`,
2801+
* that avoids the repeated strspn() calls. Alternatively, the older `utf8_codepoint_at`
2802+
* function could be restored if its codepoint-by-codepoint decoding performance is
2803+
* better than that of _wp_scan_utf8.
2804+
*/
2805+
2806+
/**
2807+
* The ASCII speedup includes all ASCII NameStartChar, which are also valid
2808+
* NameChar, making it possible to quickly scan past these bytes without
2809+
* further processing.
2810+
*/
2811+
$name_byte_length += strspn( $this->xml, ":ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz-.0123456789\u{B7}", $offset + $name_byte_length );
2812+
2813+
/*
2814+
* Quickly check if the next byte is an ASCII byte that is not allowed in XML
2815+
* NameStartChar. If so, we can break out of the loop without calling the UTF-8 decoder.
2816+
*
2817+
* Even though this does not seem to be different from the ASCII fast path in the
2818+
* _wp_scan_utf8 function, skipping that function call still provides a ~50% speed
2819+
* improvement.
27892820
*/
2790-
$codepoint = utf8_codepoint_at(
2821+
$is_non_name_ascii_byte = strspn(
27912822
$this->xml,
2823+
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
2824+
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
2825+
" !\"#$%&'()*+,./;<=>?@[\\]^`{|}~\x7f",
27922826
$offset + $name_byte_length,
2793-
$bytes_parsed
2794-
);
2795-
if (
2796-
// Byte sequence is not a valid UTF-8 codepoint.
2797-
( 0xFFFD === $codepoint && 0 === $bytes_parsed ) ||
2798-
// No codepoint at the given offset.
2799-
null === $codepoint ||
2800-
// The codepoint is not a valid part of an XML NameChar or NameStartChar.
2801-
! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length )
2802-
) {
2827+
1
2828+
) > 0;
2829+
if ( $is_non_name_ascii_byte ) {
2830+
break;
2831+
}
2832+
2833+
// EOF.
2834+
if ( $offset + $name_byte_length >= strlen( $this->xml ) ) {
2835+
break;
2836+
}
2837+
2838+
// The next byte sequence is, very likely, a UTF-8 codepoint. Let's
2839+
// try to decode it.
2840+
$at = $offset + $name_byte_length;
2841+
$new_at = $at;
2842+
$invalid_length = 0;
2843+
if ( 1 !== _wp_scan_utf8( $this->xml, $new_at, $invalid_length, null, 1 ) ) {
2844+
// EOF or invalid utf-8 byte sequence.
2845+
break;
2846+
}
2847+
2848+
$codepoint_byte_length = $new_at - $at;
2849+
$codepoint = utf8_ord( substr( $this->xml, $at, $codepoint_byte_length ) );
2850+
2851+
// The codepoint is not a valid part of an XML NameChar or NameStartChar.
2852+
if ( ! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length ) ) {
28032853
break;
28042854
}
2805-
$codepoint = null;
2806-
$name_byte_length += $bytes_parsed;
2855+
$name_byte_length += $codepoint_byte_length;
2856+
$at = $new_at;
28072857
}
28082858

28092859
return $name_byte_length;

0 commit comments

Comments
 (0)