Merge branch 'trunk' into xml-skip-dtd

adamziel · web-flow · commit 7759434a8e57 · 2025-11-02T01:46:15.000+01:00
diff --git a/components/DataLiberation/URL/class-cssprocessor.php b/components/DataLiberation/URL/class-cssprocessor.php
@@ -2,9 +2,9 @@
 
 namespace WordPress\DataLiberation\URL;
 
-use function WordPress\Encoding\utf8_codepoint_at;
 use function WordPress\Encoding\codepoint_to_utf8_bytes;
 use function WordPress\Encoding\compat\_wp_scan_utf8;
+use function WordPress\Encoding\utf8_ord;
 use function WordPress\Encoding\wp_scrub_utf8;
 
 /**
@@ -1506,7 +1506,7 @@ private function consume_ident_start_codepoint( $at ): int {
 		}
 
 		$codepoint_byte_length = $new_at - $at;
-		$codepoint             = utf8_codepoint_at( $this->css, $at );
+		$codepoint             = utf8_ord( substr( $this->css, $at, $codepoint_byte_length ) );
 		if ( null !== $codepoint && $codepoint >= 0x80 ) {
 			return $codepoint_byte_length;
 		}
diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php
@@ -5,7 +5,8 @@
 use WP_HTML_Span;
 use WP_HTML_Text_Replacement;
 
-use function WordPress\Encoding\utf8_codepoint_at;
+use function WordPress\Encoding\compat\_wp_scan_utf8;
+use function WordPress\Encoding\utf8_ord;
 
 /**
  * XML API: XMLProcessor class
@@ -17,11 +18,14 @@
  * It implements a subset of the XML 1.0 specification (https://www.w3.org/TR/xml/)
  * and supports XML documents with the following characteristics:
  *
- * * XML 1.0
- * * Well-formed
- * * UTF-8 encoded
- * * Not standalone (so can use external entities)
- * * No external DTD subset expansion (external entities may exist but are not fetched).
+ * – XML 1.0
+ * – Well-formed
+ * – UTF-8 encoded
+ * – Not standalone (so can use external entities)
+ * – No external DTD subset expansion (external entities may exist but are not fetched).
+ *
+ * XML 1.1 is explicitly not a design goal here. Version 1.1 is
+ * a more complex specification and is not as widely supported.
  *
  * ### Possible future direction for this module
  *
@@ -35,12 +39,6 @@
  *        numbers, indexes, and other debugging info.
  *
  *
- * @TODO: Support XML 1.1.
- *
- * @TODO: Evaluate the performance of utf8_codepoint_at() against using the mbstring
- *        extension. If mbstring is faster, then use it whenever it's available with
- *        utf8_codepoint_at() as a fallback.
- *
  * @package WordPress
  * @subpackage HTML-API
  * @since WP_VERSION
@@ -1192,8 +1190,8 @@ protected function parse_next_token() {
 			/**
 			 * Compute fully qualified attributes and assert:
 			 *
-			 * * All attributes have valid namespaces.
-			 * * No two attributes have the same (local name, namespace) pair.
+			 * – All attributes have valid namespaces.
+			 * – No two attributes have the same (local name, namespace) pair.
 			 *
 			 * @see https://www.w3.org/TR/2006/REC-xml-names11-20060816/#uniqAttrs
 			 */
@@ -1684,8 +1682,8 @@ private function parse_next_tag() {
 			 * names.
 			 *
 			 * Reference:
-			 * * https://www.w3.org/TR/xml/#NT-STag
-			 * * https://www.w3.org/TR/xml/#NT-Name
+			 * – https://www.w3.org/TR/xml/#NT-STag
+			 * – https://www.w3.org/TR/xml/#NT-Name
 			 */
 			$tag_name_length = $this->parse_name( $at + 1 );
 			if ( false === $tag_name_length ) {
@@ -2762,48 +2760,100 @@ private function skip_parameter_entity_reference( &$offset ) {
 	 * @return int
 	 */
 	private function parse_name( $offset ) {
-		static $i         = 0;
 		$name_byte_length = 0;
+		$at               = $offset;
+
+		// Fast path: consume any ASCII NameStartChar bytes.
+		$name_byte_length += strspn(
+			$this->xml,
+			':ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz',
+			$offset + $name_byte_length,
+			1
+		);
+
 		while ( true ) {
 			/**
 			 * Parse the next unicode codepoint.
 			 *
-			 * We use a custom UTF-8 decoder here. No other method
-			 * is reliable and available enough to depend on it in
-			 * WordPress core:
+			 * We use the `_wp_scan_utf8` UTF-8 decoder introduced in WordPress 6.9. No other method
+			 * is reliable and available enough to depend on it in WordPress core:
 			 *
-			 * * mb_ord() – is not available on all hosts.
-			 * * iconv_substr() – is not available on all hosts.
-			 * * preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input
+			 * – mb_ord() – is available on 99.5%+ of hosts, but not on all hosts.
+			 * – iconv_substr() – is not available on all hosts.
+			 * – preg_match() – can fail with PREG_BAD_UTF8_ERROR when the input
 			 *                  contains an incomplete UTF-8 byte sequence – even
 			 *                  when that sequence comes after a valid match. This
 			 *                  failure mode cannot be reproduced with just any string.
 			 *                  The runtime must be in a specific state. It's unclear
 			 *                  how to reliably reproduce this failure mode in a
 			 *                  unit test.
 			 *
-			 * Performance-wise, character-by-character processing via utf8_codepoint_at
-			 * is still much faster than relying on preg_match(). The mbstring extension
-			 * is likely faster. It would be interesting to evaluate the performance
-			 * and prefer mbstring whenever it's available.
+			 * Performance-wise, character-by-character processing via _wp_scan_utf8
+			 * is pretty slow. The ASCII fast path below enables skipping most of the
+			 * UTF-8 decoder calls.
+			 *
+			 * If the UTF-8 decoder performance ever becomes a bottleneck, there are a
+			 * few ways to significantly improve it:
+			 *
+			 * – Call a native grapheme_ function when available.
+			 * – Introduce a custom UTF-8 decoder optimized for codepoint-by-codepoint processing.
+			 *   It could be the streaming version of the UTF-8 decoder, such as `_wp_iterate_utf8`,
+			 *   that avoids the repeated strspn() calls. Alternatively, the older `utf8_codepoint_at`
+			 *   function could be restored if its codepoint-by-codepoint decoding performance is
+			 *   better than the _wp_scan_utf8.
+			 */
+
+			/**
+			 * The ASCII speedup includes all ASCII NameStartChar, which are also valid
+			 * NameChar, making it possible to quickly scan past these bytes without
+			 * further processing.
+			 */
+			$name_byte_length += strspn( $this->xml, ":ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz-.0123456789\u{B7}", $offset + $name_byte_length );
+
+			/*
+			 * Quickly check if the next byte is an ASCII byte that is not allowed in XML
+			 * NameStartChar. If so, we can break out of the loop without calling the UTF-8 decoder.
+			 *
+			 * Even though this does not seem to be different from the ASCII fast path in the
+			 * _wp_scan_utf8 function, skipping that function call still provides a ~50% speed
+			 * improvement.
 			 */
-			$codepoint = utf8_codepoint_at(
+			$is_non_name_ascii_byte = strspn(
 				$this->xml,
+				"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
+				"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
+				" !\"#$%&'()*+,./;<=>?@[\\]^`{|}~\x7f",
 				$offset + $name_byte_length,
-				$bytes_parsed
-			);
-			if (
-				// Byte sequence is not a valid UTF-8 codepoint.
-				( 0xFFFD === $codepoint && 0 === $bytes_parsed ) ||
-				// No codepoint at the given offset.
-				null === $codepoint ||
-				// The codepoint is not a valid part of an XML NameChar or NameStartChar.
-				! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length )
-			) {
+				1
+			) > 0;
+			if ( $is_non_name_ascii_byte ) {
+				break;
+			}
+
+			// EOF.
+			if ( $offset + $name_byte_length >= strlen( $this->xml ) ) {
+				break;
+			}
+
+			// The next byte sequence is, very likely, a UTF-8 codepoint. Let's
+			// try to decode it.
+			$at             = $offset + $name_byte_length;
+			$new_at         = $at;
+			$invalid_length = 0;
+			if ( 1 !== _wp_scan_utf8( $this->xml, $new_at, $invalid_length, null, 1 ) ) {
+				// EOF or invalid utf-8 byte sequence.
+				break;
+			}
+
+			$codepoint_byte_length = $new_at - $at;
+			$codepoint             = utf8_ord( substr( $this->xml, $at, $codepoint_byte_length ) );
+
+			// The codepoint is not a valid part of an XML NameChar or NameStartChar.
+			if ( ! $this->is_valid_name_codepoint( $codepoint, 0 === $name_byte_length ) ) {
 				break;
 			}
-			$codepoint         = null;
-			$name_byte_length += $bytes_parsed;
+			$name_byte_length += $codepoint_byte_length;
+			$at                = $new_at;
 		}
 
 		return $name_byte_length;