Implement XML parser; fixes to XMLSerializer and CDATASection support

Bug: w3c/DOM-Parsing#29 Bug: w3c/DOM-Parsing#38 Bug: w3c/DOM-Parsing#47 Bug: w3c/DOM-Parsing#48 Bug: w3c/DOM-Parsing#50 Bug: w3c/DOM-Parsing#52 Bug: w3c/DOM-Parsing#59 Bug: w3c/DOM-Parsing#71 Change-Id: I76735c4be1d9738c690417207301f737e3a3c9ff
wikimedia · Jul 3, 2021 · 4852c2a · 4852c2a
1 parent 5272a63
commit 4852c2a
Show file tree

Hide file tree

Showing 18 changed files with 307 additions and 111 deletions.
diff --git a/src/CDATASection.php b/src/CDATASection.php
@@ -4,6 +4,8 @@
 
 namespace Wikimedia\Dodo;
 
+use Wikimedia\Dodo\Internal\BadXMLException;
+use Wikimedia\Dodo\Internal\NamespacePrefixMap;
 use Wikimedia\Dodo\Internal\UnimplementedTrait;
 
 class CDATASection extends Text implements \Wikimedia\IDLeDOM\CDATASection {
@@ -27,4 +29,21 @@ final public function getNodeType() : int {
 	public function getNodeName() : string {
 		return "#cdata-section";
 	}
+
+	/** @inheritDoc */
+	public function _xmlSerialize(
+		?string $namespace, NamespacePrefixMap $prefixMap, int &$prefixIndex,
+		bool $requireWellFormed, array &$markup
+	) : void {
+		// The section content is written out verbatim; for background see
+		// https://github.com/w3c/DOM-Parsing/issues/38
+		$text = $this->getData();
+		// A literal "]]>" in the data would end the section early, so
+		// refuse to serialize it when well-formed output is required.
+		if ( $requireWellFormed && strpos( $text, ']]>' ) !== false ) {
+			throw new BadXMLException();
+		}
+		array_push( $markup, '<![CDATA[', $text, ']]>' );
+	}
 }
diff --git a/src/DOMImplementation.php b/src/DOMImplementation.php
@@ -76,7 +76,6 @@ public function createDocumentType( $qualifiedName, $publicId, $systemId ) {
 			$qualifiedName,
 			$publicId,
 			$systemId );
-		/* TEMPORARY STUB */
 	}
 
 	/**
@@ -147,7 +146,7 @@ public function createDocument( ?string $namespace, ?string $qualifiedName = '',
 
 	/** @inheritDoc */
 	public function createHTMLDocument( ?string $titleText = null ) {
-		$d = new Document( $this->_contextObject, 'html', null );
+		$d = new Document( $this->_contextObject, 'html', 'text/html', null );
 
 		$d->appendChild( new DocumentType( $d, "html" ) );
 

diff --git a/src/DOMParser.php b/src/DOMParser.php
@@ -4,12 +4,14 @@
 
 namespace Wikimedia\Dodo;
 
+use Exception;
 use RemexHtml\DOM\DOMBuilder;
+use RemexHtml\Tokenizer\NullTokenHandler;
 use RemexHtml\Tokenizer\Tokenizer;
 use RemexHtml\TreeBuilder\Dispatcher;
 use RemexHtml\TreeBuilder\TreeBuilder;
-use Wikimedia\Dodo\Internal\UnimplementedException;
 use Wikimedia\IDLeDOM\DOMParserSupportedType;
+use XMLReader;
 
 /**
  * DOMParser
@@ -24,9 +26,23 @@ class DOMParser implements \Wikimedia\IDLeDOM\DOMParser {
 	 */
 	public function parseFromString( string $string, /* DOMParserSupportedType */ string $type ) {
 		$type = DOMParserSupportedType::cast( $type );
-		if ( $type !== DOMParserSupportedType::text_html ) {
-			throw new UnimplementedException( __METHOD__ . "( '$type' )" );
+		switch ( $type ) {
+		case DOMParserSupportedType::text_html:
+			return $this->_parseHtml( $string );
+		default:
+			// Any type other than text/html is handed to the XML parser.
+			// XXX if we throw an XML well-formedness error here, we're
+			// supposed to make a document describing it, instead of
+			// throwing an exception.
+			return $this->_parseXML( $string, $type );
 		}
+	}
+
+	/**
+	 * Create an HTML parser, parsing the string as UTF-8.
+	 * @param string $string
+	 * @return Document
+	 */
+	private function _parseHtml( string $string ) {
 		$domBuilder = new class( [
 			'suppressHtmlNamespace' => true,
 			'suppressIdAttribute' => true,
@@ -42,7 +58,7 @@ protected function createDocument(
 					string $system = null
 				) {
 					// Force this to be an HTML document (not an XML document)
-					$this->doc = new Document( null, 'html' );
+					$this->doc = new Document( null, 'html', 'text/html' );
 					return $this->doc;
 				}
 
@@ -76,4 +92,113 @@ public function doctype( $name, $public, $system, $quirks, $sourceStart, $source
 		return $result;
 	}
 
+	/**
+	 * An XML parser ... is a construct that follows the rules given in
+	 * XML to map a string of bytes or characters into a Document
+	 * object.
+	 *
+	 * The spec then follows that up with:
+	 * "Note: At the time of writing, no such rules actually exist."
+	 *
+	 * Use the enabled-by-default PHP XMLReader class to do our
+	 * parsing and cram it into a Document somehow, and hope we don't
+	 * mangle things too badly.
+	 *
+	 * Elements, attributes, text (including whitespace-only text),
+	 * CDATA sections, comments and the doctype are copied into the
+	 * new document; any other node type reported by the reader raises
+	 * an Exception.
+	 *
+	 * @see https://html.spec.whatwg.org/multipage/xhtml.html#xml-parser
+	 *
+	 * @param string $s The string to parse
+	 * @param string $contentType Content type recorded on the new document
+	 * @return Document
+	 * @throws Exception if the reader reports an unsupported node type
+	 */
+	private function _parseXML( string $s, string $contentType ) {
+		$reader = new XMLReader();
+		$reader->XML(
+			$s, 'utf-8',
+			LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING | LIBXML_PARSEHUGE
+		);
+		// According to spec, this is a Document not an XMLDocument
+		$doc = new Document( null, 'xml', $contentType );
+		// $node is the current insertion point; $attrNode is the element
+		// most recently opened, which receives any ATTRIBUTE nodes.
+		$node = $doc;
+		$attrNode = null;
+		// Drain the attributes of the current node (if any) before
+		// advancing the reader to the next node.
+		while ( $reader->moveToNextAttribute() || $reader->read() ) {
+			switch ( $reader->nodeType ) {
+			case XMLReader::END_ELEMENT:
+				$node = $node->getParentNode();
+				// Workaround to prevent us from visiting the attributes again
+				while ( $reader->moveToNextAttribute() ) {
+					/* skip */
+				}
+				break;
+			case XMLReader::ELEMENT:
+				$qname = $reader->prefix ?? '';
+				if ( $qname !== '' ) {
+					$qname .= ':';
+				}
+				$qname .= $reader->localName;
+				// This will be the node we'll attach attributes to!
+				$attrNode = $doc->createElementNS( $reader->namespaceURI, $qname );
+				$node->appendChild( $attrNode );
+				// We don't get an END_ELEMENT from the reader if this is
+				// an empty element (sigh)
+				if ( !$reader->isEmptyElement ) {
+					$node = $attrNode;
+				}
+				break;
+			case XMLReader::ATTRIBUTE:
+				$qname = $reader->prefix ?? '';
+				if ( $qname !== '' ) {
+					$qname .= ':';
+				}
+				$qname .= $reader->localName;
+				'@phan-var Element $attrNode';
+				$attrNode->setAttributeNS(
+					$reader->namespaceURI, $qname, $reader->value
+				);
+				break;
+			case XMLReader::TEXT:
+			case XMLReader::WHITESPACE:
+			case XMLReader::SIGNIFICANT_WHITESPACE:
+				// The reader reports whitespace-only character data under
+				// separate node types; keep it as ordinary text so that
+				// indented input does not hit the "unknown node" error.
+				$nn = $doc->createTextNode( $reader->value );
+				$node->appendChild( $nn );
+				break;
+			case XMLReader::CDATA:
+				$nn = $doc->createCDATASection( $reader->value );
+				$node->appendChild( $nn );
+				break;
+			case XMLReader::COMMENT:
+				$nn = $doc->createComment( $reader->value );
+				$node->appendChild( $nn );
+				break;
+			case XMLReader::DOC_TYPE:
+				// This is a hack: the PHP XMLReader interface provides no
+				// way to extract the contents of a DOC_TYPE node!  So we're
+				// going to give it to the HTML tokenizer to interpret.
+				$tokenHandler = new class extends NullTokenHandler {
+					/** @var string */
+					public $name;
+					/** @var string */
+					public $publicId;
+					/** @var string */
+					public $systemId;
+
+					/** @inheritDoc */
+					public function doctype(
+						$name, $publicId, $systemId,
+						$quirks, $sourceStart, $sourceLength
+					) {
+						$this->name = $name;
+						$this->publicId = $publicId;
+						$this->systemId = $systemId;
+					}
+				};
+				( new Tokenizer(
+					$tokenHandler, $reader->readOuterXml(), []
+				) )->execute( [] );
+				$nn = $doc->getImplementation()->createDocumentType(
+					$tokenHandler->name,
+					$tokenHandler->publicId,
+					$tokenHandler->systemId
+				);
+				$node->appendChild( $nn );
+				break;
+			default:
+				// Node types not handled above (for example processing
+				// instructions) are still rejected.
+				throw new Exception( "Unknown node type: " . $reader->nodeType );
+			}
+		}
+		return $doc;
+	}
 }
diff --git a/src/Document.php b/src/Document.php
@@ -233,11 +233,13 @@ private function _updateDoctypeAndDocumentElement(): void {
 	/**
 	 * @param ?Document $originDoc
 	 * @param string $type
+	 * @param string $contentType
 	 * @param ?string $url
 	 */
 	public function __construct(
 		?Document $originDoc = null,
 		string $type = "xml",
+		string $contentType = 'text/xml',
 		?string $url = null
 	) {
 		parent::__construct( $this );
@@ -249,6 +251,9 @@ public function __construct(
 		if ( $type === 'html' ) {
 			$this->_contentType = 'text/html';
 			$this->_typeIsHtml = true;
+		} else {
+			$this->_contentType = $contentType;
+			$this->_typeIsHtml = false;
 		}
 
 		/* DOM-LS: used by the documentURI and URL method */
@@ -280,6 +285,7 @@ public function _getTemplateDoc() {
 			$newDoc = new Document(
 				$this,
 				$this->_typeIsHtml ? 'html' : 'xml',
+				$this->_contentType,
 				$this->_URL
 			);
 			$this->_templateDocCache = $newDoc->_templateDocCache = $newDoc;
@@ -465,6 +471,17 @@ public function createTextNode( string $data ) : Text {
 		return new Text( $this, $data );
 	}
 
+	/** @inheritDoc */
+	public function createCDATASection( string $data ) : CDATASection {
+		// The HTML-document check comes first, so it takes precedence
+		// when both conditions hold.
+		if ( $this->_isHTMLDocument() ) {
+			Util::error( 'NotSupportedError' );
+		}
+		// Data containing the "]]>" terminator is rejected.
+		$terminatorPos = strpos( $data, ']]>' );
+		if ( $terminatorPos !== false ) {
+			Util::error( 'InvalidCharacterError' );
+		}
+		return new CDATASection( $this, $data );
+	}
+
 	/** @inheritDoc */
 	public function createComment( string $data ) : Comment {
 		return new Comment( $this, $data );
@@ -836,10 +853,10 @@ protected function _subclassCloneNodeShallow(): Node {
 		$shallow = new Document(
 			$this,
 			$this->_typeIsHtml ? 'html' : 'xml',
+			$this->_contentType,
 			$this->_URL
 		);
 		$shallow->_mode = $this->_mode;
-		$shallow->_contentType = $this->_contentType;
 		return $shallow;
 	}
 
@@ -870,6 +887,10 @@ public function _xmlSerialize(
 				throw new BadXMLException();
 			}
 		}
+		// Emitting the XML declaration is not yet in the spec:
+		// https://github.com/w3c/DOM-Parsing/issues/50
+		$markup[] = '<?xml version="1.0" encoding="UTF-8"?>';
+
 		for ( $child = $this->getFirstChild(); $child !== null; $child = $child->getNextSibling() ) {
 			$child->_xmlSerialize(
 				$namespace, $prefixMap, $prefixIndex, $requireWellFormed,

diff --git a/src/DocumentType.php b/src/DocumentType.php
@@ -141,6 +141,7 @@ public function _xmlSerialize(
 			if ( $this->_publicId === '' ) {
 				$markup[] = " SYSTEM";
 			}
+			// https://github.com/w3c/DOM-Parsing/issues/71
 			$quote = strpos( $this->_systemId, '"' ) === false ? '"' : "'";
 			$markup[] = ' ' . $quote . $this->_systemId . $quote;
 		}

diff --git a/src/Internal/NamespacePrefixMap.php b/src/Internal/NamespacePrefixMap.php
@@ -108,9 +108,6 @@ public function retrievePreferredPrefix(
 		?string $namespace,
 		?string $preferredPrefix
 	) : ?string {
-		if ( $preferredPrefix === null ) {
-			return null;
-		}
 		$last = null;
 		$candidatesList = $this->map[self::makeKey( $namespace )] ?? [];
 		foreach ( $candidatesList as $prefix ) {
@@ -149,5 +146,4 @@ public function generatePrefix(
 		$this->add( $newNamespace, $generatedPrefix );
 		return $generatedPrefix;
 	}
-
 }