Skip to content

Commit

Permalink
Implement XML parser; fixes to XMLSerializer and CDATASection support
Browse files Browse the repository at this point in the history
Bug: w3c/DOM-Parsing#29
Bug: w3c/DOM-Parsing#38
Bug: w3c/DOM-Parsing#47
Bug: w3c/DOM-Parsing#48
Bug: w3c/DOM-Parsing#50
Bug: w3c/DOM-Parsing#52
Bug: w3c/DOM-Parsing#59
Bug: w3c/DOM-Parsing#71
Change-Id: I76735c4be1d9738c690417207301f737e3a3c9ff
  • Loading branch information
cscott committed Jul 3, 2021
1 parent 5272a63 commit 4852c2a
Show file tree
Hide file tree
Showing 18 changed files with 307 additions and 111 deletions.
19 changes: 19 additions & 0 deletions src/CDATASection.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

namespace Wikimedia\Dodo;

use Wikimedia\Dodo\Internal\BadXMLException;
use Wikimedia\Dodo\Internal\NamespacePrefixMap;
use Wikimedia\Dodo\Internal\UnimplementedTrait;

class CDATASection extends Text implements \Wikimedia\IDLeDOM\CDATASection {
Expand All @@ -27,4 +29,21 @@ final public function getNodeType() : int {
public function getNodeName() : string {
	// Every CDATA section node reports this same constant node name.
	return '#cdata-section';
}

/**
 * Append the XML serialization of this CDATA section to $markup.
 *
 * The section is written as three pieces: the opening delimiter, the
 * raw data, and the closing delimiter. When well-formedness is required,
 * data containing the closing delimiter is rejected.
 *
 * @inheritDoc
 */
public function _xmlSerialize(
	?string $namespace, NamespacePrefixMap $prefixMap, int &$prefixIndex,
	bool $requireWellFormed, array &$markup
) : void {
	// See https://github.com/w3c/DOM-Parsing/issues/38
	$contents = $this->getData();
	// A literal "]]>" inside the data would terminate the section early.
	if ( $requireWellFormed && strpos( $contents, ']]>' ) !== false ) {
		throw new BadXMLException();
	}
	array_push( $markup, '<![CDATA[', $contents, ']]>' );
}
}
3 changes: 1 addition & 2 deletions src/DOMImplementation.php
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ public function createDocumentType( $qualifiedName, $publicId, $systemId ) {
$qualifiedName,
$publicId,
$systemId );
/* TEMPORARY STUB */
}

/**
Expand Down Expand Up @@ -147,7 +146,7 @@ public function createDocument( ?string $namespace, ?string $qualifiedName = '',

/** @inheritDoc */
public function createHTMLDocument( ?string $titleText = null ) {
$d = new Document( $this->_contextObject, 'html', null );
$d = new Document( $this->_contextObject, 'html', 'text/html', null );

$d->appendChild( new DocumentType( $d, "html" ) );

Expand Down
133 changes: 129 additions & 4 deletions src/DOMParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@

namespace Wikimedia\Dodo;

use Exception;
use RemexHtml\DOM\DOMBuilder;
use RemexHtml\Tokenizer\NullTokenHandler;
use RemexHtml\Tokenizer\Tokenizer;
use RemexHtml\TreeBuilder\Dispatcher;
use RemexHtml\TreeBuilder\TreeBuilder;
use Wikimedia\Dodo\Internal\UnimplementedException;
use Wikimedia\IDLeDOM\DOMParserSupportedType;
use XMLReader;

/**
* DOMParser
Expand All @@ -24,9 +26,23 @@ class DOMParser implements \Wikimedia\IDLeDOM\DOMParser {
*/
public function parseFromString( string $string, /* DOMParserSupportedType */ string $type ) {
	// Normalize/validate the requested type via the enumeration helper.
	$type = DOMParserSupportedType::cast( $type );
	switch ( $type ) {
	case DOMParserSupportedType::text_html:
		return $this->_parseHtml( $string );
	default:
		// Every other type accepted by cast() is handed to the XML parser,
		// which records the type as the document's content type.
		// XXX if we throw an XML well-formedness error here, we're
		// supposed to make a document describing it, instead of
		// throwing an exception.
		return $this->_parseXML( $string, $type );
	}
}

/**
* Create an HTML parser, parsing the string as UTF-8.
* @param string $string
* @return Document
*/
private function _parseHtml( string $string ) {
$domBuilder = new class( [
'suppressHtmlNamespace' => true,
'suppressIdAttribute' => true,
Expand All @@ -42,7 +58,7 @@ protected function createDocument(
string $system = null
) {
// Force this to be an HTML document (not an XML document)
$this->doc = new Document( null, 'html' );
$this->doc = new Document( null, 'html', 'text/html' );
return $this->doc;
}

Expand Down Expand Up @@ -76,4 +92,113 @@ public function doctype( $name, $public, $system, $quirks, $sourceStart, $source
return $result;
}

/**
 * An XML parser ... is a construct that follows the rules given in
 * XML to map a string of bytes or characters into a Document
 * object.
 *
 * The spec then follows that up with:
 * "Note: At the time of writing, no such rules actually exist."
 *
 * Use the enabled-by-default PHP XMLReader class to do our
 * parsing and cram it into a Document somehow, and hope we don't
 * mangle things too badly.
 *
 * The reader is walked node by node (attributes first, then the next
 * node) and each reported node is mirrored into the new Document:
 * elements, attributes, text, whitespace, CDATA sections, comments,
 * processing instructions and the document type. Any other node type
 * raises an Exception.
 *
 * NOTE(review): libxml errors and warnings are silenced by the flags
 * passed to XMLReader::XML(); it looks like malformed input would simply
 * end the read loop early and yield a partial document -- confirm.
 *
 * @see https://html.spec.whatwg.org/multipage/xhtml.html#xml-parser
 *
 * @param string $s The string to parse
 * @param string $contentType Content type recorded on the new Document
 * @return Document
 * @throws Exception if the reader reports a node type we do not handle
 */
private function _parseXML( string $s, string $contentType ) {
	$reader = new XMLReader();
	$reader->XML(
		$s, 'utf-8',
		LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING | LIBXML_PARSEHUGE
	);
	# According to spec, this is a Document not an XMLDocument
	$doc = new Document( null, 'xml', $contentType );
	// $node is the current insertion point; $attrNode is the element
	// most recently created, which receives any attributes that follow.
	$node = $doc;
	$attrNode = null;
	try {
		while ( $reader->moveToNextAttribute() || $reader->read() ) {
			switch ( $reader->nodeType ) {
			case XMLReader::END_ELEMENT:
				$node = $node->getParentNode();
				// Workaround to prevent us from visiting the attributes again
				while ( $reader->moveToNextAttribute() ) {
					/* skip */
				}
				break;
			case XMLReader::ELEMENT:
				// This will be the node we'll attach attributes to!
				$attrNode = $doc->createElementNS(
					$reader->namespaceURI,
					self::_readerQualifiedName( $reader )
				);
				$node->appendChild( $attrNode );
				// We don't get an END_ELEMENT from the reader if this is
				// an empty element (sigh)
				if ( !$reader->isEmptyElement ) {
					$node = $attrNode;
				}
				break;
			case XMLReader::ATTRIBUTE:
				'@phan-var Element $attrNode';
				$attrNode->setAttributeNS(
					$reader->namespaceURI,
					self::_readerQualifiedName( $reader ),
					$reader->value
				);
				break;
			case XMLReader::TEXT:
				$nn = $doc->createTextNode( $reader->value );
				$node->appendChild( $nn );
				break;
			case XMLReader::WHITESPACE:
			case XMLReader::SIGNIFICANT_WHITESPACE:
				// Whitespace-only runs (e.g. indentation in pretty-printed
				// XML) are ordinary character data inside an element. Skip
				// them if we are still at the document level, where a
				// text node is not a permitted child.
				if ( $node !== $doc ) {
					$nn = $doc->createTextNode( $reader->value );
					$node->appendChild( $nn );
				}
				break;
			case XMLReader::CDATA:
				$nn = $doc->createCDATASection( $reader->value );
				$node->appendChild( $nn );
				break;
			case XMLReader::COMMENT:
				$nn = $doc->createComment( $reader->value );
				$node->appendChild( $nn );
				break;
			case XMLReader::PI:
				// For a processing instruction the reader exposes the
				// target as the node name and the data as the value.
				$nn = $doc->createProcessingInstruction(
					$reader->name, $reader->value
				);
				$node->appendChild( $nn );
				break;
			case XMLReader::DOC_TYPE:
				# This is a hack: the PHP XMLReader interface provides no
				# way to extract the contents of a DOC_TYPE node! So we're
				# going to give it to the HTML tokenizer to interpret.
				$tokenHandler = new class extends NullTokenHandler {
					/** @var ?string */
					public $name;
					/** @var ?string */
					public $publicId;
					/** @var ?string */
					public $systemId;

					/** @inheritDoc */
					public function doctype(
						$name, $publicId, $systemId,
						$quirks, $sourceStart, $sourceLength
					) {
						$this->name = $name;
						$this->publicId = $publicId;
						$this->systemId = $systemId;
					}
				};
				( new Tokenizer(
					$tokenHandler, $reader->readOuterXml(), []
				) )->execute( [] );
				// Missing identifiers are normalized to the empty string,
				// which is what DocumentType serialization compares
				// against.
				$nn = $doc->getImplementation()->createDocumentType(
					$tokenHandler->name,
					$tokenHandler->publicId ?? '',
					$tokenHandler->systemId ?? ''
				);
				$node->appendChild( $nn );
				break;
			default:
				throw new Exception( "Unknown node type: " . $reader->nodeType );
			}
		}
	} finally {
		// Always release the reader's input, even when we bail out early.
		$reader->close();
	}
	return $doc;
}

/**
 * Build the qualified name of the node the reader is positioned on:
 * "prefix:localName" when a prefix is present, otherwise "localName".
 *
 * @param XMLReader $reader
 * @return string
 */
private static function _readerQualifiedName( XMLReader $reader ) : string {
	$prefix = $reader->prefix ?? '';
	if ( $prefix === '' ) {
		return $reader->localName;
	}
	return $prefix . ':' . $reader->localName;
}
}
23 changes: 22 additions & 1 deletion src/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -233,11 +233,13 @@ private function _updateDoctypeAndDocumentElement(): void {
/**
* @param ?Document $originDoc
* @param string $type
* @param string $contentType
* @param ?string $url
*/
public function __construct(
?Document $originDoc = null,
string $type = "xml",
string $contentType = 'text/xml',
?string $url = null
) {
parent::__construct( $this );
Expand All @@ -249,6 +251,9 @@ public function __construct(
if ( $type === 'html' ) {
$this->_contentType = 'text/html';
$this->_typeIsHtml = true;
} else {
$this->_contentType = $contentType;
$this->_typeIsHtml = false;
}

/* DOM-LS: used by the documentURI and URL method */
Expand Down Expand Up @@ -280,6 +285,7 @@ public function _getTemplateDoc() {
$newDoc = new Document(
$this,
$this->_typeIsHtml ? 'html' : 'xml',
$this->_contentType,
$this->_URL
);
$this->_templateDocCache = $newDoc->_templateDocCache = $newDoc;
Expand Down Expand Up @@ -465,6 +471,17 @@ public function createTextNode( string $data ) : Text {
return new Text( $this, $data );
}

/**
 * Create a CDATASection node belonging to this document.
 *
 * Reports a NotSupportedError (through Util::error) when this is an
 * HTML document, and an InvalidCharacterError when the data contains
 * the "]]>" sequence.
 *
 * @inheritDoc
 */
public function createCDATASection( string $data ) : CDATASection {
	// The document-kind check comes first, before the data is inspected.
	if ( $this->_isHTMLDocument() ) {
		Util::error( 'NotSupportedError' );
	}
	$hasTerminator = strpos( $data, ']]>' ) !== false;
	if ( $hasTerminator ) {
		Util::error( 'InvalidCharacterError' );
	}
	return new CDATASection( $this, $data );
}

/** @inheritDoc */
public function createComment( string $data ) : Comment {
return new Comment( $this, $data );
Expand Down Expand Up @@ -836,10 +853,10 @@ protected function _subclassCloneNodeShallow(): Node {
$shallow = new Document(
$this,
$this->_typeIsHtml ? 'html' : 'xml',
$this->_contentType,
$this->_URL
);
$shallow->_mode = $this->_mode;
$shallow->_contentType = $this->_contentType;
return $shallow;
}

Expand Down Expand Up @@ -870,6 +887,10 @@ public function _xmlSerialize(
throw new BadXMLException();
}
}
// Emitting the XML declaration is not yet in the spec:
// https://github.com/w3c/DOM-Parsing/issues/50
$markup[] = '<?xml version="1.0" encoding="UTF-8"?>';

for ( $child = $this->getFirstChild(); $child !== null; $child = $child->getNextSibling() ) {
$child->_xmlSerialize(
$namespace, $prefixMap, $prefixIndex, $requireWellFormed,
Expand Down
1 change: 1 addition & 0 deletions src/DocumentType.php
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ public function _xmlSerialize(
if ( $this->_publicId === '' ) {
$markup[] = " SYSTEM";
}
// https://github.com/w3c/DOM-Parsing/issues/71
$quote = strpos( $this->_systemId, '"' ) === false ? '"' : "'";
$markup[] = ' ' . $quote . $this->_systemId . $quote;
}
Expand Down
4 changes: 0 additions & 4 deletions src/Internal/NamespacePrefixMap.php
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,6 @@ public function retrievePreferredPrefix(
?string $namespace,
?string $preferredPrefix
) : ?string {
if ( $preferredPrefix === null ) {
return null;
}
$last = null;
$candidatesList = $this->map[self::makeKey( $namespace )] ?? [];
foreach ( $candidatesList as $prefix ) {
Expand Down Expand Up @@ -149,5 +146,4 @@ public function generatePrefix(
$this->add( $newNamespace, $generatedPrefix );
return $generatedPrefix;
}

}
Loading

0 comments on commit 4852c2a

Please sign in to comment.