Skip to content

Commit 3980ed8

Browse files
authored
XMLProcessor: Support namespaces (#126)
## Add Namespace Support to XMLProcessor This PR upgrades `XMLProcessor` to fully support [XML Namespaces 1.1](https://www.w3.org/TR/xml-names11/). Tags and attributes are now consistently interpreted according to their declared namespaces, fixing compatibility with WordPress WXR files and EPUB metadata. New method signatures: ```php public function next_tag( $query_or_namespace = null, $local_name_maybe = null ); public function get_tag_local_name(); public function get_tag_namespace(); public function get_attribute( $namespace, $local_name ); public function get_attribute_names_with_prefix( $full_namespace_prefix, $local_name_prefix ); public function set_attribute( $namespace, $local_name, $value ); ``` Usage comparison: ```php // Before $processor->next_tag( 'wp:content' ); $processor->get_attribute( 'wp:post-type' ); // After $processor->next_tag( 'http://wordpress.org/export/1.2/', 'content' ); // or $processor->next_tag( [ 'http://wordpress.org/export/1.2/', 'content' ] ); $processor->next_tag( [ '*', 'content' ] ); $processor->get_attribute( 'http://wordpress.org/export/1.2/', 'post-type' ); ``` ## Rationale The old parser treated tag and attribute names as opaque strings (`wp:postmeta`, `wp:tag`, etc.), ignoring that these were syntactic sugar for `{namespace}local-name`. This made it impossible to reliably parse WXR files, which may use different namespace URIs for the same `wp:` prefix. ## Implementation Details * `$stack_of_open_elements` tracks the hierarchy of `XMLElement` frames and the namespaces they define and remove. * `set_attribute($ns, $attr, $value)` and `get_attribute($ns, $attr)` accept the full namespace string as their first argument to force the developer to take it into consideration. * `next_tag()` and `matches_breadcrumbs()` accept two-tuples `{$namespace, $local_tag_name}` instead of string-based tag names. Plain string tag names are still accepted. `*` wildcards are supported, too. 
* `get_breadcrumbs()` returns an array of two-tuples `{$namespace, $local_tag_name}`, e.g. `[['', 'root'], ['http://wp.org/export/1.2/', 'post']]` ## Testing instructions Confirm most of the CI tests pass (aside from the flaky network-related ones).
1 parent 6222e29 commit 3980ed8

File tree

10 files changed

+2267
-713
lines changed

10 files changed

+2267
-713
lines changed

components/DataLiberation/DataFormatConsumer/MarkupProcessorConsumer.php

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ public function consume() {
6060
break;
6161
}
6262
$this->append_rich_text( htmlspecialchars( $this->markup_processor->get_modifiable_text() ) );
63-
if ( in_array( $this->markup_processor->get_tag(), array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) {
63+
if ( in_array( $this->get_tag_name(), array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) {
6464
$this->on_title_candidate( $this->markup_processor->get_modifiable_text() );
6565
}
6666
break;
@@ -90,7 +90,11 @@ public function consume() {
9090

9191
private function handle_tag() {
9292
$html = $this->markup_processor;
93-
$tag = strtoupper( $html->get_tag() );
93+
if($html instanceof WP_HTML_Processor) {
94+
$tag = strtoupper( $html->get_tag() );
95+
} else {
96+
$tag = strtoupper( $html->get_tag_local_name() );
97+
}
9498
$tag_lowercase = strtolower( $tag );
9599

96100
$is_void_tag = ! $html->expects_closer() && ! $html->is_tag_closer();
@@ -100,14 +104,14 @@ private function handle_tag() {
100104
$this->on_title_candidate( $html->get_modifiable_text() );
101105
break;
102106
case 'META':
103-
$key = $html->get_attribute( 'name' );
104-
$value = $html->get_attribute( 'content' );
107+
$key = $this->get_attribute( 'name' );
108+
$value = $this->get_attribute( 'content' );
105109
if ( ! array_key_exists( $key, $this->metadata ) ) {
106110
if ( $key ) {
107111
$this->metadata[ $key ] = array();
108112
}
109113
}
110-
switch ( $html->get_attribute( 'type' ) ) {
114+
switch ( $this->get_attribute( 'type' ) ) {
111115
case 'integer':
112116
$value = (int) $value;
113117
break;
@@ -124,8 +128,8 @@ private function handle_tag() {
124128
$template = new WP_HTML_Tag_Processor( '<img>' );
125129
$template->next_tag();
126130
foreach ( array( 'alt', 'title', 'src' ) as $attr ) {
127-
if ( $html->get_attribute( $attr ) ) {
128-
$template->set_attribute( $attr, $html->get_attribute( $attr ) );
131+
if ( $this->get_attribute( $attr ) ) {
132+
$template->set_attribute( $attr, $this->get_attribute( $attr ) );
129133
}
130134
}
131135
/**
@@ -211,8 +215,8 @@ private function handle_tag() {
211215
case 'A':
212216
$template = new WP_HTML_Tag_Processor( '<a>' );
213217
$template->next_tag();
214-
if ( $html->get_attribute( 'href' ) ) {
215-
$template->set_attribute( 'href', $html->get_attribute( 'href' ) );
218+
if ( $this->get_attribute( 'href' ) ) {
219+
$template->set_attribute( 'href', $this->get_attribute( 'href' ) );
216220
}
217221
/**
218222
*
@@ -297,6 +301,22 @@ private function handle_tag() {
297301
}
298302
}
299303

304+
private function get_tag_name() {
305+
if($this->markup_processor instanceof WP_HTML_Processor) {
306+
return $this->markup_processor->get_tag();
307+
} else {
308+
return $this->markup_processor->get_tag_local_name();
309+
}
310+
}
311+
312+
private function get_attribute($key) {
313+
if($this->markup_processor instanceof WP_HTML_Processor) {
314+
return $this->markup_processor->get_attribute($key);
315+
} else {
316+
return $this->markup_processor->get_attribute('', $key);
317+
}
318+
}
319+
300320
private function on_title_candidate( $text ) {
301321
if ( ! array_key_exists( 'post_title', $this->metadata ) ) {
302322
$this->metadata['post_title'] = array(

components/DataLiberation/EntityReader/EPubEntityReader.php

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ public function next_entity() {
5454
return false;
5555
}
5656

57+
$this->remaining_html_files = [];
5758
foreach ( $this->manifest['items'] as $item ) {
5859
if ( $item['media-type'] !== 'application/xhtml+xml' ) {
5960
continue;
@@ -137,11 +138,12 @@ private function parse_manifest() {
137138
$xml = XMLProcessor::create_from_string(
138139
$this->zip->get_contents( 'META-INF/container.xml' )
139140
);
140-
if ( false === $xml->next_tag( 'rootfile' ) ) {
141+
142+
if ( false === $xml->next_tag( ['urn:oasis:names:tc:opendocument:xmlns:container', 'rootfile'] ) ) {
141143
return false;
142144
}
143145

144-
$full_path = $xml->get_attribute( 'full-path' );
146+
$full_path = $xml->get_attribute( '', 'full-path' );
145147
if ( ! $full_path ) {
146148
return false;
147149
}
@@ -161,16 +163,16 @@ private function parse_manifest() {
161163
);
162164
while ( $xml->next_tag() ) {
163165
$parsed_entry = array();
164-
$keys = $xml->get_attribute_names_with_prefix( '' );
165-
foreach ( $keys as $key ) {
166-
$parsed_entry[ $key ] = $xml->get_attribute( $key );
166+
$keys = $xml->get_attribute_names_with_prefix( '', '' );
167+
foreach ( $keys as list($ns, $key) ) {
168+
$parsed_entry[ $key ] = $xml->get_attribute( $ns, $key );
167169
}
168-
if ( $xml->matches_breadcrumbs( array( 'metadata', '*' ) ) ) {
170+
if ( $xml->matches_breadcrumbs( array( 'package', 'metadata', '*' ) ) ) {
169171
$parsed['metadata'][] = array(
170-
'tag' => $xml->get_tag(),
172+
'tag' => $xml->get_tag_local_name(),
171173
'attributes' => $parsed_entry,
172174
);
173-
} elseif ( $xml->matches_breadcrumbs( array( 'manifest', 'item' ) ) ) {
175+
} elseif ( $xml->matches_breadcrumbs( array( 'package', 'manifest', 'item' ) ) ) {
174176
$parsed_entry['type'] = 'item';
175177
$parsed['items'][] = $parsed_entry;
176178
}

0 commit comments

Comments
 (0)