From 3275280fb6659247ea310988ee798259782f9d4e Mon Sep 17 00:00:00 2001 From: Kornel Date: Mon, 23 Dec 2024 21:24:38 +0000 Subject: [PATCH 1/2] Document that /> is an unnecessary talisman in HTML #215 --- c-api/include/lol_html.h | 10 ++++++++- src/rewritable_units/element.rs | 27 ++++++++++++++++++++---- src/rewritable_units/tokens/start_tag.rs | 11 +++++++++- 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/c-api/include/lol_html.h b/c-api/include/lol_html.h index 27e37931..a259fc0c 100644 --- a/c-api/include/lol_html.h +++ b/c-api/include/lol_html.h @@ -523,7 +523,15 @@ int lol_html_element_tag_name_set( size_t name_len ); -// Whether the element is explicitly self-closing, e.g. ``. +// Whether the tag syntactically ends with `/>`. In HTML content this is purely a decorative, unnecessary, and has no effect of any kind. +// +// The `/>` syntax only affects parsing of elements in foreign content (SVG and MathML). +// It will never close any HTML tags that aren't already defined as void in HTML. +// +// This function only reports the parsed syntax, and will not report which elements are actually void in HTML. +// Use `lol_html_element_can_have_content` to check if the element is non-void. +// +// If the `/` is part of an unquoted attribute, it's not parsed as the self-closing syntax. bool lol_html_element_is_self_closing( lol_html_element_t *element ); diff --git a/src/rewritable_units/element.rs b/src/rewritable_units/element.rs index 562d86a6..784e5103 100644 --- a/src/rewritable_units/element.rs +++ b/src/rewritable_units/element.rs @@ -121,6 +121,10 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { } /// Sets the tag name of the element. + /// + /// The new tag name must be in the same namespace, have the same content model, and be valid in its location. + /// Otherwise change of the tag name may cause the resulting document to be parsed in an unexpected way, + /// out of sync with this library. #[inline] pub fn set_tag_name(&mut self, name: &str) -> Result<(), TagNameError> { let name = self.tag_name_bytes_from_str(name)?; @@ -134,16 +138,31 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { Ok(()) } - /// Whether the element is explicitly self-closing, e.g. ``. + /// Whether the tag syntactically ends with `/>`. In HTML content this is purely a decorative, unnecessary, and has no effect of any kind. + /// + /// The `/>` syntax only affects parsing of elements in foreign content (SVG and MathML). + /// It will never close any HTML tags that aren't already defined as [void][spec] in HTML. + /// + /// This function only reports the parsed syntax, and will not report which elements are actually void in HTML. + /// Use [`can_have_content()`][Self::can_have_content] to check if the element is non-void. + /// + /// [spec]: https://html.spec.whatwg.org/multipage/syntax.html#start-tags + /// + /// If the `/` is part of an unquoted attribute, it's not parsed as the self-closing syntax. #[inline] #[must_use] pub fn is_self_closing(&self) -> bool { self.start_tag.self_closing() } - /// Whether the element can have inner content. Returns `true` unless the element is an [HTML void - /// element](https://html.spec.whatwg.org/multipage/syntax.html#void-elements) or has a - /// self-closing tag (eg, ``). + /// Whether the element can have inner content. + /// + /// Returns `true` if the element isn't a [void element in HTML][void], + /// or is in **foreign content** and doesn't have a self-closing tag (eg, ``). + /// + /// [void]: https://html.spec.whatwg.org/multipage/syntax.html#void-elements + /// + /// Note that the self-closing syntax has no effect in HTML content. #[inline] #[must_use] pub fn can_have_content(&self) -> bool { diff --git a/src/rewritable_units/tokens/start_tag.rs b/src/rewritable_units/tokens/start_tag.rs index d4adeb19..4542f303 100644 --- a/src/rewritable_units/tokens/start_tag.rs +++ b/src/rewritable_units/tokens/start_tag.rs @@ -102,7 +102,16 @@ impl<'i> StartTag<'i> { } } - /// Whether the tag is explicitly self-closing, e.g. ``. + /// Whether the tag syntactically ends with `/>`. In HTML content this is purely a decorative, unnecessary, and has no effect of any kind. + /// + /// The `/>` syntax only affects parsing of elements in foreign content (SVG and MathML). + /// It will never close any HTML tags that aren't already defined as [void](spec) in HTML. + /// + /// This function only reports the parsed syntax, and will not report which elements are actually void in HTML. + /// + /// [spec]: https://html.spec.whatwg.org/multipage/syntax.html#start-tags + /// + /// If the `/` is part of an unquoted attribute, it's not parsed as the self-closing syntax. #[inline] pub fn self_closing(&self) -> bool { self.self_closing From c20cefba20f28286829c06079239a38f217902fa Mon Sep 17 00:00:00 2001 From: Kornel Date: Tue, 24 Dec 2024 14:50:37 +0000 Subject: [PATCH 2/2] Remove invalid /> when content is added to an element #215 --- src/rewritable_units/element.rs | 3 +++ src/rewritable_units/tokens/start_tag.rs | 4 ++++ src/rewriter/mod.rs | 24 ++++++++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/src/rewritable_units/element.rs b/src/rewritable_units/element.rs index 784e5103..b3e13cf8 100644 --- a/src/rewritable_units/element.rs +++ b/src/rewritable_units/element.rs @@ -370,6 +370,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { fn prepend_chunk(&mut self, chunk: StringChunk) { if self.can_have_content { + self.start_tag.set_self_closing_syntax(false); self.start_tag .mutations .mutate() @@ -434,6 +435,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { fn append_chunk(&mut self, chunk: StringChunk) { if self.can_have_content { + self.start_tag.set_self_closing_syntax(false); self.end_tag_mutations_mut().content_before.push_back(chunk); } } @@ -492,6 +494,7 @@ impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { fn set_inner_content_chunk(&mut self, chunk: StringChunk) { if self.can_have_content { + self.start_tag.set_self_closing_syntax(false); self.remove_content(); self.start_tag .mutations diff --git a/src/rewritable_units/tokens/start_tag.rs b/src/rewritable_units/tokens/start_tag.rs index 4542f303..a026f6eb 100644 --- a/src/rewritable_units/tokens/start_tag.rs +++ b/src/rewritable_units/tokens/start_tag.rs @@ -117,6 +117,10 @@ impl<'i> StartTag<'i> { self.self_closing } + pub(crate) fn set_self_closing_syntax(&mut self, has_slash: bool) { + self.self_closing = has_slash; + } + /// Inserts `content` before the start tag. /// /// Consequent calls to the method append `content` to the previously inserted content. diff --git a/src/rewriter/mod.rs b/src/rewriter/mod.rs index 73511b61..ac336960 100644 --- a/src/rewriter/mod.rs +++ b/src/rewriter/mod.rs @@ -446,6 +446,30 @@ mod tests { assert_eq!(res, ""); } + #[test] + fn rewrite_incorrect_self_closing() { + let res = rewrite_str::( + "
+

", + RewriteStrSettings { + element_content_handlers: vec![element!("*:not(svg)", |el| { + el.set_attribute("s", if el.is_self_closing() { "y" } else { "n" })?; + el.set_attribute("c", if el.can_have_content() { "y" } else { "n" })?; + el.append("…", ContentType::Text); + Ok(()) + })], + ..RewriteStrSettings::new() + }, + ) + .unwrap(); + + assert_eq!( + res, + r#"
+

"# + ); + } + #[test] fn rewrite_arbitrary_settings() { let res = rewrite_str("Some text", Settings::new()).unwrap();