From 2094a85f5260b23397c4a89fbf50dcba883219e6 Mon Sep 17 00:00:00 2001 From: Joachim Schiele Date: Tue, 11 Mar 2025 18:35:12 +0000 Subject: [PATCH 1/3] Attempt of getting <pre>
 to not parse inner contents
 similar to ";
+    println!("-------- {} ----------", input);
     let dom = parse_document(RcDom::default(), opts)
-        .from_utf8()
-        .read_from(&mut stdin.lock())
-        .unwrap();
+    .one(input);
+
+    // let stdin = io::stdin();
+    // let dom = parse_document(RcDom::default(), opts)
+    //     .from_utf8()
+    //     .read_from(&mut stdin.lock())
+    //     .unwrap();
+
 
     // The validator.nu HTML2HTML always prints a doctype at the very beginning.
     io::stdout()
diff --git a/rcdom/examples/print-rcdom.rs b/rcdom/examples/print-rcdom.rs
index e1d8fc68..c50399f0 100644
--- a/rcdom/examples/print-rcdom.rs
+++ b/rcdom/examples/print-rcdom.rs
@@ -34,7 +34,7 @@ fn walk(indent: usize, handle: &Handle) {
         } => println!(""),
 
         NodeData::Text { ref contents } => {
-            println!("#text: {}", contents.borrow().escape_default())
+            print!("{}", contents.borrow().escape_default())
         },
 
         NodeData::Comment { ref contents } => println!("", contents.escape_default()),
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
new file mode 100644
index 00000000..a44815df
--- /dev/null
+++ b/rust-toolchain.toml
@@ -0,0 +1,6 @@
+[toolchain]
+channel = "stable"
+targets = [
+  "x86_64-unknown-linux-musl", # used for the backend
+  "wasm32-unknown-unknown"     # used for the frontend
+]
diff --git a/trunk.nix b/trunk.nix
new file mode 100644
index 00000000..ec598ea9
--- /dev/null
+++ b/trunk.nix
@@ -0,0 +1,36 @@
+{ lib, stdenv, rustPlatform, fetchFromGitHub, pkg-config, openssl, libiconv,
+  CoreServices, Security, SystemConfiguration
+}:
+
+rustPlatform.buildRustPackage rec {
+  pname = "trunk";
+  version = "0.21.4";
+
+  src = fetchFromGitHub {
+    owner = "trunk-rs";
+    repo = "trunk";
+    rev = "v${version}";
+    sha256 = "sha256-tU0Xob0dS1+rrfRVitwOe0K1AG05LHlGPHhFL0yOjxM=";
+  };
+
+  nativeBuildInputs = [ pkg-config ];
+  buildInputs = if stdenv.isDarwin
+    then [ libiconv CoreServices Security SystemConfiguration]
+    else [ openssl ];
+
+  # requires network
+  checkFlags = [ "--skip=tools::tests::download_and_install_binaries" ];
+
+  cargoHash = "sha256-iuxACtr91qWzojKWaieAd6kk/q9j5JSD1Fa50oCKogA=";
+
+  postConfigure = ''
+    cargo metadata --offline
+  '';
+
+  meta = with lib; {
+    homepage = "https://github.com/trunk-rs/trunk";
+    description = "Build, bundle & ship your Rust WASM application to the web";
+    maintainers = with maintainers; [ freezeboy flosse ];
+    license = with licenses; [ asl20 ];
+  };
+}

From 35b479e3136682a56f232c3fe1afbb68ed7eb1a4 Mon Sep 17 00:00:00 2001
From: Joachim Schiele 
Date: Tue, 11 Mar 2025 20:45:53 +0000
Subject: [PATCH 2/3] process_to_completion now processes PreData correctly

---
 html5ever/src/tree_builder/mod.rs   |  1 -
 html5ever/src/tree_builder/rules.rs | 15 +++++----------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs
index f3bf1051..41d2f4cc 100644
--- a/html5ever/src/tree_builder/mod.rs
+++ b/html5ever/src/tree_builder/mod.rs
@@ -394,7 +394,6 @@ where
                 },
                 PreData(pre_data) => {
                     println!("here1 PreData(node)");
-                    todo!();
                     assert!(more_tokens.is_empty());
                     return tokenizer::TokenSinkResult::PreData(pre_data);
                 }
diff --git a/html5ever/src/tree_builder/rules.rs b/html5ever/src/tree_builder/rules.rs
index 025dbad1..f5afc3db 100644
--- a/html5ever/src/tree_builder/rules.rs
+++ b/html5ever/src/tree_builder/rules.rs
@@ -41,6 +41,7 @@ where
     Sink: TreeSink,
 {
     pub(crate) fn step(&self, mode: InsertionMode, token: Token) -> ProcessResult {
+        println!("step");
         self.debug_step(mode, &token);
 
         match mode {
@@ -402,11 +403,6 @@ where
                 }
 
                 tag @ 
 => {
-                    // self.close_p_element_in_button_scope();
-                    // self.insert_element_for(tag);
-                    // self.ignore_lf.set(true);
-                    // self.frameset_ok.set(false);
-                    // self.to_raw_text_mode(PreData)
                     let elem = create_element(
                         &self.sink, QualName::new(None, ns!(html), local_name!("pre")),
                         tag.attrs);
@@ -416,7 +412,6 @@ where
                     self.insert_appropriately(AppendNode(elem.clone()), None);
                     self.open_elems.borrow_mut().push(elem);
                     self.to_raw_text_mode(PreData)
-
                 }
 
                 tag @ 
=> { @@ -796,10 +791,9 @@ where if tag.name == local_name!("script") { return Script(node); } - // if tag.name == local_name!("pre") { - // println!("here be pre"); - // return PreData(node); - // } + if tag.name == local_name!("pre") { + return ProcessResult::PreData(node); + } Done } @@ -1438,6 +1432,7 @@ where } pub(crate) fn step_foreign(&self, token: Token) -> ProcessResult { + println!("step_foreign"); match_token!(token { NullCharacterToken => { self.unexpected(&token); From f0e4e4a14d2f64cf67746914d2f88a989ef58829 Mon Sep 17 00:00:00 2001 From: Joachim Schiele Date: Tue, 11 Mar 2025 21:20:45 +0000 Subject: [PATCH 3/3] parse_pre option support for TreeBuilderOpts --- html5ever/src/tokenizer/mod.rs | 48 ++++++++++++++--------------- html5ever/src/tree_builder/mod.rs | 20 ++++++------ html5ever/src/tree_builder/rules.rs | 28 +++++++++++------ rcdom/examples/hello_xml.rs | 3 +- rcdom/examples/html2html.rs | 5 ++- 5 files changed, 56 insertions(+), 48 deletions(-) diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs index ab05cf78..880a3120 100644 --- a/html5ever/src/tokenizer/mod.rs +++ b/html5ever/src/tokenizer/mod.rs @@ -14,11 +14,11 @@ pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token}; pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind}; pub use self::interface::{TokenSink, TokenSinkResult}; +use self::char_ref::{CharRef, CharRefTokenizer}; use self::states::{DoctypeIdKind, Public, System}; use self::states::{DoubleEscaped, Escaped}; use self::states::{DoubleQuoted, SingleQuoted, Unquoted}; -use self::states::{Rawtext, Rcdata, ScriptData, PreData, ScriptDataEscaped}; -use self::char_ref::{CharRef, CharRefTokenizer}; +use self::states::{PreData, Rawtext, Rcdata, ScriptData, ScriptDataEscaped}; use crate::util::str::lower_ascii_letter; @@ -364,12 +364,8 @@ impl Tokenizer { match run { ProcessResult::Continue => (), ProcessResult::Suspend => break, - ProcessResult::Script(node) => return { - 
TokenizerResult::Script(node) - }, - ProcessResult::PreData(node) => return { - TokenizerResult::PreData(node) - }, + ProcessResult::Script(node) => return { TokenizerResult::Script(node) }, + ProcessResult::PreData(node) => return { TokenizerResult::PreData(node) }, } } } else { @@ -377,14 +373,18 @@ impl Tokenizer { match self.step(input) { ProcessResult::Continue => (), ProcessResult::Suspend => break, - ProcessResult::Script(node) => return { - println!(" TokenizerResult::Script(node)"); - TokenizerResult::Script(node) + ProcessResult::Script(node) => { + return { + println!(" TokenizerResult::Script(node)"); + TokenizerResult::Script(node) + } + }, + ProcessResult::PreData(node) => { + return { + println!(" TokenizerResult::PreData(node)"); + TokenizerResult::PreData(node) + } }, - ProcessResult::PreData(node) => return { - println!(" TokenizerResult::PreData(node)"); - TokenizerResult::PreData(node) - }, } } } @@ -461,7 +461,7 @@ impl Tokenizer { ProcessResult::Continue }, TokenSinkResult::Script(node) => { - println!("match self.process_token(token) for script"); + println!("match self.process_token(token) for script"); self.state.set(states::Data); ProcessResult::Script(node) }, @@ -786,13 +786,14 @@ impl Tokenizer { states::RawData(PreData) => { println!("====== states::RawData(PreData)"); loop { - match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) { - FromSet('\0') => go!(self: error; emit '\u{fffd}'), - FromSet('<') => go!(self: to RawLessThanSign PreData), - FromSet(c) => go!(self: emit c), - NotFromSet(b) => self.emit_chars(b), + match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) { + FromSet('\0') => go!(self: error; emit '\u{fffd}'), + FromSet('<') => go!(self: to RawLessThanSign PreData), + FromSet(c) => go!(self: emit c), + NotFromSet(b) => self.emit_chars(b), + } } - }}, + }, //§ script-data-double-escaped-state states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop { @@ -835,7 +836,7 @@ impl Tokenizer 
{ states::EndTagOpen => loop { match get_char!(self, input) { '>' => { - //println!("tttt {}", cl); + //println!("tttt {}", cl); go!(self: error; to Data) }, c => match lower_ascii_letter(c) { @@ -845,7 +846,6 @@ impl Tokenizer { } }, - //§ tag-name-state states::TagName => loop { match get_char!(self, input) { diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs index 41d2f4cc..1f107570 100644 --- a/html5ever/src/tree_builder/mod.rs +++ b/html5ever/src/tree_builder/mod.rs @@ -31,12 +31,12 @@ use std::iter::{Enumerate, Rev}; use std::{fmt, slice}; use crate::tokenizer::states::RawKind; +use crate::tokenizer::TagKind; use crate::tree_builder::tag_sets::*; use crate::util::str::to_escaped_string; use log::{debug, log_enabled, warn, Level}; use mac::format_if; use markup5ever::{expanded_name, local_name, namespace_prefix, namespace_url, ns}; -use crate::tokenizer::TagKind; pub use self::PushFlag::*; @@ -82,7 +82,7 @@ impl Default for TreeBuilderOpts { drop_doctype: false, ignore_missing_rules: false, quirks_mode: NoQuirks, - parse_pre: false, + parse_pre: true, } } } @@ -326,10 +326,10 @@ where fn debug_step(&self, mode: InsertionMode, token: &Token) { println!( - "processing {} in insertion mode {:?}", - to_escaped_string(token), - mode - ); + "processing {} in insertion mode {:?}", + to_escaped_string(token), + mode + ); } fn process_to_completion(&self, mut token: Token) -> TokenSinkResult { @@ -396,7 +396,7 @@ where println!("here1 PreData(node)"); assert!(more_tokens.is_empty()); return tokenizer::TokenSinkResult::PreData(pre_data); - } + }, ToPlaintext => { assert!(more_tokens.is_empty()); return tokenizer::TokenSinkResult::Plaintext; @@ -524,15 +524,15 @@ where return tokenizer::TokenSinkResult::Continue; } }, -// qknight + // qknight tokenizer::TagToken(x) => { println!("TagToken: {}", x.name); if *x.name == *"pre" { - if x.kind == TagKind::StartTag { + if x.kind == TagKind::StartTag { println!("start tag pre"); // Read everything 
until
as raw text // let mut pre_content = String::new(); - + // while let Some(token) = self.sink.next() { // match token { // Token::TagToken(ref tag) if tag.kind == TagKind::End && tag.name == local_name!("pre") => { diff --git a/html5ever/src/tree_builder/rules.rs b/html5ever/src/tree_builder/rules.rs index f5afc3db..7d4baecd 100644 --- a/html5ever/src/tree_builder/rules.rs +++ b/html5ever/src/tree_builder/rules.rs @@ -10,7 +10,7 @@ // The tree builder rules, as a single, enormous nested match expression. use crate::interface::Quirks; -use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData, PreData}; +use crate::tokenizer::states::{PreData, Rawtext, Rcdata, ScriptData}; use crate::tokenizer::TagKind::{EndTag, StartTag}; use crate::tree_builder::tag_sets::*; use crate::tree_builder::types::*; @@ -403,15 +403,23 @@ where } tag @
 => {
-                    let elem = create_element(
-                        &self.sink, QualName::new(None, ns!(html), local_name!("pre")),
-                        tag.attrs);
-                    if self.is_fragment() {
-                        self.sink.mark_script_already_started(&elem);
+                    if self.opts.parse_pre {
+                        self.close_p_element_in_button_scope();
+                        self.insert_element_for(tag);
+                        self.ignore_lf.set(true);
+                        self.frameset_ok.set(false);
+                        Done
+                    } else {
+                        let elem = create_element(
+                            &self.sink, QualName::new(None, ns!(html), local_name!("pre")),
+                            tag.attrs);
+                        if self.is_fragment() {
+                            self.sink.mark_script_already_started(&elem);
+                        }
+                        self.insert_appropriately(AppendNode(elem.clone()), None);
+                        self.open_elems.borrow_mut().push(elem);
+                        self.to_raw_text_mode(PreData)
                     }
-                    self.insert_appropriately(AppendNode(elem.clone()), None);
-                    self.open_elems.borrow_mut().push(elem);
-                    self.to_raw_text_mode(PreData)
                 }
 
                 tag @  => {
@@ -789,7 +797,7 @@ where
                     let node = self.pop();
                     self.mode.set(self.orig_mode.take().unwrap());
                     if tag.name == local_name!("script") {
-                        return Script(node);
+                        return ProcessResult::Script(node);
                     }
                     if tag.name == local_name!("pre") {
                         return ProcessResult::PreData(node);
diff --git a/rcdom/examples/hello_xml.rs b/rcdom/examples/hello_xml.rs
index 995b92c3..6bacecc5 100644
--- a/rcdom/examples/hello_xml.rs
+++ b/rcdom/examples/hello_xml.rs
@@ -19,7 +19,8 @@ use xml5ever::tree_builder::TreeSink;
 fn main() {
     // To parse a string into a tree of nodes, we need to invoke
     // `parse_document` and supply it with a TreeSink implementation (RcDom).
-    let dom: RcDom = parse_document(RcDom::default(), Default::default()).one("XML
asdf
"); + let dom: RcDom = parse_document(RcDom::default(), Default::default()) + .one("XML
asdf
"); // Do some processing let doc = &dom.document; diff --git a/rcdom/examples/html2html.rs b/rcdom/examples/html2html.rs index 01fe7fa8..fda39787 100644 --- a/rcdom/examples/html2html.rs +++ b/rcdom/examples/html2html.rs @@ -30,6 +30,7 @@ fn main() { let opts = ParseOpts { tree_builder: TreeBuilderOpts { drop_doctype: true, + parse_pre: false, ..Default::default() }, ..Default::default() @@ -37,8 +38,7 @@ fn main() { let input = "XML
\n text-in  pre

asdf

"; println!("-------- {} ----------", input); - let dom = parse_document(RcDom::default(), opts) - .one(input); + let dom = parse_document(RcDom::default(), opts).one(input); // let stdin = io::stdin(); // let dom = parse_document(RcDom::default(), opts) @@ -46,7 +46,6 @@ fn main() { // .read_from(&mut stdin.lock()) // .unwrap(); - // The validator.nu HTML2HTML always prints a doctype at the very beginning. io::stdout() .write_all(b"\n")