Skip to content

Commit

Permalink
html-rewriter: Support streaming content replacements (#3211)
Browse files Browse the repository at this point in the history
* Upgrade lol-html to 2.1.0 (changes how C API is accessed)

* html-rewriter: Support streaming content replacements

* Update TS types
  • Loading branch information
npaun authored Dec 20, 2024
1 parent e35f34b commit c77df01
Show file tree
Hide file tree
Showing 24 changed files with 1,320 additions and 315 deletions.
8 changes: 4 additions & 4 deletions deps/rust/cargo.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ PACKAGES = {
"cxx": crate.spec(version = "1"),
"cxxbridge-cmd": crate.spec(version = "1"),
"flate2": crate.spec(version = "1"),
"lol_html_c_api": crate.spec(
git = "https://github.com/cloudflare/lol-html.git",
rev = "cac9f2f59aea8ad803286b0aae0d667926f441c7",
),
# Commit hash refers to lol-html v2.1.0. We then access the nested lol_html_c_api crate within.
# TODO(npaun): The next release of lol-html could change the way we access the nested crate.
# Check once https://github.com/cloudflare/lol-html/pull/247 is in a release.
"lol_html_c_api": crate.spec(git = "https://github.com/cloudflare/lol-html.git", rev = "cac9f2f59aea8ad803286b0aae0d667926f441c7"),
"nix": crate.spec(version = "0"),
"pico-args": crate.spec(version = "0"),
"proc-macro2": crate.spec(version = "1"),
Expand Down
297 changes: 249 additions & 48 deletions src/workerd/api/html-rewriter.c++

Large diffs are not rendered by default.

80 changes: 37 additions & 43 deletions src/workerd/api/html-rewriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,21 @@ class HTMLRewriter: public jsg::Object {
kj::Own<Impl> impl;
};

// A chunk of text or HTML which can be passed to content token mutation functions.
using Content = kj::OneOf<kj::String, jsg::Ref<ReadableStream>, jsg::Ref<Response>>;
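// ReadableStream/Response content is handled by the streaming replacement path (see
// rewriteContentGeneric); the former TODO(soon) about needing fibers or lol-html saveable state
// no longer applies.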

// Options bag which can be passed to content token mutation functions.
struct ContentOptions {
// True if the Content being passed to the mutation function is HTML. If false, the content will
// be escaped (HTML entity-encoded).
jsg::Optional<bool> html;

JSG_STRUCT(html);
};

class Rewriter;

// =======================================================================================
// HTML Content Tokens
//
Expand All @@ -140,23 +155,25 @@ class HTMLRewriter: public jsg::Object {
class HTMLRewriter::Token: public jsg::Object {
public:
virtual void htmlContentScopeEnd() = 0;
};

// A chunk of text or HTML which can be passed to content token mutation functions.
using Content = kj::OneOf<kj::String, jsg::Ref<ReadableStream>, jsg::Ref<Response>>;
// TODO(soon): Support ReadableStream/Response types. Requires fibers or lol-html saveable state.
// Each Token subclass has an inner ImplBase subclass which holds a reference
// to the rewriter, and the actual underlying lol-html C API handle for the token.
template <typename CType>
struct ImplBase {
ImplBase(CType& element, Rewriter& rewriter);
KJ_DISALLOW_COPY_AND_MOVE(ImplBase);
~ImplBase() noexcept(false);

// Options bag which can be passed to content token mutation functions.
struct ContentOptions {
// True if the Content being passed to the mutation function is HTML. If false, the content will
// be escaped (HTML entity-encoded).
jsg::Optional<bool> html;
// Dispatches calls to the underlying lol_html methods for each event (e.g. before, after, replace).
// Handles replacements of each supported type (string, ReadableStream, Body).
template <auto Func, auto StreamingFunc>
void rewriteContentGeneric(Content content, jsg::Optional<ContentOptions> options);

JSG_STRUCT(html);
CType& element;
Rewriter& rewriter;
};
};

class Rewriter;

class Element final: public HTMLRewriter::Token {
public:
using CType = lol_html_Element;
Expand Down Expand Up @@ -214,28 +231,16 @@ class Element final: public HTMLRewriter::Token {

JSG_TS_ROOT();
JSG_TS_OVERRIDE({
before(content: string, options?: ContentOptions): Element;
after(content: string, options?: ContentOptions): Element;
prepend(content: string, options?: ContentOptions): Element;
append(content: string, options?: ContentOptions): Element;
replace(content: string, options?: ContentOptions): Element;
setInnerContent(content: string, options?: ContentOptions): Element;

onEndTag(handler: (tag: EndTag) => void | Promise<void>): void;
onEndTag(handler: (tag: EndTag) => void | Promise<void>): void;
});
// Require content to be a string, and specify parameter type for onEndTag
// callback function
// Specify parameter type for onEndTag callback function
}

private:
struct Impl {
Impl(CType& element, Rewriter&);
KJ_DISALLOW_COPY_AND_MOVE(Impl);
struct Impl: public HTMLRewriter::Token::ImplBase<CType> {
using HTMLRewriter::Token::ImplBase<CType>::ImplBase;
~Impl() noexcept(false);

CType& element;
kj::Vector<jsg::Ref<AttributesIterator>> attributesIterators;
Rewriter& rewriter;
};

kj::Maybe<Impl> impl;
Expand Down Expand Up @@ -278,7 +283,7 @@ class EndTag final: public HTMLRewriter::Token {
public:
using CType = lol_html_EndTag;

explicit EndTag(CType& tag, Rewriter&);
explicit EndTag(CType& tag, Rewriter& rewriter);

kj::String getName();
void setName(kj::String);
Expand All @@ -295,15 +300,10 @@ class EndTag final: public HTMLRewriter::Token {
JSG_METHOD(remove);

JSG_TS_ROOT();
JSG_TS_OVERRIDE({
before(content: string, options?: ContentOptions): EndTag;
after(content: string, options?: ContentOptions): EndTag;
});
// Require content to be a string
}

private:
kj::Maybe<CType&> impl;
kj::Maybe<HTMLRewriter::Token::ImplBase<CType>> impl;

void htmlContentScopeEnd() override;
};
Expand Down Expand Up @@ -352,7 +352,7 @@ class Text final: public HTMLRewriter::Token {
public:
using CType = lol_html_TextChunk;

explicit Text(CType& text, Rewriter&);
explicit Text(CType& text, Rewriter& rewriter);

kj::String getText();

Expand All @@ -376,16 +376,10 @@ class Text final: public HTMLRewriter::Token {
JSG_METHOD(remove);

JSG_TS_ROOT();
JSG_TS_OVERRIDE({
before(content: string, options?: ContentOptions): Text;
after(content: string, options?: ContentOptions): Text;
replace(content: string, options?: ContentOptions): Text;
});
// Require content to be a string
}

private:
kj::Maybe<CType&> impl;
kj::Maybe<HTMLRewriter::Token::ImplBase<CType>> impl;

void htmlContentScopeEnd() override;
};
Expand Down
150 changes: 150 additions & 0 deletions src/workerd/api/tests/htmlrewriter-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,156 @@ export const manualWriting2 = {
},
};

// Prepends the body stream of a data: URL fetch (as escaped text, `html: false`)
// to every matched element and checks the rewritten document.
export const streamingReplacement = {
  async test() {
    const pipe = new TransformStream();

    const rewriter = new HTMLRewriter().on('*', {
      async element(el) {
        const fetched = await fetch(
          'data:,the quick brown fox jumped over the lazy dog%20'
        );
        // Hand the ReadableStream itself to the rewriter rather than a string.
        el.prepend(fetched.body, { html: false });
      },
    });
    const rewritten = rewriter.transform(new Response(pipe.readable));

    const sink = pipe.writable.getWriter();
    const utf8 = new TextEncoder();

    // Feed the document in three separate chunks, then finish the input.
    for (const piece of ['<html>', 'bar', '</html>']) {
      await sink.write(utf8.encode(piece));
    }
    await sink.close();

    // The TransformStream here is the JavaScript one, so reading may begin
    // only after all the writes have been issued.
    const actual = await rewritten.text();
    strictEqual(
      actual,
      `<html>the quick brown fox jumped over the lazy dog bar</html>`
    );
  },
};

// Same as the plain-text streaming case, but the streamed content is passed
// with `html: true`, so the markup is expected to appear unescaped.
export const streamingReplacementHTML = {
  async test() {
    const pipe = new TransformStream();

    const rewriter = new HTMLRewriter().on('*', {
      async element(el) {
        // NOTE(review): this URL ends with a literal space, yet the expected
        // output below has no space before `bar` — presumably the trailing
        // space is stripped when the URL is parsed; confirm.
        const fetched = await fetch('data:,<b>such markup <i>much wow</i></b> ');
        el.prepend(fetched.body, { html: true });
      },
    });
    const rewritten = rewriter.transform(new Response(pipe.readable));

    const sink = pipe.writable.getWriter();
    const utf8 = new TextEncoder();

    // Feed the document in three separate chunks, then finish the input.
    for (const piece of ['<html>', 'bar', '</html>']) {
      await sink.write(utf8.encode(piece));
    }
    await sink.close();

    // The TransformStream here is the JavaScript one, so reading may begin
    // only after all the writes have been issued.
    const actual = await rewritten.text();
    strictEqual(actual, `<html><b>such markup <i>much wow</i></b>bar</html>`);
  },
};

// Replaces a whole `.dinosaur` element with streamed text and checks that
// neither the original element nor its content remains in the output.
export const streamingReplacementReplace = {
  async test() {
    const pipe = new TransformStream();

    const rewriter = new HTMLRewriter().on('.dinosaur', {
      async element(el) {
        const fetched = await fetch('data:,goodbye world');
        el.replace(fetched.body, { html: false });
      },
    });
    const rewritten = rewriter.transform(new Response(pipe.readable));

    const sink = pipe.writable.getWriter();
    const utf8 = new TextEncoder();

    // Feed the document in three separate chunks, then finish the input.
    const pieces = [
      '<html>',
      '<div class="dinosaur">hello world</div>',
      '</html>',
    ];
    for (const piece of pieces) {
      await sink.write(utf8.encode(piece));
    }
    await sink.close();

    // The TransformStream here is the JavaScript one, so reading may begin
    // only after all the writes have been issued.
    const actual = await rewritten.text();
    strictEqual(actual, `<html>goodbye world</html>`);
  },
};

// Applies two streamed replacements (a prepend and an append) to the same
// element, passing Response objects directly instead of their body streams.
export const streamingReplacementMultiple = {
  async test() {
    const pipe = new TransformStream();

    const rewriter = new HTMLRewriter().on('*', {
      async element(el) {
        // Keep the order: the first Response is handed over before the second
        // fetch is started.
        const leading = await fetch('data:,alpha%20');
        el.prepend(leading);
        const trailing = await fetch('data:,%20gamma');
        el.append(trailing);
      },
    });
    const rewritten = rewriter.transform(new Response(pipe.readable));

    const sink = pipe.writable.getWriter();
    const utf8 = new TextEncoder();

    // Feed the document in three separate chunks, then finish the input.
    for (const piece of ['<html>', 'beta', '</html>']) {
      await sink.write(utf8.encode(piece));
    }
    await sink.close();

    // The TransformStream here is the JavaScript one, so reading may begin
    // only after all the writes have been issued.
    const actual = await rewritten.text();
    strictEqual(actual, `<html>alpha beta gamma</html>`);
  },
};

// Streams replacement content whose bytes (0xE2 0x28 0xA1) are not valid
// UTF-8 and expects reading the rewritten response to reject.
export const streamingReplacementBadUTF8 = {
  async test() {
    const pipe = new TransformStream();

    const rewriter = new HTMLRewriter().on('*', {
      async element(el) {
        const malformed = await fetch('data:,garbage%e2%28%a1');
        el.prepend(malformed);
      },
    });
    const rewritten = rewriter.transform(new Response(pipe.readable));

    const sink = pipe.writable.getWriter();
    const utf8 = new TextEncoder();

    // Feed the document in three separate chunks, then finish the input.
    for (const piece of ['<html>', 'bar', '</html>']) {
      await sink.write(utf8.encode(piece));
    }
    await sink.close();

    // The TransformStream here is the JavaScript one, so reading may begin
    // only after all the writes have been issued.
    const pendingText = rewritten.text();
    await rejects(pendingText, { message: 'Parser error: Invalid UTF-8' });
  },
};

export const appendOnEnd = {
async test() {
const kInput =
Expand Down
55 changes: 44 additions & 11 deletions types/generated-snapshot/2021-11-03/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1416,20 +1416,44 @@ interface Element {
hasAttribute(name: string): boolean;
setAttribute(name: string, value: string): Element;
removeAttribute(name: string): Element;
before(content: string, options?: ContentOptions): Element;
after(content: string, options?: ContentOptions): Element;
prepend(content: string, options?: ContentOptions): Element;
append(content: string, options?: ContentOptions): Element;
replace(content: string, options?: ContentOptions): Element;
before(
content: string | ReadableStream | Response,
options?: ContentOptions,
): Element;
after(
content: string | ReadableStream | Response,
options?: ContentOptions,
): Element;
prepend(
content: string | ReadableStream | Response,
options?: ContentOptions,
): Element;
append(
content: string | ReadableStream | Response,
options?: ContentOptions,
): Element;
replace(
content: string | ReadableStream | Response,
options?: ContentOptions,
): Element;
remove(): Element;
removeAndKeepContent(): Element;
setInnerContent(content: string, options?: ContentOptions): Element;
setInnerContent(
content: string | ReadableStream | Response,
options?: ContentOptions,
): Element;
onEndTag(handler: (tag: EndTag) => void | Promise<void>): void;
}
interface EndTag {
name: string;
before(content: string, options?: ContentOptions): EndTag;
after(content: string, options?: ContentOptions): EndTag;
before(
content: string | ReadableStream | Response,
options?: ContentOptions,
): EndTag;
after(
content: string | ReadableStream | Response,
options?: ContentOptions,
): EndTag;
remove(): EndTag;
}
interface Comment {
Expand All @@ -1444,9 +1468,18 @@ interface Text {
readonly text: string;
readonly lastInTextNode: boolean;
readonly removed: boolean;
before(content: string, options?: ContentOptions): Text;
after(content: string, options?: ContentOptions): Text;
replace(content: string, options?: ContentOptions): Text;
before(
content: string | ReadableStream | Response,
options?: ContentOptions,
): Text;
after(
content: string | ReadableStream | Response,
options?: ContentOptions,
): Text;
replace(
content: string | ReadableStream | Response,
options?: ContentOptions,
): Text;
remove(): Text;
}
interface DocumentEnd {
Expand Down
Loading

0 comments on commit c77df01

Please sign in to comment.