Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 61 additions & 4 deletions c_src/lazy_html.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <stdexcept>
#include <string>
#include <tuple>
#include <unordered_set>
#include <variant>

#include <lexbor/html/html.h>
Expand Down Expand Up @@ -43,8 +44,10 @@ auto resource = fine::Atom("resource");

// Owns a parsed lexbor HTML document and destroys it in the destructor.
struct DocumentRef {
  lxb_html_document_t *document;
  // True when the nodes were produced from an HTML fragment rather than a
  // full document. The default initializer guarantees the flag has a
  // well-defined value even when the single-argument constructor is used.
  bool is_fragment = false;

  DocumentRef(lxb_html_document_t *document) : document(document) {}
  DocumentRef(lxb_html_document_t *document, bool is_fragment)
      : document(document), is_fragment(is_fragment) {}

  ~DocumentRef() { lxb_html_document_destroy(this->document); }
};
Expand Down Expand Up @@ -97,7 +100,7 @@ ExLazyHTML from_document(ErlNifEnv *env, ErlNifBinary html) {
throw std::runtime_error("failed to parse html document");
}

auto document_ref = std::make_shared<DocumentRef>(document);
auto document_ref = std::make_shared<DocumentRef>(document, false);
document_guard.deactivate();

auto nodes = std::vector<lxb_dom_node_t *>();
Expand Down Expand Up @@ -129,7 +132,7 @@ ExLazyHTML from_fragment(ErlNifEnv *env, ErlNifBinary html) {
throw std::runtime_error("failed to parse html fragment");
}

auto document_ref = std::make_shared<DocumentRef>(document);
auto document_ref = std::make_shared<DocumentRef>(document, true);
document_guard.deactivate();

auto nodes = std::vector<lxb_dom_node_t *>();
Expand Down Expand Up @@ -522,7 +525,10 @@ ExLazyHTML from_tree(ErlNifEnv *env, std::vector<fine::Term> tree) {
nodes.push_back(node);
}

auto document_ref = std::make_shared<DocumentRef>(document);
bool is_fragment =
nodes.empty() || !lxb_html_tree_node_is(nodes.front(), LXB_TAG_HTML);

auto document_ref = std::make_shared<DocumentRef>(document, is_fragment);
document_guard.deactivate();

return ExLazyHTML(fine::make_resource<LazyHTML>(document_ref, nodes, false));
Expand Down Expand Up @@ -714,6 +720,57 @@ ExLazyHTML child_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {

FINE_NIF(child_nodes, 0);

// Builds a new LazyHTML holding the distinct parent elements of the given
// nodes, in the order in which each parent is first encountered.
//
// Nodes whose parent is missing or is not an element contribute nothing.
// When the underlying document is a fragment, an <html> parent is skipped
// as well (presumably the wrapper element created by the fragment parser;
// confirm against from_fragment).
ExLazyHTML parent_node(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
  const bool from_fragment = ex_lazy_html.resource->document_ref->is_fragment;

  std::vector<lxb_dom_node_t *> parents;
  std::unordered_set<lxb_dom_node_t *> seen;

  for (auto node : ex_lazy_html.resource->nodes) {
    auto parent = lxb_dom_node_parent(node);

    if (parent == NULL || parent->type != LXB_DOM_NODE_TYPE_ELEMENT) {
      continue;
    }

    if (from_fragment && lxb_html_tree_node_is(parent, LXB_TAG_HTML)) {
      continue;
    }

    // insert() reports whether the pointer was newly added, which keeps the
    // result free of duplicates while preserving first-seen order.
    if (seen.insert(parent).second) {
      parents.push_back(parent);
    }
  }

  return ExLazyHTML(fine::make_resource<LazyHTML>(
      ex_lazy_html.resource->document_ref, parents, true));
}
FINE_NIF(parent_node, ERL_NIF_DIRTY_JOB_CPU_BOUND);

// Returns, for every element node in the given LazyHTML, its 1-based
// position among the element children of its parent. Non-element nodes in
// the input are skipped and produce no entry in the result.
std::vector<int64_t> nth_child(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
  std::vector<int64_t> positions;

  for (auto node : ex_lazy_html.resource->nodes) {
    if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
      continue;
    }

    // A node without a parent keeps the initial position of 1.
    int64_t position = 1;

    auto parent = lxb_dom_node_parent(node);
    if (parent != NULL) {
      // Count the element siblings that precede the node.
      auto sibling = lxb_dom_node_first_child(parent);
      while (sibling != NULL && sibling != node) {
        if (sibling->type == LXB_DOM_NODE_TYPE_ELEMENT) {
          position++;
        }
        sibling = lxb_dom_node_next(sibling);
      }
    }

    positions.push_back(position);
  }

  return positions;
}
FINE_NIF(nth_child, ERL_NIF_DIRTY_JOB_CPU_BOUND);

std::string text(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
auto document = ex_lazy_html.resource->document_ref->document;

Expand Down
42 changes: 42 additions & 0 deletions lib/lazy_html.ex
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,48 @@ defmodule LazyHTML do
LazyHTML.NIF.child_nodes(lazy_html)
end

@doc """
Returns the distinct parent elements of the root nodes in `lazy_html`.

A parent shared by several root nodes appears only once in the result.
Root nodes that have no parent element contribute nothing.

## Examples

    iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>Hello</span> <span>world</span></div>|)
    iex> spans = LazyHTML.query(lazy_html, "span")
    iex> LazyHTML.parent_node(spans)
    #LazyHTML<
      1 node (from selector)
      #1
      <div><span>Hello</span> <span>world</span></div>
    >

"""
@spec parent_node(t()) :: t()
def parent_node(lazy_html), do: LazyHTML.NIF.parent_node(lazy_html)

@doc """
Returns the 1-based sibling position of every root element in `lazy_html`.

Only element siblings are counted, so the numbers line up with the
`:nth-child` CSS pseudo-class.

Root nodes that are not elements (such as text or comments) are skipped
and have no corresponding entry in the returned list.

## Examples

    iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>1</span><span>2</span></div>|)
    iex> spans = LazyHTML.query(lazy_html, "span")
    iex> LazyHTML.nth_child(spans)
    [1, 2]

"""
@spec nth_child(t()) :: [integer()]
def nth_child(lazy_html), do: LazyHTML.NIF.nth_child(lazy_html)

@doc """
Returns the text content of all nodes in `lazy_html`.

Expand Down
2 changes: 2 additions & 0 deletions lib/lazy_html/nif.ex
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ defmodule LazyHTML.NIF do
# NOTE(review): every function here ignores its arguments and only calls
# `err!/0`; presumably these are placeholders that get replaced once the
# native library is loaded - confirm against the module's loader.
def filter(_lazy_html, _css_selector), do: err!()
def query_by_id(_lazy_html, _id), do: err!()
def child_nodes(_lazy_html), do: err!()
def parent_node(_lazy_html), do: err!()
def nth_child(_lazy_html), do: err!()
def text(_lazy_html), do: err!()
def attribute(_lazy_html, _name), do: err!()
def attributes(_lazy_html), do: err!()
Expand Down
95 changes: 95 additions & 0 deletions test/lazy_html_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,101 @@ defmodule LazyHTMLTest do
end
end

describe "parent_node/1" do
  test "from selector of nodes on different levels" do
    lazy_html =
      LazyHTML.from_fragment("""
      <div id="a">
      <div id="b">
      <span>Hello</span>
      </div>
      <span>world</span>
      </div>
      """)

    spans = LazyHTML.query(lazy_html, "span")
    parents = LazyHTML.parent_node(spans)
    parent_ids = parents |> LazyHTML.attribute("id") |> Enum.sort()
    assert parent_ids == ["a", "b"]

    # div#a is a top-level node of the fragment and has no parent, so the
    # only grandparent left is the parent of div#b, which is div#a itself.
    grandparents = LazyHTML.parent_node(parents)
    assert LazyHTML.tag(grandparents) == ["div"]

    great_grandparents = LazyHTML.parent_node(grandparents)
    assert Enum.empty?(great_grandparents)
  end

  test "from selector of nodes on same level" do
    lazy_html =
      LazyHTML.from_fragment("""
      <div id="a">
      <div id="b">
      <span>Hello</span>
      </div>
      <div id="c">
      <span>world</span>
      </div>
      </div>
      """)

    spans = LazyHTML.query(lazy_html, "span")
    parents = LazyHTML.parent_node(spans)
    parent_ids = parents |> LazyHTML.attribute("id") |> Enum.sort()
    assert parent_ids == ["b", "c"]

    # since they share the same parent, we now only have one node left
    grandparent = LazyHTML.parent_node(parents)
    assert LazyHTML.attribute(grandparent, "id") == ["a"]
  end

  # Returns the tag names of all ancestors of `node`, outermost first.
  # The emptiness check comes first so that no parent lookup is made for
  # an empty set of nodes.
  defp ancestor_chain(node) do
    if Enum.empty?(node) do
      []
    else
      parent = LazyHTML.parent_node(node)
      ancestor_chain(parent) ++ LazyHTML.tag(parent)
    end
  end

  test "last parent node is <html> if instantiated via from_document and similar" do
    lazy_html = LazyHTML.from_document("<html><body><div>root</div></body></html>")
    assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["html", "body"]

    lazy_html = LazyHTML.from_fragment("<div>root</div>")
    assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == []

    lazy_html = LazyHTML.from_tree([{"div", [], []}])
    assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == []

    lazy_html = LazyHTML.from_tree([{"html", [], [{"body", [], [{"div", [], []}]}]}])
    assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["html", "body"]
  end
end

describe "nth_child/1" do
  test "nth_child gives position" do
    fragment =
      LazyHTML.from_fragment("""
      <div>
      Text isn't counted.
      <span>1</span>
      <!-- neither are comments -->
      <span>2</span>
      </div>
      """)

    divs = fragment["div"]
    spans = fragment["span"]

    assert LazyHTML.nth_child(fragment) == [1]
    assert LazyHTML.nth_child(divs) == [1]
    assert LazyHTML.nth_child(spans) == [1, 2]

    # The numbering has to agree with the `:nth-child` CSS pseudo-class.
    assert LazyHTML.text(fragment["span:nth-child(1)"]) == "1"
    assert LazyHTML.text(fragment["span:nth-child(2)"]) == "2"
  end
end

describe "query_by_id/2" do
test "raises when an empty id is given" do
assert_raise ArgumentError, ~r/id cannot be empty/, fn ->
Expand Down