diff --git a/c_src/lazy_html.cpp b/c_src/lazy_html.cpp index e283c29..ce5a8ae 100644 --- a/c_src/lazy_html.cpp +++ b/c_src/lazy_html.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -43,8 +44,10 @@ auto resource = fine::Atom("resource"); struct DocumentRef { lxb_html_document_t *document; + bool is_fragment; - DocumentRef(lxb_html_document_t *document) : document(document) {} + DocumentRef(lxb_html_document_t *document, bool is_fragment) + : document(document), is_fragment(is_fragment) {} ~DocumentRef() { lxb_html_document_destroy(this->document); } }; @@ -97,7 +100,7 @@ ExLazyHTML from_document(ErlNifEnv *env, ErlNifBinary html) { throw std::runtime_error("failed to parse html document"); } - auto document_ref = std::make_shared(document); + auto document_ref = std::make_shared(document, false); document_guard.deactivate(); auto nodes = std::vector(); @@ -129,7 +132,7 @@ ExLazyHTML from_fragment(ErlNifEnv *env, ErlNifBinary html) { throw std::runtime_error("failed to parse html fragment"); } - auto document_ref = std::make_shared(document); + auto document_ref = std::make_shared(document, true); document_guard.deactivate(); auto nodes = std::vector(); @@ -522,7 +525,10 @@ ExLazyHTML from_tree(ErlNifEnv *env, std::vector tree) { nodes.push_back(node); } - auto document_ref = std::make_shared(document); + bool is_fragment = + nodes.empty() || !lxb_html_tree_node_is(nodes.front(), LXB_TAG_HTML); // fragment unless the first node is an html element + + auto document_ref = std::make_shared(document, is_fragment); document_guard.deactivate(); return ExLazyHTML(fine::make_resource(document_ref, nodes, false)); @@ -714,6 +720,57 @@ ExLazyHTML child_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { FINE_NIF(child_nodes, 0); +ExLazyHTML parent_node(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { + bool is_document = !ex_lazy_html.resource->document_ref->is_fragment; + auto nodes = std::vector(); + auto inserted_nodes = std::unordered_set(); + + for (auto node : ex_lazy_html.resource->nodes) { + auto
parent = lxb_dom_node_parent(node); + if (parent != NULL && parent->type == LXB_DOM_NODE_TYPE_ELEMENT && + (is_document || !lxb_html_tree_node_is(parent, LXB_TAG_HTML))) { // in a fragment, an html parent is not reported + auto inserted_node = inserted_nodes.find(parent); + if (inserted_node == inserted_nodes.end()) { + inserted_nodes.insert(parent); + nodes.push_back(parent); + } + } + } + return ExLazyHTML(fine::make_resource( + ex_lazy_html.resource->document_ref, nodes, true)); +} +FINE_NIF(parent_node, ERL_NIF_DIRTY_JOB_CPU_BOUND); + +std::vector nth_child(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { + auto values = std::vector(); + for (auto node : ex_lazy_html.resource->nodes) { + if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) { + continue; + } + + auto parent = lxb_dom_node_parent(node); + if (parent == NULL) { + // No parent: the node is a root, so its position is 1 + values.push_back(1); + } else { + int64_t i = 1; + for (auto child = lxb_dom_node_first_child(parent); child != NULL; + child = lxb_dom_node_next(child)) { + if (child == node) { + break; + } + if (child->type == LXB_DOM_NODE_TYPE_ELEMENT) { // only element siblings advance the position + i++; + } + } + values.push_back(i); + } + } + + return values; +} +FINE_NIF(nth_child, ERL_NIF_DIRTY_JOB_CPU_BOUND); + std::string text(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { auto document = ex_lazy_html.resource->document_ref->document; diff --git a/lib/lazy_html.ex b/lib/lazy_html.ex index d814697..b6a9907 100644 --- a/lib/lazy_html.ex +++ b/lib/lazy_html.ex @@ -357,6 +357,48 @@ defmodule LazyHTML do LazyHTML.NIF.child_nodes(lazy_html) end + @doc """ + Returns the (unique) parent nodes of the root nodes in `lazy_html`. + + ## Examples + + iex> lazy_html = LazyHTML.from_fragment(~S|
Hello world
|) + iex> spans = LazyHTML.query(lazy_html, "span") + iex> LazyHTML.parent_node(spans) + #LazyHTML< + 1 node (from selector) + #1 +
Hello world
+ > + + """ + @spec parent_node(t()) :: t() + def parent_node(lazy_html) do + LazyHTML.NIF.parent_node(lazy_html) + end + + @doc """ + Returns the position among its siblings for every root element in `lazy_html`. + + The position numbering is 1-based and only considers siblings that + are elements, so as to match the `:nth-child` CSS pseudo-class. + + Note that if there are text or comment root nodes, they are ignored, + and they have no corresponding number in the result. + + ## Examples + + iex> lazy_html = LazyHTML.from_fragment(~S|
12
|) + iex> spans = LazyHTML.query(lazy_html, "span") + iex> LazyHTML.nth_child(spans) + [1, 2] + + """ + @spec nth_child(t()) :: list(integer()) + def nth_child(lazy_html) do + LazyHTML.NIF.nth_child(lazy_html) + end + @doc """ Returns the text content of all nodes in `lazy_html`. diff --git a/lib/lazy_html/nif.ex b/lib/lazy_html/nif.ex index e7098ac..1661af9 100644 --- a/lib/lazy_html/nif.ex +++ b/lib/lazy_html/nif.ex @@ -21,6 +21,8 @@ defmodule LazyHTML.NIF do def filter(_lazy_html, _css_selector), do: err!() def query_by_id(_lazy_html, _id), do: err!() def child_nodes(_lazy_html), do: err!() + def parent_node(_lazy_html), do: err!() + def nth_child(_lazy_html), do: err!() def text(_lazy_html), do: err!() def attribute(_lazy_html, _name), do: err!() def attributes(_lazy_html), do: err!() diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index 422a36e..84bee63 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -250,6 +250,101 @@ defmodule LazyHTMLTest do end end + describe "parent_node/1" do + test "from selector of nodes on different levels" do + lazy_html = + LazyHTML.from_fragment(""" +
+
+ Hello +
+ world +
+ """) + + spans = LazyHTML.query(lazy_html, "span") + parents = LazyHTML.parent_node(spans) + parent_ids = parents |> LazyHTML.attribute("id") |> Enum.sort() + assert parent_ids == ["a", "b"] + + # the parent of div#a is null + grandparents = LazyHTML.parent_node(parents) + assert LazyHTML.tag(grandparents) == ["div"] + + great_grandparents = LazyHTML.parent_node(grandparents) + assert great_grandparents |> Enum.count() == 0 + end + + test "from selector of nodes on same level" do + lazy_html = + LazyHTML.from_fragment(""" +
+
+ Hello +
+
+ world +
+
+ """) + + spans = LazyHTML.query(lazy_html, "span") + parents = LazyHTML.parent_node(spans) + parent_ids = parents |> LazyHTML.attribute("id") |> Enum.sort() + assert parent_ids == ["b", "c"] + + # since they share the same parent, we now only have one node left + grandparent = LazyHTML.parent_node(parents) + assert LazyHTML.attribute(grandparent, "id") == ["a"] + end + + defp ancestor_chain(node) do + parent = LazyHTML.parent_node(node) + + if Enum.count(node) == 0 do + [] + else + ancestor_chain(parent) ++ LazyHTML.tag(parent) + end + end + + test "last parent node is if instantiated via from_document and similar" do + lazy_html = LazyHTML.from_document("
root
") + assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["html", "body"] + + lazy_html = LazyHTML.from_fragment("
root
") + assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == [] + + lazy_html = LazyHTML.from_tree([{"div", [], []}]) + assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == [] + + lazy_html = LazyHTML.from_tree([{"html", [], [{"body", [], [{"div", [], []}]}]}]) + assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["html", "body"] + end + end + + describe "nth_child/1" do + test "nth_child gives position" do + lazy_html = + LazyHTML.from_fragment(""" +
+ Text isn't counted. + 1 + + 2 +
+ """) + + assert LazyHTML.nth_child(lazy_html) == [1] + assert lazy_html["div"] |> LazyHTML.nth_child() == [1] + assert lazy_html["span"] |> LazyHTML.nth_child() == [1, 2] + + # Verify the numbering matches the :nth-child CSS selector + assert lazy_html["span:nth-child(1)"] |> LazyHTML.text() == "1" + assert lazy_html["span:nth-child(2)"] |> LazyHTML.text() == "2" + end + end + describe "query_by_id/2" do test "raises when an empty id is given" do assert_raise ArgumentError, ~r/id cannot be empty/, fn ->