From ad689524ccbc467335f4e7dea1d01a29ff96f687 Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Tue, 23 Sep 2025 11:12:40 +0200 Subject: [PATCH 01/17] Implement parent_nodes --- c_src/lazy_html.cpp | 20 +++++++++++++++ lib/lazy_html.ex | 30 +++++++++++++++++++++++ lib/lazy_html/nif.ex | 1 + test/lazy_html_test.exs | 54 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 105 insertions(+) diff --git a/c_src/lazy_html.cpp b/c_src/lazy_html.cpp index e283c29..fea9c77 100644 --- a/c_src/lazy_html.cpp +++ b/c_src/lazy_html.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -714,6 +715,25 @@ ExLazyHTML child_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { FINE_NIF(child_nodes, 0); +ExLazyHTML parent_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { + auto nodes = std::vector(); + auto inserted_nodes = std::set(); + + for (auto node : ex_lazy_html.resource->nodes) { + auto parent = node->parent; + if (parent != NULL && parent->type == LXB_DOM_NODE_TYPE_ELEMENT) { + auto inserted_node = inserted_nodes.find(parent); + if (inserted_node == inserted_nodes.end()) { + inserted_nodes.insert(parent); + nodes.push_back(parent); + } + } + } + return ExLazyHTML(fine::make_resource( + ex_lazy_html.resource->document_ref, nodes, true)); +} +FINE_NIF(parent_nodes, ERL_NIF_DIRTY_JOB_CPU_BOUND); + std::string text(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { auto document = ex_lazy_html.resource->document_ref->document; diff --git a/lib/lazy_html.ex b/lib/lazy_html.ex index d814697..fdca13e 100644 --- a/lib/lazy_html.ex +++ b/lib/lazy_html.ex @@ -357,6 +357,36 @@ defmodule LazyHTML do LazyHTML.NIF.child_nodes(lazy_html) end + @doc """ + Returns the (unique) parent nodes of the root nodes in `lazy_html`. + + ## Examples + + iex> lazy_html = LazyHTML.from_fragment(~S|
Hello world
|) + iex> spans = LazyHTML.query(lazy_html, "span") + iex> LazyHTML.parent_nodes(spans) + #LazyHTML< + 1 node (from selector) + #1 +
Hello world
+ > + + The root node is always , even if initialized via `from_fragment/1`: + + iex> lazy_html = LazyHTML.from_fragment(~S|
root
|) + iex> LazyHTML.parent_nodes(lazy_html) + #LazyHTML< + 1 node (from selector) + #1 +
root
+ > + + """ + @spec parent_nodes(t()) :: t() + def parent_nodes(lazy_html) do + LazyHTML.NIF.parent_nodes(lazy_html) + end + @doc """ Returns the text content of all nodes in `lazy_html`. diff --git a/lib/lazy_html/nif.ex b/lib/lazy_html/nif.ex index e7098ac..a1e805c 100644 --- a/lib/lazy_html/nif.ex +++ b/lib/lazy_html/nif.ex @@ -21,6 +21,7 @@ defmodule LazyHTML.NIF do def filter(_lazy_html, _css_selector), do: err!() def query_by_id(_lazy_html, _id), do: err!() def child_nodes(_lazy_html), do: err!() + def parent_nodes(_lazy_html), do: err!() def text(_lazy_html), do: err!() def attribute(_lazy_html, _name), do: err!() def attributes(_lazy_html), do: err!() diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index 422a36e..2c737f7 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -250,6 +250,60 @@ defmodule LazyHTMLTest do end end + describe "parent_nodes/1" do + test "from selector of nodes on different levels" do + lazy_html = + LazyHTML.from_fragment(""" +
+
+ Hello +
+ world +
+ """) + + spans = LazyHTML.query(lazy_html, "span") + parents = LazyHTML.parent_nodes(spans) + parent_ids = parents |> Enum.flat_map(&LazyHTML.attribute(&1, "id")) |> Enum.sort() + assert parent_ids == ["0", "1"] + + # parent of div#id=0 is + grandparents = LazyHTML.parent_nodes(parents) + assert LazyHTML.tag(grandparents) |> Enum.sort() == ["div", "html"] + + # parent of is null, so it's filtered out + great_grandparents = LazyHTML.parent_nodes(grandparents) + assert great_grandparents |> Enum.count() == 1 + + # again, parent of is filtered out + assert LazyHTML.parent_nodes(great_grandparents) |> Enum.count() == 0 + end + + test "from selector of nodes on same level" do + lazy_html = + LazyHTML.from_fragment(""" +
+
+ Hello +
+
+ world +
+
+ """) + + spans = LazyHTML.query(lazy_html, "span") + parents = LazyHTML.parent_nodes(spans) + parent_ids = parents |> Enum.flat_map(&LazyHTML.attribute(&1, "id")) |> Enum.sort() + assert parent_ids == ["1", "2"] + + # since they share the same parent, we now only have one node left + grandparent = LazyHTML.parent_nodes(parents) + assert LazyHTML.attribute(grandparent, "id") == ["0"] + end + + end + describe "query_by_id/2" do test "raises when an empty id is given" do assert_raise ArgumentError, ~r/id cannot be empty/, fn -> From f52b687fdfcc182b9464f98348e56c730724f6db Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Tue, 23 Sep 2025 11:27:08 +0200 Subject: [PATCH 02/17] Implement equals? --- c_src/lazy_html.cpp | 6 ++++++ lib/lazy_html.ex | 23 +++++++++++++++++++++++ lib/lazy_html/nif.ex | 1 + 3 files changed, 30 insertions(+) diff --git a/c_src/lazy_html.cpp b/c_src/lazy_html.cpp index fea9c77..48677ea 100644 --- a/c_src/lazy_html.cpp +++ b/c_src/lazy_html.cpp @@ -822,6 +822,12 @@ std::uint64_t num_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { FINE_NIF(num_nodes, 0); +bool equals(ErlNifEnv *env, ExLazyHTML html_a, ExLazyHTML html_b) { + return (html_a.resource->document_ref == html_b.resource->document_ref && + html_a.resource->nodes == html_b.resource->nodes); +} +FINE_NIF(equals, 0); + std::vector tag(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { auto values = std::vector(); diff --git a/lib/lazy_html.ex b/lib/lazy_html.ex index fdca13e..18dc8d9 100644 --- a/lib/lazy_html.ex +++ b/lib/lazy_html.ex @@ -511,6 +511,29 @@ defmodule LazyHTML do LazyHTML.NIF.tag(lazy_html) end + @doc """ + Returns true if the lazy_html is selecting the same nodes starting from the same document. + + ## Examples + + iex> lazy_html = LazyHTML.from_fragment(~S|
Hello
|) + iex> a = LazyHTML.query(lazy_html, "#1") + iex> b = LazyHTML.query(lazy_html, "div > span") + iex> LazyHTML.equals?(a, b) + true + + Note that if the lazy_htmls are created separately, they are never equal: + + iex> html_a = LazyHTML.from_fragment(~S|
hello
|) + iex> html_b = LazyHTML.from_fragment(~S|
hello
|) + iex> LazyHTML.equals?(html_a, html_b) + false + """ + @spec equals?(t(), t()) :: boolean() + def equals?(html_a, html_b) do + LazyHTML.NIF.equals(html_a, html_b) + end + @doc ~S""" Escapes the given string to make a valid HTML text. diff --git a/lib/lazy_html/nif.ex b/lib/lazy_html/nif.ex index a1e805c..9f26abb 100644 --- a/lib/lazy_html/nif.ex +++ b/lib/lazy_html/nif.ex @@ -28,6 +28,7 @@ defmodule LazyHTML.NIF do def tag(_lazy_html), do: err!() def nodes(_lazy_html), do: err!() def num_nodes(_lazy_html), do: err!() + def equals(_lazy_html_a, _lazy_html_b), do: err!() defp err!(), do: :erlang.nif_error(:not_loaded) end From 922e53bb2e8a933d7e462484f03d85bad5e8a5e2 Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Tue, 23 Sep 2025 11:27:44 +0200 Subject: [PATCH 03/17] Implement parent_node helper --- lib/lazy_html.ex | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/lib/lazy_html.ex b/lib/lazy_html.ex index 18dc8d9..a00a6ea 100644 --- a/lib/lazy_html.ex +++ b/lib/lazy_html.ex @@ -387,6 +387,30 @@ defmodule LazyHTML do LazyHTML.NIF.parent_nodes(lazy_html) end + @doc """ + Returns the parent nodes of the root nodes in `lazy_html`. + Useful when you're expecting a single, shared parent. + """ + def parent_node(lazy_html) do + parent = LazyHTML.NIF.parent_nodes(lazy_html) + + case LazyHTML.NIF.num_nodes(parent) do + 0 -> {:ok, nil} + 1 -> {:ok, parent} + _ -> {:error, :multiple_parents} + end + end + + @doc """ + Same as `parent_node/1` but raises on multiple parents + """ + def parent_node!(lazy_html) do + case parent_node(lazy_html) do + {:ok, res} -> res + {:error, :multiple_parents} -> raise "Selected nodes have multiple parents" + end + end + @doc """ Returns the text content of all nodes in `lazy_html`. From f6a49649b9424048b76699ab99f66217dd29aa43 Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Tue, 23 Sep 2025 11:31:01 +0200 Subject: [PATCH 04/17] Test construction of css path from node --- test/lazy_html_test.exs | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index 2c737f7..9e3e150 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -302,6 +302,42 @@ defmodule LazyHTMLTest do assert LazyHTML.attribute(grandparent, "id") == ["0"] end + defp get_css_path(node, acc) do + parent = LazyHTML.parent_node!(node) + + if parent do + siblings = + LazyHTML.child_nodes(parent) + |> Enum.reject(fn n -> LazyHTML.tag(n) == [] end) + + [tag] = LazyHTML.tag(node) + i = Enum.find_index(siblings, fn n -> LazyHTML.equals?(n, node) end) + get_css_path(parent, [{tag, i} | acc]) + else + acc |> Enum.map_join(" > ", fn {tag, i} -> "#{tag}:nth-child(#{i + 1})" end) + end + end + + test "construct nth-child selector by traversing parents" do + lazy_html = + LazyHTML.from_fragment(""" +
+
+ wibble +
+
+ wobble +
+
+ """) + + span = LazyHTML.query(lazy_html, ".wobble span") + path = get_css_path(span, []) + assert path == "div:nth-child(1) > div:nth-child(2) > span:nth-child(1)" + + span2 = LazyHTML.query(lazy_html, path) + assert LazyHTML.equals?(span, span2) + end end describe "query_by_id/2" do From a594dd9f864bfb2cc255772149d0330503b652da Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Wed, 24 Sep 2025 14:44:27 +0200 Subject: [PATCH 05/17] Use singular parent_node as per library convention --- c_src/lazy_html.cpp | 4 ++-- lib/lazy_html.ex | 32 ++++---------------------------- lib/lazy_html/nif.ex | 2 +- test/lazy_html_test.exs | 19 ++++++++++--------- 4 files changed, 17 insertions(+), 40 deletions(-) diff --git a/c_src/lazy_html.cpp b/c_src/lazy_html.cpp index 48677ea..02f997c 100644 --- a/c_src/lazy_html.cpp +++ b/c_src/lazy_html.cpp @@ -715,7 +715,7 @@ ExLazyHTML child_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { FINE_NIF(child_nodes, 0); -ExLazyHTML parent_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { +ExLazyHTML parent_node(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { auto nodes = std::vector(); auto inserted_nodes = std::set(); @@ -732,7 +732,7 @@ ExLazyHTML parent_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { return ExLazyHTML(fine::make_resource( ex_lazy_html.resource->document_ref, nodes, true)); } -FINE_NIF(parent_nodes, ERL_NIF_DIRTY_JOB_CPU_BOUND); +FINE_NIF(parent_node, ERL_NIF_DIRTY_JOB_CPU_BOUND); std::string text(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { auto document = ex_lazy_html.resource->document_ref->document; diff --git a/lib/lazy_html.ex b/lib/lazy_html.ex index a00a6ea..91f24ab 100644 --- a/lib/lazy_html.ex +++ b/lib/lazy_html.ex @@ -364,7 +364,7 @@ defmodule LazyHTML do iex> lazy_html = LazyHTML.from_fragment(~S|
Hello world
|) iex> spans = LazyHTML.query(lazy_html, "span") - iex> LazyHTML.parent_nodes(spans) + iex> LazyHTML.parent_node(spans) #LazyHTML< 1 node (from selector) #1 @@ -374,7 +374,7 @@ defmodule LazyHTML do The root node is always , even if initialized via `from_fragment/1`: iex> lazy_html = LazyHTML.from_fragment(~S|
root
|) - iex> LazyHTML.parent_nodes(lazy_html) + iex> LazyHTML.parent_node(lazy_html) #LazyHTML< 1 node (from selector) #1 @@ -382,33 +382,9 @@ defmodule LazyHTML do > """ - @spec parent_nodes(t()) :: t() - def parent_nodes(lazy_html) do - LazyHTML.NIF.parent_nodes(lazy_html) - end - - @doc """ - Returns the parent nodes of the root nodes in `lazy_html`. - Useful when you're expecting a single, shared parent. - """ + @spec parent_node(t()) :: t() def parent_node(lazy_html) do - parent = LazyHTML.NIF.parent_nodes(lazy_html) - - case LazyHTML.NIF.num_nodes(parent) do - 0 -> {:ok, nil} - 1 -> {:ok, parent} - _ -> {:error, :multiple_parents} - end - end - - @doc """ - Same as `parent_node/1` but raises on multiple parents - """ - def parent_node!(lazy_html) do - case parent_node(lazy_html) do - {:ok, res} -> res - {:error, :multiple_parents} -> raise "Selected nodes have multiple parents" - end + LazyHTML.NIF.parent_node(lazy_html) end @doc """ diff --git a/lib/lazy_html/nif.ex b/lib/lazy_html/nif.ex index 9f26abb..f519752 100644 --- a/lib/lazy_html/nif.ex +++ b/lib/lazy_html/nif.ex @@ -21,7 +21,7 @@ defmodule LazyHTML.NIF do def filter(_lazy_html, _css_selector), do: err!() def query_by_id(_lazy_html, _id), do: err!() def child_nodes(_lazy_html), do: err!() - def parent_nodes(_lazy_html), do: err!() + def parent_node(_lazy_html), do: err!() def text(_lazy_html), do: err!() def attribute(_lazy_html, _name), do: err!() def attributes(_lazy_html), do: err!() diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index 9e3e150..462a8a1 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -250,7 +250,7 @@ defmodule LazyHTMLTest do end end - describe "parent_nodes/1" do + describe "parent_node/1" do test "from selector of nodes on different levels" do lazy_html = LazyHTML.from_fragment(""" @@ -263,20 +263,20 @@ defmodule LazyHTMLTest do """) spans = LazyHTML.query(lazy_html, "span") - parents = LazyHTML.parent_nodes(spans) + parents = LazyHTML.parent_node(spans) parent_ids = parents |> Enum.flat_map(&LazyHTML.attribute(&1, "id")) |> Enum.sort() assert parent_ids == ["0", "1"] # parent of div#id=0 is - grandparents = LazyHTML.parent_nodes(parents) + grandparents = LazyHTML.parent_node(parents) assert LazyHTML.tag(grandparents) |> Enum.sort() == ["div", "html"] # parent of is null, so it's filtered out - great_grandparents = LazyHTML.parent_nodes(grandparents) + great_grandparents = LazyHTML.parent_node(grandparents) assert great_grandparents |> Enum.count() == 1 # again, parent of is filtered out - assert LazyHTML.parent_nodes(great_grandparents) |> Enum.count() == 0 + assert LazyHTML.parent_node(great_grandparents) |> Enum.count() == 0 end test "from selector of nodes on same level" do @@ -293,19 +293,20 @@ defmodule LazyHTMLTest do """) spans = LazyHTML.query(lazy_html, "span") - parents = LazyHTML.parent_nodes(spans) + parents = LazyHTML.parent_node(spans) parent_ids = parents |> Enum.flat_map(&LazyHTML.attribute(&1, "id")) |> Enum.sort() assert parent_ids == ["1", "2"] # since they share the same parent, we now only have one node left - grandparent = LazyHTML.parent_nodes(parents) + grandparent = LazyHTML.parent_node(parents) assert LazyHTML.attribute(grandparent, "id") == ["0"] end defp get_css_path(node, acc) do - parent = LazyHTML.parent_node!(node) + 1 = Enum.count(node) + parent = LazyHTML.parent_node(node) - if parent do + if Enum.count(parent) > 0 do siblings = LazyHTML.child_nodes(parent) |> Enum.reject(fn n -> LazyHTML.tag(n) == [] end) From 50c0727740c3c72d831cd060fb0ccf0c2d2a8687 Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Wed, 24 Sep 2025 14:49:23 +0200 Subject: [PATCH 06/17] Avoid using numbers as ids --- lib/lazy_html.ex | 4 ++-- test/lazy_html_test.exs | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/lazy_html.ex b/lib/lazy_html.ex index 91f24ab..991851b 100644 --- a/lib/lazy_html.ex +++ b/lib/lazy_html.ex @@ -516,8 +516,8 @@ defmodule LazyHTML do ## Examples - iex> lazy_html = LazyHTML.from_fragment(~S|
Hello
|) - iex> a = LazyHTML.query(lazy_html, "#1") + iex> lazy_html = LazyHTML.from_fragment(~S|
Hello
|) + iex> a = LazyHTML.query(lazy_html, "#a") iex> b = LazyHTML.query(lazy_html, "div > span") iex> LazyHTML.equals?(a, b) true diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index 462a8a1..12aaadf 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -254,8 +254,8 @@ defmodule LazyHTMLTest do test "from selector of nodes on different levels" do lazy_html = LazyHTML.from_fragment(""" -
-
+
+
Hello
world @@ -265,9 +265,9 @@ defmodule LazyHTMLTest do spans = LazyHTML.query(lazy_html, "span") parents = LazyHTML.parent_node(spans) parent_ids = parents |> Enum.flat_map(&LazyHTML.attribute(&1, "id")) |> Enum.sort() - assert parent_ids == ["0", "1"] + assert parent_ids == ["a", "b"] - # parent of div#id=0 is + # parent of div#id="a" is grandparents = LazyHTML.parent_node(parents) assert LazyHTML.tag(grandparents) |> Enum.sort() == ["div", "html"] @@ -282,11 +282,11 @@ defmodule LazyHTMLTest do test "from selector of nodes on same level" do lazy_html = LazyHTML.from_fragment(""" -
-
+
+
Hello
-
+
world
@@ -295,11 +295,11 @@ defmodule LazyHTMLTest do spans = LazyHTML.query(lazy_html, "span") parents = LazyHTML.parent_node(spans) parent_ids = parents |> Enum.flat_map(&LazyHTML.attribute(&1, "id")) |> Enum.sort() - assert parent_ids == ["1", "2"] + assert parent_ids == ["b", "c"] # since they share the same parent, we now only have one node left grandparent = LazyHTML.parent_node(parents) - assert LazyHTML.attribute(grandparent, "id") == ["0"] + assert LazyHTML.attribute(grandparent, "id") == ["a"] end defp get_css_path(node, acc) do From a98b1bc8d1ae773082254e96ccda126a091e09b7 Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Wed, 24 Sep 2025 16:40:37 +0200 Subject: [PATCH 07/17] Implement nth_child --- c_src/lazy_html.cpp | 30 ++++++++++++++++++++++++++++++ lib/lazy_html.ex | 17 +++++++++++++++++ lib/lazy_html/nif.ex | 1 + test/lazy_html_test.exs | 22 ++++++++++++++++++++++ 4 files changed, 70 insertions(+) diff --git a/c_src/lazy_html.cpp b/c_src/lazy_html.cpp index 02f997c..0a775e0 100644 --- a/c_src/lazy_html.cpp +++ b/c_src/lazy_html.cpp @@ -734,6 +734,36 @@ ExLazyHTML parent_node(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { } FINE_NIF(parent_node, ERL_NIF_DIRTY_JOB_CPU_BOUND); +std::vector nth_child(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { + auto values = std::vector(); + for (auto node : ex_lazy_html.resource->nodes) { + if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) { + continue; + } + + auto parent = node->parent; + if (parent == NULL) { + // We're at the root, nth_child is 1 + values.push_back(1); + } else { + int64_t i = 1; + for (auto child = lxb_dom_node_first_child(parent); child != NULL; + child = lxb_dom_node_next(child)) { + if (child == node) { + break; + } + if (child->type == LXB_DOM_NODE_TYPE_ELEMENT) { + i++; + } + } + values.push_back(i); + } + } + + return values; +} +FINE_NIF(nth_child, ERL_NIF_DIRTY_JOB_CPU_BOUND); + std::string text(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { auto document = ex_lazy_html.resource->document_ref->document; diff --git a/lib/lazy_html.ex b/lib/lazy_html.ex index 991851b..9c8a69f 100644 --- a/lib/lazy_html.ex +++ b/lib/lazy_html.ex @@ -387,6 +387,23 @@ defmodule LazyHTML do LazyHTML.NIF.parent_node(lazy_html) end + @doc """ + Returns the positions of the selcted nodes among their siblings. + Note that the numbering is 1 based and doesn't include text nodes, + as it matches the `nth-child` CSS selector. + + ## Examples + + iex> lazy_html = LazyHTML.from_fragment(~S|
12
|) + iex> spans = LazyHTML.query(lazy_html, "span") + iex> LazyHTML.nth_child(spans) + [1, 2] + """ + @spec nth_child(t()) :: list(integer()) + def nth_child(lazy_html) do + LazyHTML.NIF.nth_child(lazy_html) + end + @doc """ Returns the text content of all nodes in `lazy_html`. diff --git a/lib/lazy_html/nif.ex b/lib/lazy_html/nif.ex index f519752..6c6a815 100644 --- a/lib/lazy_html/nif.ex +++ b/lib/lazy_html/nif.ex @@ -22,6 +22,7 @@ defmodule LazyHTML.NIF do def query_by_id(_lazy_html, _id), do: err!() def child_nodes(_lazy_html), do: err!() def parent_node(_lazy_html), do: err!() + def nth_child(_lazy_html), do: err!() def text(_lazy_html), do: err!() def attribute(_lazy_html, _name), do: err!() def attributes(_lazy_html), do: err!() diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index 12aaadf..507983a 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -341,6 +341,28 @@ defmodule LazyHTMLTest do end end + describe "nth_child/1" do + test "nth_child gives position" do + lazy_html = + LazyHTML.from_fragment(""" +
+ Text isn't counted. + 1 + + 2 +
+ """) + + assert LazyHTML.nth_child(lazy_html) == [1] + assert lazy_html["div"] |> LazyHTML.nth_child() == [1] + assert lazy_html["span"] |> LazyHTML.nth_child() == [1, 2] + + # Verify numbering matches css selector + assert lazy_html["span:nth-child(1)"] |> LazyHTML.text() == "1" + assert lazy_html["span:nth-child(2)"] |> LazyHTML.text() == "2" + end + end + describe "query_by_id/2" do test "raises when an empty id is given" do assert_raise ArgumentError, ~r/id cannot be empty/, fn -> From 83845cddc308130dd370822d227f8406bd28a2de Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Wed, 24 Sep 2025 16:41:16 +0200 Subject: [PATCH 08/17] Simplify get_css_path by using nth_child --- test/lazy_html_test.exs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index 507983a..9f2f1ac 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -307,15 +307,11 @@ defmodule LazyHTMLTest do parent = LazyHTML.parent_node(node) if Enum.count(parent) > 0 do - siblings = - LazyHTML.child_nodes(parent) - |> Enum.reject(fn n -> LazyHTML.tag(n) == [] end) - [tag] = LazyHTML.tag(node) - i = Enum.find_index(siblings, fn n -> LazyHTML.equals?(n, node) end) + [i] = LazyHTML.nth_child(node) get_css_path(parent, [{tag, i} | acc]) else - acc |> Enum.map_join(" > ", fn {tag, i} -> "#{tag}:nth-child(#{i + 1})" end) + acc |> Enum.map_join(" > ", fn {tag, i} -> "#{tag}:nth-child(#{i})" end) end end From 8669e1d7c6d5f7280dae760e908752fcb6d27bac Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Thu, 25 Sep 2025 11:20:18 +0200 Subject: [PATCH 09/17] Use unordered_set instead of set --- c_src/lazy_html.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_src/lazy_html.cpp b/c_src/lazy_html.cpp index 0a775e0..32f576b 100644 --- a/c_src/lazy_html.cpp +++ b/c_src/lazy_html.cpp @@ -4,10 +4,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -717,7 +717,7 @@ FINE_NIF(child_nodes, 0); ExLazyHTML parent_node(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { auto nodes = std::vector(); - auto inserted_nodes = std::set(); + auto inserted_nodes = std::unordered_set(); for (auto node : ex_lazy_html.resource->nodes) { auto parent = node->parent; From 7e9d14ef95b6e07363adb39213e6325c3b62d8d8 Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Fri, 26 Sep 2025 11:05:06 +0200 Subject: [PATCH 10/17] Remove equals? --- c_src/lazy_html.cpp | 6 ------ lib/lazy_html.ex | 23 ----------------------- lib/lazy_html/nif.ex | 1 - test/lazy_html_test.exs | 2 +- 4 files changed, 1 insertion(+), 31 deletions(-) diff --git a/c_src/lazy_html.cpp b/c_src/lazy_html.cpp index 32f576b..3e4bc43 100644 --- a/c_src/lazy_html.cpp +++ b/c_src/lazy_html.cpp @@ -852,12 +852,6 @@ std::uint64_t num_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { FINE_NIF(num_nodes, 0); -bool equals(ErlNifEnv *env, ExLazyHTML html_a, ExLazyHTML html_b) { - return (html_a.resource->document_ref == html_b.resource->document_ref && - html_a.resource->nodes == html_b.resource->nodes); -} -FINE_NIF(equals, 0); - std::vector tag(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { auto values = std::vector(); diff --git a/lib/lazy_html.ex b/lib/lazy_html.ex index 9c8a69f..2bb694d 100644 --- a/lib/lazy_html.ex +++ b/lib/lazy_html.ex @@ -528,29 +528,6 @@ defmodule LazyHTML do LazyHTML.NIF.tag(lazy_html) end - @doc """ - Returns true if the lazy_html is selecting the same nodes starting from the same document. - - ## Examples - - iex> lazy_html = LazyHTML.from_fragment(~S|
Hello
|) - iex> a = LazyHTML.query(lazy_html, "#a") - iex> b = LazyHTML.query(lazy_html, "div > span") - iex> LazyHTML.equals?(a, b) - true - - Note that if the lazy_htmls are created separately, they are never equal: - - iex> html_a = LazyHTML.from_fragment(~S|
hello
|) - iex> html_b = LazyHTML.from_fragment(~S|
hello
|) - iex> LazyHTML.equals?(html_a, html_b) - false - """ - @spec equals?(t(), t()) :: boolean() - def equals?(html_a, html_b) do - LazyHTML.NIF.equals(html_a, html_b) - end - @doc ~S""" Escapes the given string to make a valid HTML text. diff --git a/lib/lazy_html/nif.ex b/lib/lazy_html/nif.ex index 6c6a815..1661af9 100644 --- a/lib/lazy_html/nif.ex +++ b/lib/lazy_html/nif.ex @@ -29,7 +29,6 @@ defmodule LazyHTML.NIF do def tag(_lazy_html), do: err!() def nodes(_lazy_html), do: err!() def num_nodes(_lazy_html), do: err!() - def equals(_lazy_html_a, _lazy_html_b), do: err!() defp err!(), do: :erlang.nif_error(:not_loaded) end diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index 9f2f1ac..a6de64f 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -333,7 +333,7 @@ defmodule LazyHTMLTest do assert path == "div:nth-child(1) > div:nth-child(2) > span:nth-child(1)" span2 = LazyHTML.query(lazy_html, path) - assert LazyHTML.equals?(span, span2) + assert LazyHTML.text(span) == LazyHTML.text(span2) end end From 0594c3f7f30694f0049c79bcde28ceefc3e84b70 Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Sat, 27 Sep 2025 10:25:04 +0200 Subject: [PATCH 11/17] Make parent of fragment root nil instead of html --- c_src/lazy_html.cpp | 23 ++++++++++++++++------- lib/lazy_html.ex | 10 ---------- test/lazy_html_test.exs | 39 +++++++++++++++++++++++++++++---------- 3 files changed, 45 insertions(+), 27 deletions(-) diff --git a/c_src/lazy_html.cpp b/c_src/lazy_html.cpp index 3e4bc43..7b8f7c7 100644 --- a/c_src/lazy_html.cpp +++ b/c_src/lazy_html.cpp @@ -44,8 +44,10 @@ auto resource = fine::Atom("resource"); struct DocumentRef { lxb_html_document_t *document; + bool is_fragment; - DocumentRef(lxb_html_document_t *document) : document(document) {} + DocumentRef(lxb_html_document_t *document, bool is_fragment) + : document(document), is_fragment(is_fragment) {} ~DocumentRef() { lxb_html_document_destroy(this->document); } }; @@ -98,7 +100,7 @@ ExLazyHTML from_document(ErlNifEnv *env, ErlNifBinary html) { throw std::runtime_error("failed to parse html document"); } - auto document_ref = std::make_shared(document); + auto document_ref = std::make_shared(document, false); document_guard.deactivate(); auto nodes = std::vector(); @@ -130,7 +132,7 @@ ExLazyHTML from_fragment(ErlNifEnv *env, ErlNifBinary html) { throw std::runtime_error("failed to parse html fragment"); } - auto document_ref = std::make_shared(document); + auto document_ref = std::make_shared(document, true); document_guard.deactivate(); auto nodes = std::vector(); @@ -523,7 +525,12 @@ ExLazyHTML from_tree(ErlNifEnv *env, std::vector tree) { nodes.push_back(node); } - auto document_ref = std::make_shared(document); + bool is_fragment = true; + if (!nodes.empty() && lxb_html_tree_node_is(nodes.front(), LXB_TAG_HTML)) { + is_fragment = false; + } + + auto document_ref = std::make_shared(document, is_fragment); document_guard.deactivate(); return ExLazyHTML(fine::make_resource(document_ref, nodes, false)); @@ -716,12 +723,14 @@ ExLazyHTML child_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { FINE_NIF(child_nodes, 0); ExLazyHTML parent_node(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { + bool is_document = !ex_lazy_html.resource->document_ref->is_fragment; auto nodes = std::vector(); auto inserted_nodes = std::unordered_set(); for (auto node : ex_lazy_html.resource->nodes) { - auto parent = node->parent; - if (parent != NULL && parent->type == LXB_DOM_NODE_TYPE_ELEMENT) { + auto parent = lxb_dom_node_parent(node); + if (parent != NULL && parent->type == LXB_DOM_NODE_TYPE_ELEMENT && + (is_document || !lxb_html_tree_node_is(parent, LXB_TAG_HTML))) { auto inserted_node = inserted_nodes.find(parent); if (inserted_node == inserted_nodes.end()) { inserted_nodes.insert(parent); @@ -741,7 +750,7 @@ std::vector nth_child(ErlNifEnv *env, ExLazyHTML ex_lazy_html) { continue; } - auto parent = node->parent; + auto parent = lxb_dom_node_parent(node); if (parent == NULL) { // We're at the root, nth_child is 1 values.push_back(1); diff --git a/lib/lazy_html.ex b/lib/lazy_html.ex index 2bb694d..98a18d9 100644 --- a/lib/lazy_html.ex +++ b/lib/lazy_html.ex @@ -371,16 +371,6 @@ defmodule LazyHTML do
Hello world
> - The root node is always , even if initialized via `from_fragment/1`: - - iex> lazy_html = LazyHTML.from_fragment(~S|
root
|) - iex> LazyHTML.parent_node(lazy_html) - #LazyHTML< - 1 node (from selector) - #1 -
root
- > - """ @spec parent_node(t()) :: t() def parent_node(lazy_html) do diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index a6de64f..803b631 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -267,16 +267,12 @@ defmodule LazyHTMLTest do parent_ids = parents |> Enum.flat_map(&LazyHTML.attribute(&1, "id")) |> Enum.sort() assert parent_ids == ["a", "b"] - # parent of div#id="a" is + # parent of div#id="a" is null grandparents = LazyHTML.parent_node(parents) - assert LazyHTML.tag(grandparents) |> Enum.sort() == ["div", "html"] + assert LazyHTML.tag(grandparents) == ["div"] - # parent of is null, so it's filtered out great_grandparents = LazyHTML.parent_node(grandparents) - assert great_grandparents |> Enum.count() == 1 - - # again, parent of is filtered out - assert LazyHTML.parent_node(great_grandparents) |> Enum.count() == 0 + assert great_grandparents |> Enum.count() == 0 end test "from selector of nodes on same level" do @@ -302,16 +298,39 @@ defmodule LazyHTMLTest do assert LazyHTML.attribute(grandparent, "id") == ["a"] end + defp parents(node) do + if Enum.count(node) == 0 do + [] + else + tag = LazyHTML.tag(node) + parents(LazyHTML.parent_node(node)) ++ tag + end + end + + test "last parent node is if instantiated via from_document and similar" do + lazy_html = LazyHTML.from_document("
root
") + assert parents(lazy_html["div"]) == ["html", "body", "div"] + + lazy_html = LazyHTML.from_fragment("
root
") + assert parents(lazy_html["div"]) == ["div"] + + lazy_html = LazyHTML.from_tree([{"div", [], []}]) + assert parents(lazy_html["div"]) == ["div"] + + lazy_html = LazyHTML.from_tree([{"html", [], [{"body", [], [{"div", [], []}]}]}]) + assert parents(lazy_html["div"]) == ["html", "body", "div"] + end + defp get_css_path(node, acc) do 1 = Enum.count(node) parent = LazyHTML.parent_node(node) + [tag] = LazyHTML.tag(node) + [i] = LazyHTML.nth_child(node) if Enum.count(parent) > 0 do - [tag] = LazyHTML.tag(node) - [i] = LazyHTML.nth_child(node) get_css_path(parent, [{tag, i} | acc]) else - acc |> Enum.map_join(" > ", fn {tag, i} -> "#{tag}:nth-child(#{i})" end) + [{tag, i} | acc] |> Enum.map_join(" > ", fn {tag, i} -> "#{tag}:nth-child(#{i})" end) end end From e2dbf91a559871b1700245c833bd4ca44ef4f740 Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Fri, 3 Oct 2025 15:01:58 +0200 Subject: [PATCH 12/17] Improve documentation of nth_child MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply suggestion from @jonatanklosko Co-authored-by: Jonatan Kłosko --- lib/lazy_html.ex | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/lazy_html.ex b/lib/lazy_html.ex index 98a18d9..b6a9907 100644 --- a/lib/lazy_html.ex +++ b/lib/lazy_html.ex @@ -378,9 +378,13 @@ defmodule LazyHTML do end @doc """ - Returns the positions of the selcted nodes among their siblings. - Note that the numbering is 1 based and doesn't include text nodes, - as it matches the `nth-child` CSS selector. + Returns the position among its siblings for every root element in `lazy_html`. + + The position numbering is 1-based and only considers siblings that + are elements, as to match the `:nth-child` CSS pseudo-class. + + Note that if there are text or comment root nodes, they are ignored, + and they have no corresponding number in the result. ## Examples @@ -388,6 +392,7 @@ defmodule LazyHTML do iex> spans = LazyHTML.query(lazy_html, "span") iex> LazyHTML.nth_child(spans) [1, 2] + """ @spec nth_child(t()) :: list(integer()) def nth_child(lazy_html) do From a4e7d69b538f50b8c0e5769246cba228e4a29fe4 Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Fri, 3 Oct 2025 15:25:48 +0200 Subject: [PATCH 13/17] Inline boolean expression --- c_src/lazy_html.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/c_src/lazy_html.cpp b/c_src/lazy_html.cpp index 7b8f7c7..ce5a8ae 100644 --- a/c_src/lazy_html.cpp +++ b/c_src/lazy_html.cpp @@ -525,10 +525,8 @@ ExLazyHTML from_tree(ErlNifEnv *env, std::vector tree) { nodes.push_back(node); } - bool is_fragment = true; - if (!nodes.empty() && lxb_html_tree_node_is(nodes.front(), LXB_TAG_HTML)) { - is_fragment = false; - } + bool is_fragment = + nodes.empty() || !lxb_html_tree_node_is(nodes.front(), LXB_TAG_HTML); auto document_ref = std::make_shared(document, is_fragment); document_guard.deactivate(); From 1b73ce7c3bfb71cfef2aaec4965444102a6a251d Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Fri, 3 Oct 2025 15:27:34 +0200 Subject: [PATCH 14/17] Rename test helper to ancestor_chain --- test/lazy_html_test.exs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index 803b631..a15da81 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -298,27 +298,26 @@ defmodule LazyHTMLTest do assert LazyHTML.attribute(grandparent, "id") == ["a"] end - defp parents(node) do + defp ancestor_chain(node) do if Enum.count(node) == 0 do [] else - tag = LazyHTML.tag(node) - parents(LazyHTML.parent_node(node)) ++ tag + ancestor_chain(LazyHTML.parent_node(node)) ++ LazyHTML.tag(node) end end test "last parent node is if instantiated via from_document and similar" do lazy_html = LazyHTML.from_document("
root
") - assert parents(lazy_html["div"]) == ["html", "body", "div"] + assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["html", "body", "div"] lazy_html = LazyHTML.from_fragment("
root
") - assert parents(lazy_html["div"]) == ["div"] + assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["div"] lazy_html = LazyHTML.from_tree([{"div", [], []}]) - assert parents(lazy_html["div"]) == ["div"] + assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["div"] lazy_html = LazyHTML.from_tree([{"html", [], [{"body", [], [{"div", [], []}]}]}]) - assert parents(lazy_html["div"]) == ["html", "body", "div"] + assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["html", "body", "div"] end defp get_css_path(node, acc) do From 323deb76009f0bf045b0800d3003c0a569d820b3 Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Fri, 3 Oct 2025 15:29:06 +0200 Subject: [PATCH 15/17] Remove API guidance test function --- test/lazy_html_test.exs | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index a15da81..84ed4e5 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -319,40 +319,6 @@ defmodule LazyHTMLTest do lazy_html = LazyHTML.from_tree([{"html", [], [{"body", [], [{"div", [], []}]}]}]) assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["html", "body", "div"] end - - defp get_css_path(node, acc) do - 1 = Enum.count(node) - parent = LazyHTML.parent_node(node) - [tag] = LazyHTML.tag(node) - [i] = LazyHTML.nth_child(node) - - if Enum.count(parent) > 0 do - get_css_path(parent, [{tag, i} | acc]) - else - [{tag, i} | acc] |> Enum.map_join(" > ", fn {tag, i} -> "#{tag}:nth-child(#{i})" end) - end - end - - test "construct nth-child selector by traversing parents" do - lazy_html = - LazyHTML.from_fragment(""" -
-
- wibble -
-
- wobble -
-
- """) - - span = LazyHTML.query(lazy_html, ".wobble span") - path = get_css_path(span, []) - assert path == "div:nth-child(1) > div:nth-child(2) > span:nth-child(1)" - - span2 = LazyHTML.query(lazy_html, path) - assert LazyHTML.text(span) == LazyHTML.text(span2) - end end describe "nth_child/1" do From 9062d95f6b8d368e44096ba69817f45020fe0fca Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Fri, 3 Oct 2025 15:32:28 +0200 Subject: [PATCH 16/17] Don't include self in ancestor_chain --- test/lazy_html_test.exs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index 84ed4e5..6311261 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -299,25 +299,27 @@ defmodule LazyHTMLTest do end defp ancestor_chain(node) do + parent = LazyHTML.parent_node(node) + if Enum.count(node) == 0 do [] else - ancestor_chain(LazyHTML.parent_node(node)) ++ LazyHTML.tag(node) + ancestor_chain(parent) ++ LazyHTML.tag(parent) end end test "last parent node is if instantiated via from_document and similar" do lazy_html = LazyHTML.from_document("
root
") - assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["html", "body", "div"] + assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["html", "body"] lazy_html = LazyHTML.from_fragment("
root
") - assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["div"] + assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == [] lazy_html = LazyHTML.from_tree([{"div", [], []}]) - assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["div"] + assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == [] lazy_html = LazyHTML.from_tree([{"html", [], [{"body", [], [{"div", [], []}]}]}]) - assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["html", "body", "div"] + assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["html", "body"] end end From 82de17953ce862746ca073d51b6546c6b9175e2f Mon Sep 17 00:00:00 2001 From: spicychickensauce Date: Fri, 3 Oct 2025 15:36:58 +0200 Subject: [PATCH 17/17] Remove unnecessary flat_map --- test/lazy_html_test.exs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/lazy_html_test.exs b/test/lazy_html_test.exs index 6311261..84bee63 100644 --- a/test/lazy_html_test.exs +++ b/test/lazy_html_test.exs @@ -264,7 +264,7 @@ defmodule LazyHTMLTest do spans = LazyHTML.query(lazy_html, "span") parents = LazyHTML.parent_node(spans) - parent_ids = parents |> Enum.flat_map(&LazyHTML.attribute(&1, "id")) |> Enum.sort() + parent_ids = parents |> LazyHTML.attribute("id") |> Enum.sort() assert parent_ids == ["a", "b"] # parent of div#id="a" is null @@ -290,7 +290,7 @@ defmodule LazyHTMLTest do spans = LazyHTML.query(lazy_html, "span") parents = LazyHTML.parent_node(spans) - parent_ids = parents |> Enum.flat_map(&LazyHTML.attribute(&1, "id")) |> Enum.sort() + parent_ids = parents |> LazyHTML.attribute("id") |> Enum.sort() assert parent_ids == ["b", "c"] # since they share the same parent, we now only have one node left