Skip to content

Commit

Permalink
Replace BEGIN/END with Position enum (#51)
Browse files Browse the repository at this point in the history
  • Loading branch information
thomasst authored Jun 28, 2024
1 parent 5e7a533 commit 89176b6
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 141 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ line-length = 79
ignore = [
"ISC001",
"PLR2004",
"S101",
"TRY003",
# Some patterns contain special characters.
"PLR0911",
Expand Down
43 changes: 25 additions & 18 deletions quotequail/_html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# HTML utils
import enum
from collections.abc import Iterator
from typing import TYPE_CHECKING, TypeAlias

Expand All @@ -10,8 +11,14 @@

from ._patterns import FORWARD_LINE, FORWARD_STYLES, MULTIPLE_WHITESPACE_RE


@enum.unique
class Position(enum.Enum):
    """Side of an HTML element that a reference or token points at.

    Used as the second item of an ``ElementRef`` tuple and of the
    ``(element, position, indentation_level)`` tokens yielded by the tree
    token generator. Compare members by identity, e.g.
    ``ref[1] is Position.Begin``.
    """

    # The start of the element (emitted before its text and children).
    Begin = "begin"
    # The end of the element (emitted after its children).
    End = "end"


Element: TypeAlias = "HtmlElement"
ElementRef = tuple["Element", str]
ElementRef = tuple["Element", Position]

INLINE_TAGS = [
"a",
Expand All @@ -31,9 +38,6 @@
"th",
]

BEGIN = "begin"
END = "end"


def trim_tree_after(element: Element, include_element: bool = True):
"""
Expand Down Expand Up @@ -184,9 +188,9 @@ def slice_tree(
new_tree = tree

if start_ref:
include_start = start_ref[1] == BEGIN
include_start = start_ref[1] is Position.Begin
if end_ref:
include_end = end_ref[1] == END
include_end = end_ref[1] is Position.End

# If start_ref is the same as end_ref, and we don't include the element,
# we are removing the entire tree. We need to handle this separately,
Expand Down Expand Up @@ -283,14 +287,14 @@ def is_indentation_element(element: Element) -> bool:

def tree_token_generator(
el: Element, indentation_level: int = 0
) -> Iterator[None | tuple[Element, str, int] | str]:
) -> Iterator[None | tuple[Element, Position, int] | str]:
"""
Yield tokens for the given HTML element as follows:
- A tuple (LXML element, BEGIN, indentation_level)
- A tuple (LXML element, Begin, indentation_level)
- Text right after the start of the tag, or None.
- Recursively calls the token generator for all child objects
- A tuple (LXML element, END, indentation_level)
- A tuple (LXML element, End, indentation_level)
- Text right after the end of the tag, or None.
"""
if not isinstance(el.tag, str):
Expand All @@ -301,7 +305,7 @@ def tree_token_generator(
if is_indentation:
indentation_level += 1

yield (el, BEGIN, indentation_level)
yield (el, Position.Begin, indentation_level)

yield el.text

Expand All @@ -311,7 +315,7 @@ def tree_token_generator(
if is_indentation:
indentation_level -= 1

yield (el, END, indentation_level)
yield (el, Position.End, indentation_level)

yield el.tail

Expand All @@ -320,7 +324,10 @@ def tree_line_generator(
el: Element, max_lines: int | None = None
) -> Iterator[
tuple[
tuple[ElementRef, str] | None, tuple[ElementRef, str] | None, int, str
tuple[ElementRef, Position] | None,
tuple[ElementRef, Position] | None,
int,
str,
]
]:
"""
Expand All @@ -343,14 +350,14 @@ def tree_line_generator(
For example, the HTML tree "<div>foo <span>bar</span><br>baz</div>" yields:
- ((<Element div>, 'begin'), (<Element br>, 'begin'), 0, 'foo bar')
- ((<Element br>, 'end'), (<Element div>, 'end'), 0, 'baz').
- ((<Element div>, Begin), (<Element br>, Begin), 0, 'foo bar')
- ((<Element br>, End), (<Element div>, End), 0, 'baz').
To illustrate the indentation level, the HTML tree
'<div><blockquote>hi</blockquote>world</div>' yields:
- ((<Element blockquote>, 'begin'), (<Element blockquote>, 'end'), 1, 'hi')
- ((<Element blockquote>, 'end'), (<Element div>, 'end'), 0, 'world')
- ((<Element blockquote>, Begin), (<Element blockquote>, End), 1, 'hi')
- ((<Element blockquote>, End), (<Element div>, End), 0, 'world')
"""

def _trim_spaces(text: str) -> str:
Expand Down Expand Up @@ -378,11 +385,11 @@ def _trim_spaces(text: str) -> str:

tag_name = el.tag.lower()

line_break = tag_name == "br" and state == BEGIN
line_break = tag_name == "br" and state is Position.Begin
is_block = tag_name not in INLINE_TAGS
is_forward = (
is_block
and state == BEGIN
and state is Position.Begin
and (style := el.attrib.get("style"))
and any(style_re.match(style) for style_re in FORWARD_STYLES)
)
Expand Down
106 changes: 106 additions & 0 deletions tests/test_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from quotequail._html import (
Position,
get_html_tree,
render_html_tree,
tree_line_generator,
trim_tree_after,
trim_tree_before,
)


def test_tree_line_generator():
    """Check the lines produced by ``tree_line_generator``.

    Each expected item is a tuple of
    ``(begin reference, end reference, indentation level, text)`` where a
    reference is an ``(element, Position)`` pair.
    """
    # A <br> splits the content of the <div> into two lines.
    tree = get_html_tree("<div>foo <span>bar</span><br>baz</div>")
    data = list(tree_line_generator(tree))
    div = tree.xpath("div")[0]
    br = tree.xpath("div/br")[0]
    assert data == [
        ((div, Position.Begin), (br, Position.Begin), 0, "foo bar"),
        ((br, Position.End), (div, Position.End), 0, "baz"),
    ]

    # With max_lines=1 only the first line is produced. The same tree is
    # reused, so the div/br elements looked up above are still valid.
    data = list(tree_line_generator(tree, max_lines=1))
    assert data == [
        ((div, Position.Begin), (br, Position.Begin), 0, "foo bar"),
    ]

    # An <h1> inside the <div> gets a line of its own.
    tree = get_html_tree("<div><h1>foo</h1>bar</div>")
    data = list(tree_line_generator(tree))
    div = tree.xpath("div")[0]
    h1 = tree.xpath("div/h1")[0]
    assert data == [
        ((h1, Position.Begin), (h1, Position.End), 0, "foo"),
        ((h1, Position.End), (div, Position.End), 0, "bar"),
    ]

    # Text inside a <blockquote> is reported at indentation level 1.
    tree = get_html_tree("<div><blockquote>hi</blockquote>world</div>")
    data = list(tree_line_generator(tree))
    div = tree.xpath("div")[0]
    blockquote = tree.xpath("div/blockquote")[0]
    assert data == [
        ((blockquote, Position.Begin), (blockquote, Position.End), 1, "hi"),
        ((blockquote, Position.End), (div, Position.End), 0, "world"),
    ]

    # Every table row yields one line with the text of its cells joined.
    tree = get_html_tree(
        """
<table>
<tr><td>Subject: </td><td>the subject</td></tr>
<tr><td>From: </td><td>from line</td></tr>
</table>"""
    )
    data = list(tree_line_generator(tree))
    tr1, tr2 = tree.xpath("table/tr")
    assert data == [
        (
            (tr1, Position.Begin),
            (tr1, Position.End),
            0,
            "Subject: the subject",
        ),
        ((tr2, Position.Begin), (tr2, Position.End), 0, "From: from line"),
    ]


def test_trim_after():
    """Check ``trim_tree_after`` with and without keeping the element.

    Every case parses the same markup afresh, trims after the element found
    at the given path and compares the rendered result.
    """
    html = "<div>A<span>B</span>C<span>D</span>E</div>"
    keep = {}  # rely on the function's default for include_element
    drop = {"include_element": False}

    # (path of the element to trim after, extra keyword arguments, expected)
    cases = [
        ("div/span", keep, "<div>A<span>B</span></div>"),
        ("div/span[2]", keep, "<div>A<span>B</span>C<span>D</span></div>"),
        ("div/span", drop, "<div>A</div>"),
        ("div/span[2]", drop, "<div>A<span>B</span>C</div>"),
    ]

    for path, options, expected in cases:
        tree = get_html_tree(html)
        trim_tree_after(tree.find(path), **options)
        assert render_html_tree(tree) == expected


def test_trim_before():
    """Check ``trim_tree_before`` with and without keeping the element.

    Every case parses the same markup afresh, trims before the element found
    at the given path and compares the rendered result.
    """
    html = "<div>A<span>B</span>C<span>D</span>E</div>"
    keep = {}  # rely on the function's default for include_element
    drop = {"include_element": False}

    # (path of the element to trim before, extra keyword arguments, expected)
    cases = [
        ("div/span", keep, "<div><span>B</span>C<span>D</span>E</div>"),
        ("div/span[2]", keep, "<div><span>D</span>E</div>"),
        ("div/span", drop, "<div>C<span>D</span>E</div>"),
        ("div/span[2]", drop, "<div>E</div>"),
    ]

    for path, options, expected in cases:
        tree = get_html_tree(html)
        trim_tree_before(tree.find(path), **options)
        assert render_html_tree(tree) == expected
123 changes: 0 additions & 123 deletions tests/test_quotequail.py
Original file line number Diff line number Diff line change
Expand Up @@ -1172,129 +1172,6 @@ def test_extract_headers(self):
),
)

def test_tree_line_generator(self):
    """Check the line tuples produced by ``_html.tree_line_generator``.

    Each expected item is a tuple of
    ``(begin reference, end reference, indentation level, text)`` where a
    reference is an ``(element, "begin" | "end")`` pair.
    """
    from quotequail import _html

    parse = _html.get_html_tree
    lines_of = _html.tree_line_generator

    # A <br> splits the content of the <div> into two lines.
    tree = parse("<div>foo <span>bar</span><br>baz</div>")
    data = list(lines_of(tree))
    div = tree.xpath("div")[0]
    br = tree.xpath("div/br")[0]
    expected = [
        ((div, "begin"), (br, "begin"), 0, "foo bar"),
        ((br, "end"), (div, "end"), 0, "baz"),
    ]
    self.assertEqual(data, expected)

    # With max_lines=1 only the first line is produced.
    data = list(lines_of(tree, max_lines=1))
    div = tree.xpath("div")[0]
    br = tree.xpath("div/br")[0]
    expected = [
        ((div, "begin"), (br, "begin"), 0, "foo bar"),
    ]
    self.assertEqual(data, expected)

    # An <h1> inside the <div> gets a line of its own.
    tree = parse("<div><h1>foo</h1>bar</div>")
    data = list(lines_of(tree))
    div = tree.xpath("div")[0]
    h1 = tree.xpath("div/h1")[0]
    expected = [
        ((h1, "begin"), (h1, "end"), 0, "foo"),
        ((h1, "end"), (div, "end"), 0, "bar"),
    ]
    self.assertEqual(data, expected)

    # Text inside a <blockquote> is reported at indentation level 1.
    tree = parse("<div><blockquote>hi</blockquote>world</div>")
    data = list(lines_of(tree))
    div = tree.xpath("div")[0]
    blockquote = tree.xpath("div/blockquote")[0]
    expected = [
        ((blockquote, "begin"), (blockquote, "end"), 1, "hi"),
        ((blockquote, "end"), (div, "end"), 0, "world"),
    ]
    self.assertEqual(data, expected)

    # Every table row yields one line with the text of its cells joined.
    tree = parse(
        """
<table>
<tr><td>Subject: </td><td>the subject</td></tr>
<tr><td>From: </td><td>from line</td></tr>
</table>"""
    )
    data = list(lines_of(tree))
    tr1, tr2 = tree.xpath("table/tr")
    expected = [
        ((tr1, "begin"), (tr1, "end"), 0, "Subject: the subject"),
        ((tr2, "begin"), (tr2, "end"), 0, "From: from line"),
    ]
    self.assertEqual(data, expected)

def test_trim_after(self):
    """Check ``_html.trim_tree_after`` with and without keeping the element.

    Every case parses the same markup afresh, trims after the element found
    at the given path and compares the rendered result.
    """
    from quotequail import _html

    html = "<div>A<span>B</span>C<span>D</span>E</div>"
    keep = {}  # rely on the function's default for include_element
    drop = {"include_element": False}

    # (path of the element to trim after, extra keyword arguments, expected)
    cases = [
        ("div/span", keep, "<div>A<span>B</span></div>"),
        ("div/span[2]", keep, "<div>A<span>B</span>C<span>D</span></div>"),
        ("div/span", drop, "<div>A</div>"),
        ("div/span[2]", drop, "<div>A<span>B</span>C</div>"),
    ]

    for path, options, expected in cases:
        tree = _html.get_html_tree(html)
        _html.trim_tree_after(tree.find(path), **options)
        self.assertEqual(_html.render_html_tree(tree), expected)

def test_trim_before(self):
    """Check ``_html.trim_tree_before`` with and without keeping the element.

    Every case parses the same markup afresh, trims before the element found
    at the given path and compares the rendered result.
    """
    from quotequail import _html

    html = "<div>A<span>B</span>C<span>D</span>E</div>"
    keep = {}  # rely on the function's default for include_element
    drop = {"include_element": False}

    # (path of the element to trim before, extra keyword arguments, expected)
    cases = [
        ("div/span", keep, "<div><span>B</span>C<span>D</span>E</div>"),
        ("div/span[2]", keep, "<div><span>D</span>E</div>"),
        ("div/span", drop, "<div>C<span>D</span>E</div>"),
        ("div/span[2]", drop, "<div>E</div>"),
    ]

    for path, options, expected in cases:
        tree = _html.get_html_tree(html)
        _html.trim_tree_before(tree.find(path), **options)
        self.assertEqual(_html.render_html_tree(tree), expected)


# Run the unittest test runner only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 89176b6

Please sign in to comment.