Skip to content

Commit

Permalink
Quote intro line + migrate all tests
Browse files Browse the repository at this point in the history
  • Loading branch information
thomasst committed Jul 2, 2024
1 parent 46864a2 commit bcedce1
Show file tree
Hide file tree
Showing 10 changed files with 1,105 additions and 1,214 deletions.
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,11 @@ max-complexity = 15
max-branches = 16

[tool.ruff.lint.per-file-ignores]
"tests/test_internal.py" = ["E501"]
"tests/test_quote.py" = ["E501"]
"tests/test_quote_html.py" = ["E501"]
"tests/test_quotequail.py" = ["E501", "PT009"]
"tests/test_unwrap.py" = ["E501"]
"tests/test_unwrap_html.py" = ["E501"]

[tool.mypy]
Expand Down
74 changes: 51 additions & 23 deletions quotequail/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,59 +2,87 @@
# a library that identifies quoted text in email messages

from . import _internal, _patterns
from ._enums import Position

__version__ = "0.3.1"
__all__ = ["quote", "quote_html", "unwrap", "unwrap_html"]


def quote(text: str, limit: int = 1000) -> list[tuple[bool, str]]:
def quote(
text: str, *, limit: int = 1000, quote_intro_line: bool = False
) -> list[tuple[bool, str]]:
"""
Take a plain text message as an argument, return a list of tuples. The
first argument of the tuple denotes whether the text should be expanded by
default. The second argument is the unmodified corresponding text.
Example: [(True, 'expanded text'), (False, '> Some quoted text')]
Unless the limit param is set to None, the text will automatically be
quoted starting at the line where the limit is reached.
Divide email body into quoted parts.
Args:
text: Plain text message.
limit: If set, the text will automatically be quoted starting at the
line where the limit is reached.
quote_intro_line: Whether the line introducing the quoted text ("On ...
wrote:" / "Begin forwarded message:") should be part of the quoted
text.
Returns:
List of tuples: The first element of the tuple denotes whether the
text should be expanded by default. The second element is the
unmodified corresponding text.
Example: [(True, 'expanded text'), (False, '> Some quoted text')]
"""
lines = text.split("\n")

position = Position.Begin if quote_intro_line else Position.End
found = _internal.find_quote_position(
lines, _patterns.MAX_WRAP_LINES, limit
lines,
_patterns.MAX_WRAP_LINES,
limit=limit,
position=position,
)

if found is not None:
return [
(True, "\n".join(lines[: found + 1])),
(False, "\n".join(lines[found + 1 :])),
]
if found is None:
return [(True, text)]

return [(True, text)]
split_idx = found if quote_intro_line else found + 1
return [
(True, "\n".join(lines[:split_idx])),
(False, "\n".join(lines[split_idx:])),
]


def quote_html(html: str, limit: int = 1000) -> list[tuple[bool, str]]:
def quote_html(
html: str, *, limit: int = 1000, quote_intro_line: bool = False
) -> list[tuple[bool, str]]:
"""
Like quote(), but takes an HTML message as an argument. The limit param
represents the maximum number of lines to traverse until quoting the rest
of the markup. Lines are separated by block elements or <br>.
Like quote(), but takes an HTML message as an argument.
Args:
html: HTML message.
limit: Maximum number of lines to traverse until quoting the rest of
the markup. Lines are separated by block elements or <br>.
quote_intro_line: Whether the line introducing the quoted text ("On ...
wrote:" / "Begin forwarded message:") should be part of the quoted
text.
"""
from . import _html

tree = _html.get_html_tree(html)

start_refs, end_refs, lines = _html.get_line_info(tree, limit + 1)

found = _internal.find_quote_position(lines, 1, limit)
position = Position.Begin if quote_intro_line else Position.End
found = _internal.find_quote_position(
lines, 1, limit=limit, position=position
)

if found is None:
# No quoting found and we're below limit. We're done.
return [(True, _html.render_html_tree(tree))]

split_idx = found if quote_intro_line else found + 1
start_tree = _html.slice_tree(
tree, start_refs, end_refs, (0, found + 1), html_copy=html
tree, start_refs, end_refs, (0, split_idx), html_copy=html
)
end_tree = _html.slice_tree(tree, start_refs, end_refs, (found + 1, None))
end_tree = _html.slice_tree(tree, start_refs, end_refs, (split_idx, None))

return [
(True, _html.render_html_tree(start_tree)),
Expand Down
6 changes: 6 additions & 0 deletions quotequail/_enums.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from enum import Enum


class Position(Enum):
    """Select either the beginning or the end of a span."""

    # The start of the span, e.g. the first line of a matched quoting pattern.
    Begin = "begin"
    # The end of the span, e.g. the last line of a (possibly wrapped) pattern.
    End = "end"
8 changes: 1 addition & 7 deletions quotequail/_html.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# HTML utils
import enum
from collections.abc import Iterator
from typing import TYPE_CHECKING, TypeAlias

Expand All @@ -9,14 +8,9 @@
if TYPE_CHECKING:
from lxml.html import HtmlElement

from ._enums import Position
from ._patterns import FORWARD_LINE, FORWARD_STYLES, MULTIPLE_WHITESPACE_RE


class Position(enum.Enum):
Begin = "begin"
End = "end"


Element: TypeAlias = "HtmlElement"
ElementRef = tuple["Element", Position]

Expand Down
44 changes: 37 additions & 7 deletions quotequail/_internal.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from typing_extensions import assert_never

from ._enums import Position
from ._patterns import (
COMPILED_PATTERN_MAP,
HEADER_MAP,
Expand All @@ -13,7 +16,10 @@


def find_pattern_on_line(
lines: list[str], n: int, max_wrap_lines: int
lines: list[str],
n: int,
max_wrap_lines: int,
position: Position,
) -> tuple[int, str] | None:
"""
Find a forward/reply pattern within the given lines of text on the given
Expand All @@ -28,22 +34,46 @@ def find_pattern_on_line(
for regex in regexes:
for m in range(max_wrap_lines):
match_line = join_wrapped_lines(lines[n : n + 1 + m])
# print("tryin match line", n, m, match_line)
if match_line.startswith(">"):
match_line = match_line[1:].strip()
# If this line is blank, break out of the innermost loop
# at m == 0 so that if the quoting starts in the following
# line, we'll correctly detect the start of the quoting
# position.
if not match_line:
break
if regex.match(match_line.strip()):
return n + m, typ
# print("match line", n, m, match_line)
match position:
case Position.Begin:
return n, typ
case Position.End:
return n + m, typ
case _:
assert_never(position)
return None


def find_quote_position(
lines: list[str], max_wrap_lines: int, limit: int | None = None
lines: list[str],
max_wrap_lines: int,
limit: int | None = None,
position: Position = Position.End,
) -> int | None:
"""
Return the (ending) line number of a quoting pattern. If a limit is given
and the limit is reached, the limit is returned.
Return the beginning or ending line number of a quoting pattern.
Args:
lines: List of lines of text.
max_wrap_lines: Number of lines to join to check for potential wrapped
patterns.
limit: If line limit is given and reached without finding a pattern,
the limit is returned.
position: Whether to return the beginning or ending line number.
"""
for n in range(len(lines)):
result = find_pattern_on_line(lines, n, max_wrap_lines)
result = find_pattern_on_line(lines, n, max_wrap_lines, position)
if result:
return result[0]
if limit is not None and n >= limit - 1:
Expand Down Expand Up @@ -189,7 +219,7 @@ def find_unwrap_start(

# Find a forward / reply start pattern

result = find_pattern_on_line(lines, n, max_wrap_lines)
result = find_pattern_on_line(lines, n, max_wrap_lines, Position.End)
if result:
end, typ = result
return n, end, typ
Expand Down
135 changes: 135 additions & 0 deletions tests/test_internal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import pytest

from quotequail._internal import extract_headers, parse_reply


# Each case pairs a reply intro line (grouped by language) with the header
# dict that parse_reply is expected to return for it.
# NOTE(review): the "[email protected]" addresses look like obfuscation
# placeholders rather than real fixture data — confirm against the repository.
@pytest.mark.parametrize(
    ("line", "expected"),
    [
        # German
        (
            "Am 24.02.2015 um 22:48 schrieb John Doe <[email protected]>:",
            {
                "date": "24.02.2015 um 22:48",
                "from": "John Doe <[email protected]>",
            },
        ),
        # English
        (
            "On Monday, March 7, 2016 10:19 AM, John Doe <[email protected]> wrote:",
            {
                "date": "Monday, March 7, 2016 10:19 AM",
                "from": "John Doe <[email protected]>",
            },
        ),
        (
            "On Feb 22, 2015, at 9:19 PM, John Doe <[email protected]> wrote:",
            {
                "date": "Feb 22, 2015, at 9:19 PM",
                "from": "John Doe <[email protected]>",
            },
        ),
        (
            "On 2016-03-14, at 20:26, John Doe <[email protected]> wrote:",
            {
                "date": "2016-03-14, at 20:26",
                "from": "John Doe <[email protected]>",
            },
        ),
        (
            "On 8 o'clock, John Doe wrote:",
            {"date": "8 o'clock", "from": "John Doe"},
        ),
        # French
        (
            "Le 6 janv. 2014 à 19:50, John Doe <[email protected]> a écrit :",
            {
                "date": "6 janv. 2014 \xe0 19:50",
                "from": "John Doe <[email protected]>",
            },
        ),
        (
            "Le 02.10.2013 à 11:13, John Doe <[email protected]> a écrit :",
            {
                "date": "02.10.2013 \xe0 11:13",
                "from": "John Doe <[email protected]>",
            },
        ),
        # Spanish
        (
            "El 11/07/2012 06:13 p.m., John Doe escribió:",
            {"date": "11/07/2012 06:13 p.m.", "from": "John Doe"},
        ),
        (
            "El 06/04/2010, a las 13:13, John Doe escribió:",
            {"date": "06/04/2010, a las 13:13", "from": "John Doe"},
        ),
        # Swedish
        (
            "Den 24 februari 2015 22:48 skrev John Doe <[email protected]>:",
            {
                "date": "24 februari 2015 22:48",
                "from": "John Doe <[email protected]>",
            },
        ),
        # Brazilian Portuguese
        (
            "Em qui, 24 de jan de 2019 às 14:31, John Doe <[email protected]> escreveu:",
            {
                "date": "qui, 24 de jan de 2019 às 14:31",
                "from": "John Doe <[email protected]>",
            },
        ),
        # Other
        (
            "2009/5/12 John Doe <[email protected]>",
            {"date": "2009/5/12", "from": "John Doe <[email protected]>"},
        ),
    ],
)
def test_parse_reply(line, expected):
    """Check that parse_reply extracts the expected "date" and "from" values.

    Args:
        line: A reply intro line such as "On ..., John Doe wrote:".
        expected: The dict parse_reply should return for that line.
    """
    assert parse_reply(line) == expected


def test_extract_headers():
    """Check extract_headers on empty, simple and wrapped header lines.

    Each assertion compares the returned (headers, count) tuple.
    NOTE(review): judging by the cases below, the second argument appears to
    cap how many lines a single header may span, and the returned count
    appears to be the number of lines consumed — confirm against
    extract_headers itself.
    """
    # No lines, or a line that is not a header: nothing is extracted.
    assert extract_headers([], 2) == ({}, 0)
    assert extract_headers(["test"], 2) == ({}, 0)
    # Two plain headers, one per line; keys come back lower-cased.
    assert extract_headers(["From: b", "To: c"], 2) == (
        {"from": "b", "to": "c"},
        2,
    )
    # With a limit of 2 the following line is joined onto the header value;
    # with a limit of 1 it is left out.
    assert extract_headers(["From: b", "foo"], 2) == ({"from": "b foo"}, 2)
    assert extract_headers(["From: b", "foo"], 1) == ({"from": "b"}, 1)
    # The blank line and the text after it are not part of the result.
    assert extract_headers(["From: b", "To: c", "", "other line"], 2) == (
        {"from": "b", "to": "c"},
        2,
    )
    # Two headers that each wrap onto a second line are joined back together.
    assert extract_headers(
        [
            "From: some very very very long name <",
            "[email protected]>",
            "Subject: this is a very very very very long",
            "subject",
            "",
            "other line",
        ],
        2,
    ) == (
        {
            "from": "some very very very long name <[email protected]>",
            "subject": "this is a very very very very long subject",
        },
        4,
    )
    # With a limit of 1 the wrapped continuation line is not picked up.
    assert extract_headers(
        [
            "From: some very very very long name <",
            "[email protected]>",
        ],
        1,
    ) == (
        {
            "from": "some very very very long name <",
        },
        1,
    )
Loading

0 comments on commit bcedce1

Please sign in to comment.