v0.4.0: Ability to quote intro line + modernize all tests (#55)

thomasst · web-flow · commit 3f8e26d173d9 · 2024-07-03T09:20:32.000+09:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Changes
 
+## v0.4.0
+* Add `quote_intro_line` parameter to `quote` and `quote_html`.
+* Modernize all tests.
+
 ## v0.3.1
 * Fix `unwrap_html` when no result was found.
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -54,7 +54,10 @@ max-complexity = 15
 max-branches = 16
 
 [tool.ruff.lint.per-file-ignores]
-"tests/test_quotequail.py" = ["E501", "PT009"]
+"tests/test_internal.py" = ["E501"]
+"tests/test_quote.py" = ["E501"]
+"tests/test_quote_html.py" = ["E501"]
+"tests/test_unwrap.py" = ["E501"]
 "tests/test_unwrap_html.py" = ["E501"]
 
 [tool.mypy]
diff --git a/quotequail/__init__.py b/quotequail/__init__.py
@@ -2,59 +2,87 @@
 # a library that identifies quoted text in email messages
 
 from . import _internal, _patterns
+from ._enums import Position
 
-__version__ = "0.3.1"
+__version__ = "0.4.0"
 __all__ = ["quote", "quote_html", "unwrap", "unwrap_html"]
 
 
-def quote(text: str, limit: int = 1000) -> list[tuple[bool, str]]:
+def quote(
+    text: str, *, limit: int = 1000, quote_intro_line: bool = False
+) -> list[tuple[bool, str]]:
     """
-    Take a plain text message as an argument, return a list of tuples. The
-    first argument of the tuple denotes whether the text should be expanded by
-    default. The second argument is the unmodified corresponding text.
-
-    Example: [(True, 'expanded text'), (False, '> Some quoted text')]
-
-    Unless the limit param is set to None, the text will automatically be
-    quoted starting at the line where the limit is reached.
+    Divide email body into quoted parts.
+
+    Args:
+        text: Plain text message.
+        limit: If set, the text will automatically be quoted starting at the
+            line where the limit is reached.
+        quote_intro_line: Whether the line introducing the quoted text ("On ...
+            wrote:" / "Begin forwarded message:") should be part of the quoted
+            text.
+
+    Returns:
+        List of tuples: The first element of each tuple denotes whether the
+        text should be expanded by default. The second element is the
+        unmodified corresponding text.
+
+        Example: [(True, 'expanded text'), (False, '> Some quoted text')]
     """
     lines = text.split("\n")
 
+    position = Position.Begin if quote_intro_line else Position.End
     found = _internal.find_quote_position(
-        lines, _patterns.MAX_WRAP_LINES, limit
+        lines,
+        _patterns.MAX_WRAP_LINES,
+        limit=limit,
+        position=position,
     )
 
-    if found is not None:
-        return [
-            (True, "\n".join(lines[: found + 1])),
-            (False, "\n".join(lines[found + 1 :])),
-        ]
+    if found is None:
+        return [(True, text)]
 
-    return [(True, text)]
+    split_idx = found if quote_intro_line else found + 1
+    return [
+        (True, "\n".join(lines[:split_idx])),
+        (False, "\n".join(lines[split_idx:])),
+    ]
 
 
-def quote_html(html: str, limit: int = 1000) -> list[tuple[bool, str]]:
+def quote_html(
+    html: str, *, limit: int = 1000, quote_intro_line: bool = False
+) -> list[tuple[bool, str]]:
     """
-    Like quote(), but takes an HTML message as an argument. The limit param
-    represents the maximum number of lines to traverse until quoting the rest
-    of the markup. Lines are separated by block elements or <br>.
+    Like quote(), but takes an HTML message as an argument.
+
+    Args:
+        html: HTML message.
+        limit: Maximum number of lines to traverse until quoting the rest of
+            the markup. Lines are separated by block elements or <br>.
+        quote_intro_line: Whether the line introducing the quoted text ("On ...
+            wrote:" / "Begin forwarded message:") should be part of the quoted
+            text.
     """
     from . import _html
 
     tree = _html.get_html_tree(html)
 
     start_refs, end_refs, lines = _html.get_line_info(tree, limit + 1)
 
-    found = _internal.find_quote_position(lines, 1, limit)
+    position = Position.Begin if quote_intro_line else Position.End
+    found = _internal.find_quote_position(
+        lines, 1, limit=limit, position=position
+    )
 
     if found is None:
         # No quoting found and we're below limit. We're done.
         return [(True, _html.render_html_tree(tree))]
 
+    split_idx = found if quote_intro_line else found + 1
     start_tree = _html.slice_tree(
-        tree, start_refs, end_refs, (0, found + 1), html_copy=html
+        tree, start_refs, end_refs, (0, split_idx), html_copy=html
     )
-    end_tree = _html.slice_tree(tree, start_refs, end_refs, (found + 1, None))
+    end_tree = _html.slice_tree(tree, start_refs, end_refs, (split_idx, None))
 
     return [
         (True, _html.render_html_tree(start_tree)),
diff --git a/quotequail/_enums.py b/quotequail/_enums.py
@@ -0,0 +1,6 @@
+from enum import Enum
+
+
+class Position(Enum):
+    Begin = "begin"
+    End = "end"
diff --git a/quotequail/_html.py b/quotequail/_html.py
@@ -1,5 +1,4 @@
 # HTML utils
-import enum
 from collections.abc import Iterator
 from typing import TYPE_CHECKING, TypeAlias
 
@@ -9,14 +8,9 @@
 if TYPE_CHECKING:
     from lxml.html import HtmlElement
 
+from ._enums import Position
 from ._patterns import FORWARD_LINE, FORWARD_STYLES, MULTIPLE_WHITESPACE_RE
 
-
-class Position(enum.Enum):
-    Begin = "begin"
-    End = "end"
-
-
 Element: TypeAlias = "HtmlElement"
 ElementRef = tuple["Element", Position]
 
diff --git a/quotequail/_internal.py b/quotequail/_internal.py
@@ -1,3 +1,6 @@
+from typing_extensions import assert_never
+
+from ._enums import Position
 from ._patterns import (
     COMPILED_PATTERN_MAP,
     HEADER_MAP,
@@ -13,7 +16,10 @@
 
 
 def find_pattern_on_line(
-    lines: list[str], n: int, max_wrap_lines: int
+    lines: list[str],
+    n: int,
+    max_wrap_lines: int,
+    position: Position,
 ) -> tuple[int, str] | None:
     """
     Find a forward/reply pattern within the given lines on text on the given
@@ -30,20 +36,42 @@ def find_pattern_on_line(
                 match_line = join_wrapped_lines(lines[n : n + 1 + m])
                 if match_line.startswith(">"):
                     match_line = match_line[1:].strip()
+                # If this line is blank, break out of the innermost loop
+                # at m == 0 so that if the quoting starts in the following
+                # line, we'll correctly detect the start of the quoting
+                # position.
+                if not match_line:
+                    break
                 if regex.match(match_line.strip()):
-                    return n + m, typ
+                    match position:
+                        case Position.Begin:
+                            return n, typ
+                        case Position.End:
+                            return n + m, typ
+                        case _:
+                            assert_never(position)
     return None
 
 
 def find_quote_position(
-    lines: list[str], max_wrap_lines: int, limit: int | None = None
+    lines: list[str],
+    max_wrap_lines: int,
+    limit: int | None = None,
+    position: Position = Position.End,
 ) -> int | None:
     """
-    Return the (ending) line number of a quoting pattern. If a limit is given
-    and the limit is reached, the limit is returned.
+    Return the beginning or ending line number of a quoting pattern.
+
+    Args:
+        lines: List of lines of text.
+        max_wrap_lines: Number of lines to join to check for potential wrapped
+            patterns.
+        limit: If line limit is given and reached without finding a pattern,
+            the limit is returned.
+        position: Whether to return the beginning or ending line number.
     """
     for n in range(len(lines)):
-        result = find_pattern_on_line(lines, n, max_wrap_lines)
+        result = find_pattern_on_line(lines, n, max_wrap_lines, position)
         if result:
             return result[0]
         if limit is not None and n >= limit - 1:
@@ -189,7 +217,7 @@ def find_unwrap_start(
 
         # Find a forward / reply start pattern
 
-        result = find_pattern_on_line(lines, n, max_wrap_lines)
+        result = find_pattern_on_line(lines, n, max_wrap_lines, Position.End)
         if result:
             end, typ = result
             return n, end, typ
diff --git a/requirements_tests.txt b/requirements_tests.txt
@@ -1,2 +1,3 @@
 lxml==5.2.2
 pytest==8.2.2
+typing-extensions==4.12.2
diff --git a/setup.py b/setup.py
@@ -28,6 +28,7 @@
     ],
     test_suite="tests",
     tests_require=["lxml"],
+    install_requires=["typing_extensions>=4.1"],
     platforms="any",
     classifiers=[
         "Environment :: Web Environment",
diff --git a/tests/test_internal.py b/tests/test_internal.py
@@ -0,0 +1,135 @@
+import pytest
+
+from quotequail._internal import extract_headers, parse_reply
+
+
+@pytest.mark.parametrize(
+    ("line", "expected"),
+    [
+        # German
+        (
+            "Am 24.02.2015 um 22:48 schrieb John Doe <john@doe.example>:",
+            {
+                "date": "24.02.2015 um 22:48",
+                "from": "John Doe <john@doe.example>",
+            },
+        ),
+        # English
+        (
+            "On Monday, March 7, 2016 10:19 AM, John Doe <john@doe.example> wrote:",
+            {
+                "date": "Monday, March 7, 2016 10:19 AM",
+                "from": "John Doe <john@doe.example>",
+            },
+        ),
+        (
+            "On Feb 22, 2015, at 9:19 PM, John Doe <john@doe.example> wrote:",
+            {
+                "date": "Feb 22, 2015, at 9:19 PM",
+                "from": "John Doe <john@doe.example>",
+            },
+        ),
+        (
+            "On 2016-03-14, at 20:26, John Doe <john@doe.example> wrote:",
+            {
+                "date": "2016-03-14, at 20:26",
+                "from": "John Doe <john@doe.example>",
+            },
+        ),
+        (
+            "On 8 o'clock, John Doe wrote:",
+            {"date": "8 o'clock", "from": "John Doe"},
+        ),
+        # French
+        (
+            "Le 6 janv. 2014 à 19:50, John Doe <john@doe.example> a écrit :",
+            {
+                "date": "6 janv. 2014 \xe0 19:50",
+                "from": "John Doe <john@doe.example>",
+            },
+        ),
+        (
+            "Le 02.10.2013 à 11:13, John Doe <john@doe.example> a écrit :",
+            {
+                "date": "02.10.2013 \xe0 11:13",
+                "from": "John Doe <john@doe.example>",
+            },
+        ),
+        # Spanish
+        (
+            "El 11/07/2012 06:13 p.m., John Doe escribió:",
+            {"date": "11/07/2012 06:13 p.m.", "from": "John Doe"},
+        ),
+        (
+            "El 06/04/2010, a las 13:13, John Doe escribió:",
+            {"date": "06/04/2010, a las 13:13", "from": "John Doe"},
+        ),
+        # Swedish
+        (
+            "Den 24 februari 2015 22:48 skrev John Doe <john@doe.example>:",
+            {
+                "date": "24 februari 2015 22:48",
+                "from": "John Doe <john@doe.example>",
+            },
+        ),
+        # Brazilian Portuguese
+        (
+            "Em qui, 24 de jan de 2019 às 14:31, John Doe <john@doe.example> escreveu:",
+            {
+                "date": "qui, 24 de jan de 2019 às 14:31",
+                "from": "John Doe <john@doe.example>",
+            },
+        ),
+        # Other
+        (
+            "2009/5/12 John Doe <john@doe.example>",
+            {"date": "2009/5/12", "from": "John Doe <john@doe.example>"},
+        ),
+    ],
+)
+def test_parse_reply(line, expected):
+    assert parse_reply(line) == expected
+
+
+def test_extract_headers():
+    assert extract_headers([], 2) == ({}, 0)
+    assert extract_headers(["test"], 2) == ({}, 0)
+    assert extract_headers(["From: b", "To: c"], 2) == (
+        {"from": "b", "to": "c"},
+        2,
+    )
+    assert extract_headers(["From: b", "foo"], 2) == ({"from": "b foo"}, 2)
+    assert extract_headers(["From: b", "foo"], 1) == ({"from": "b"}, 1)
+    assert extract_headers(["From: b", "To: c", "", "other line"], 2) == (
+        {"from": "b", "to": "c"},
+        2,
+    )
+    assert extract_headers(
+        [
+            "From: some very very very long name <",
+            "verylong@example.com>",
+            "Subject: this is a very very very very long",
+            "subject",
+            "",
+            "other line",
+        ],
+        2,
+    ) == (
+        {
+            "from": "some very very very long name <verylong@example.com>",
+            "subject": "this is a very very very very long subject",
+        },
+        4,
+    )
+    assert extract_headers(
+        [
+            "From: some very very very long name <",
+            "verylong@example.com>",
+        ],
+        1,
+    ) == (
+        {
+            "from": "some very very very long name <",
+        },
+        1,
+    )
diff --git a/tests/test_quote.py b/tests/test_quote.py
diff --git a/tests/test_quote_html.py b/tests/test_quote_html.py
diff --git a/tests/test_quotequail.py b/tests/test_quotequail.py
diff --git a/tests/test_unwrap.py b/tests/test_unwrap.py

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`lxml==5.2.2`
`2`	`2`	`pytest==8.2.2`
	`3`	`+typing-extensions==4.12.2`