diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 97c0517..a302c98 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -22,7 +22,7 @@ jobs: - uses: actions/setup-python@v2 name: Install Python with: - python-version: 3.9 + python-version: '3.10' - run: | pip install packaging diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b3e2057..f51b324 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -11,7 +11,7 @@ jobs: lint: strategy: matrix: - python-version: ['3.9', '3.10', '3.11'] + python-version: ['3.10', '3.11', '3.12'] name: Lint ${{ matrix.python-version }} runs-on: 'ubuntu-20.04' container: python:${{ matrix.python-version }} @@ -21,16 +21,21 @@ jobs: - name: Lint code run: | - pip install ruff==0.5.0 + pip install ruff==0.5.0 mypy==1.10.1 ruff check ruff format --check ruff check --select I + - name: Type check code + run: | + pip install mypy==1.10.1 + mypy + # Run tests test: strategy: matrix: - python-version: ['3.9', '3.10', '3.11'] + python-version: ['3.10', '3.11', '3.12'] # Do not cancel any jobs when a single job fails fail-fast: false name: Python ${{ matrix.python-version }} diff --git a/pyproject.toml b/pyproject.toml index b60da53..48055d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,3 +54,14 @@ max-branches = 16 [tool.ruff.lint.per-file-ignores] "tests/test_quotequail.py" = ["E501", "PT009"] + +[tool.mypy] +python_version = "3.10" +ignore_missing_imports = true +no_implicit_optional = true +strict_equality = true +follow_imports = "normal" +warn_unreachable = true +show_error_context = true +pretty = true +files = "quotequail" diff --git a/quotequail/__init__.py b/quotequail/__init__.py index 04300b0..0e18e52 100644 --- a/quotequail/__init__.py +++ b/quotequail/__init__.py @@ -7,7 +7,7 @@ __all__ = ["quote", "quote_html", "unwrap", "unwrap_html"] -def quote(text, limit=1000): +def quote(text: str, limit: int = 1000) -> 
list[tuple[bool, str]]: """ Take a plain text message as an argument, return a list of tuples. The first argument of the tuple denotes whether the text should be expanded by @@ -33,7 +33,7 @@ def quote(text, limit=1000): return [(True, text)] -def quote_html(html, limit=1000): +def quote_html(html: str, limit: int = 1000) -> list[tuple[bool, str]]: """ Like quote(), but takes an HTML message as an argument. The limit param represents the maximum number of lines to traverse until quoting the rest @@ -62,7 +62,7 @@ def quote_html(html, limit=1000): ] -def unwrap(text): +def unwrap(text: str) -> dict[str, str] | None: """ If the passed text is the text body of a forwarded message, a reply, or contains quoted text, a dictionary with the following keys is returned: @@ -78,31 +78,33 @@ def unwrap(text): """ lines = text.split("\n") - result = _internal.unwrap( + unwrap_result = _internal.unwrap( lines, _patterns.MAX_WRAP_LINES, _patterns.MIN_HEADER_LINES, _patterns.MIN_QUOTED_LINES, ) - if not result: + if not unwrap_result: return None - typ, top_range, hdrs, main_range, bottom_range, needs_unindent = result + typ, top_range, hdrs, main_range, bottom_range, needs_unindent = ( + unwrap_result + ) - text_top = lines[slice(*top_range)] if top_range else "" - text = lines[slice(*main_range)] if main_range else "" - text_bottom = lines[slice(*bottom_range)] if bottom_range else "" + text_top_lines = lines[slice(*top_range)] if top_range else [] + text_lines = lines[slice(*main_range)] if main_range else [] + text_bottom_lines = lines[slice(*bottom_range)] if bottom_range else [] if needs_unindent: - text = _internal.unindent_lines(text) + text_lines = _internal.unindent_lines(text_lines) result = { "type": typ, } - text = "\n".join(text).strip() - text_top = "\n".join(text_top).strip() - text_bottom = "\n".join(text_bottom).strip() + text = "\n".join(text_lines).strip() + text_top = "\n".join(text_top_lines).strip() + text_bottom = "\n".join(text_bottom_lines).strip() if 
text: result["text"] = text @@ -117,7 +119,7 @@ def unwrap(text): return result -def unwrap_html(html): +def unwrap_html(html: str) -> dict[str, str] | None: """ If the passed HTML is the HTML body of a forwarded message, a dictionary with the following keys is returned: @@ -137,38 +139,40 @@ def unwrap_html(html): start_refs, end_refs, lines = _html.get_line_info(tree) - result = _internal.unwrap(lines, 1, _patterns.MIN_HEADER_LINES, 1) + unwrap_result = _internal.unwrap(lines, 1, _patterns.MIN_HEADER_LINES, 1) - if result: - typ, top_range, hdrs, main_range, bottom_range, needs_unindent = result + if unwrap_result: + typ, top_range, hdrs, main_range, bottom_range, needs_unindent = ( + unwrap_result + ) result = { "type": typ, } - top_range = _html.trim_slice(lines, top_range) - main_range = _html.trim_slice(lines, main_range) - bottom_range = _html.trim_slice(lines, bottom_range) + top_range_slice = _html.trim_slice(lines, top_range) + main_range_slice = _html.trim_slice(lines, main_range) + bottom_range_slice = _html.trim_slice(lines, bottom_range) - if top_range: + if top_range_slice: top_tree = _html.slice_tree( - tree, start_refs, end_refs, top_range, html_copy=html + tree, start_refs, end_refs, top_range_slice, html_copy=html ) html_top = _html.render_html_tree(top_tree) if html_top: result["html_top"] = html_top - if bottom_range: + if bottom_range_slice: bottom_tree = _html.slice_tree( - tree, start_refs, end_refs, bottom_range, html_copy=html + tree, start_refs, end_refs, bottom_range_slice, html_copy=html ) html_bottom = _html.render_html_tree(bottom_tree) if html_bottom: result["html_bottom"] = html_bottom - if main_range: + if main_range_slice: main_tree = _html.slice_tree( - tree, start_refs, end_refs, main_range + tree, start_refs, end_refs, main_range_slice ) if needs_unindent: _html.unindent_tree(main_tree) diff --git a/quotequail/_html.py b/quotequail/_html.py index f393852..b8c4061 100644 --- a/quotequail/_html.py +++ b/quotequail/_html.py @@ 
-1,9 +1,11 @@ # HTML utils +from collections.abc import Iterator import lxml.etree import lxml.html from ._patterns import FORWARD_LINE, FORWARD_STYLES, MULTIPLE_WHITESPACE_RE +from .types import Element, ElementRef INLINE_TAGS = [ "a", @@ -27,7 +29,7 @@ END = "end" -def trim_tree_after(element, include_element=True): +def trim_tree_after(element: Element, include_element: bool = True): """ Remove the document tree following the given element. If include_element is True, the given element is kept in the tree, otherwise it is removed. @@ -44,7 +46,9 @@ def trim_tree_after(element, include_element=True): el = parent_el -def trim_tree_before(element, include_element=True, keep_head=True): +def trim_tree_before( + element: Element, include_element: bool = True, keep_head: bool = True +) -> None: """ Remove the document tree preceding the given element. If include_element is True, the given element is kept in the tree, otherwise it is removed. @@ -66,7 +70,9 @@ def trim_tree_before(element, include_element=True, keep_head=True): el = parent_el -def trim_slice(lines, slice_tuple): +def trim_slice( + lines: list[str], slice_tuple: tuple[int | None, int | None] | None +) -> tuple[int, int] | None: """ Trim a slice tuple (begin, end) so it starts at the first non-empty line (obtained via indented_tree_line_generator / get_line_info) and ends at the @@ -97,7 +103,7 @@ def _empty(line): return (slice_start, slice_end) -def unindent_tree(element): +def unindent_tree(element: Element) -> None: """ Remove the outermost indent. For example, the tree "
AB
C
D
E
FG
" @@ -111,7 +117,13 @@ def unindent_tree(element): return -def slice_tree(tree, start_refs, end_refs, slice_tuple, html_copy=None): +def slice_tree( + tree: Element, + start_refs: list[ElementRef | None], + end_refs: list[ElementRef | None], + slice_tuple: tuple[int | None, int | None] | None, + html_copy: str | None = None, +): """ Slice the HTML tree with the given start_refs and end_refs (obtained via get_line_info) at the given slice_tuple, a tuple (start, end) containing @@ -190,7 +202,7 @@ def slice_tree(tree, start_refs, end_refs, slice_tuple, html_copy=None): return new_tree -def get_html_tree(html): +def get_html_tree(html: str) -> Element: """ Given the HTML string, returns a LXML tree object. The tree is wrapped in
elements if it doesn't have a top level tag or parsing would @@ -198,10 +210,10 @@ def get_html_tree(html): strip_wrapping(). """ parser = lxml.html.HTMLParser(encoding="utf-8") - html = html.encode("utf8") + htmlb = html.encode("utf8") try: - tree = lxml.html.fromstring(html, parser=parser) + tree = lxml.html.fromstring(htmlb, parser=parser) except lxml.etree.Error: # E.g. empty document. Use dummy
tree = lxml.html.fromstring("
") @@ -209,8 +221,8 @@ def get_html_tree(html): # If the document doesn't start with a top level tag, wrap it with a
# that will be later stripped out for consistent behavior. if tree.tag not in lxml.html.defs.top_level_tags: - html = b"
" + html + b"
" - tree = lxml.html.fromstring(html, parser=parser) + htmlb = b"
" + htmlb + b"
" + tree = lxml.html.fromstring(htmlb, parser=parser) # HACK for Outlook emails, where tags like are rendered as

. We # can generally ignore these tags so we replace them with , which @@ -229,7 +241,7 @@ def get_html_tree(html): return tree -def strip_wrapping(html): +def strip_wrapping(html: str) -> str: """ Remove the wrapping that might have resulted when using get_html_tree(). """ @@ -238,7 +250,7 @@ def strip_wrapping(html): return html.strip() -def render_html_tree(tree): +def render_html_tree(tree: Element) -> str: """ Render the given HTML tree, and strip any wrapping that was applied in get_html_tree(). @@ -257,13 +269,15 @@ def render_html_tree(tree): return strip_wrapping(html) -def is_indentation_element(element): +def is_indentation_element(element: Element) -> bool: if isinstance(element.tag, str): return element.tag.lower() == "blockquote" return False -def tree_token_generator(el, indentation_level=0): +def tree_token_generator( + el: Element, indentation_level: int = 0 +) -> Iterator[None | tuple[Element, str, int] | str]: """ Yield tokens for the given HTML element as follows: @@ -296,7 +310,13 @@ def tree_token_generator(el, indentation_level=0): yield el.tail -def tree_line_generator(el, max_lines=None): +def tree_line_generator( + el: Element, max_lines: int | None = None +) -> Iterator[ + tuple[ + tuple[ElementRef, str] | None, tuple[ElementRef, str] | None, int, str + ] +]: """ Iterate through an LXML tree and yield a tuple per line. @@ -327,7 +347,7 @@ def tree_line_generator(el, max_lines=None): - ((, 'end'), (, 'end'), 0, 'world') """ - def _trim_spaces(text): + def _trim_spaces(text: str) -> str: return MULTIPLE_WHITESPACE_RE.sub(" ", text).strip() counter = 1 @@ -341,7 +361,7 @@ def _trim_spaces(text): start_ref = None # The indentation level at the start of the line. - start_indentation_level = None + start_indentation_level = 0 for token in tree_token_generator(el): if token is None: @@ -393,12 +413,17 @@ def _trim_spaces(text): else: raise RuntimeError(f"invalid token: {token}") + """ + TODO: wrong type, would trigger error if reached. 
line = _trim_spaces(line) if line: yield line + """ -def indented_tree_line_generator(el, max_lines=None): +def indented_tree_line_generator( + el: Element, max_lines: int | None = None +) -> Iterator[tuple[ElementRef | None, ElementRef | None, str]]: r""" Like tree_line_generator, but yields tuples (start_ref, end_ref, line), where the line already takes the indentation into account by having "> " @@ -413,14 +438,19 @@ def indented_tree_line_generator(el, max_lines=None): yield start_ref, end_ref, "> " * indentation_level + full_line -def get_line_info(tree, max_lines=None): +def get_line_info( + tree: Element, max_lines: int | None = None +) -> tuple[list[ElementRef | None], list[ElementRef | None], list[str]]: """ Shortcut for indented_tree_line_generator() that returns an array of start references, an array of corresponding end references (see tree_line_generator() docs), and an array of corresponding lines. """ line_gen = indented_tree_line_generator(tree, max_lines=max_lines) - line_gen_result = list(zip(*line_gen)) + line_gen_result: ( + tuple[list[ElementRef | None], list[ElementRef | None], list[str]] + | tuple[()] + ) = tuple(zip(*line_gen)) if line_gen_result: return line_gen_result return [], [], [] diff --git a/quotequail/_internal.py b/quotequail/_internal.py index c76aa60..50c2cf8 100644 --- a/quotequail/_internal.py +++ b/quotequail/_internal.py @@ -12,7 +12,9 @@ """ -def find_pattern_on_line(lines, n, max_wrap_lines): +def find_pattern_on_line( + lines: list[str], n: int, max_wrap_lines: int +) -> tuple[int, str] | None: """ Find a forward/reply pattern within the given lines on text on the given line number and return a tuple with the type ('reply' or 'forward') and @@ -20,7 +22,7 @@ def find_pattern_on_line(lines, n, max_wrap_lines): different from the given line number in case the pattern wraps over multiple lines. - Returns (None, None) if no pattern was found. + Returns None if no pattern was found. 
""" for typ, regexes in COMPILED_PATTERN_MAP.items(): for regex in regexes: @@ -30,25 +32,27 @@ def find_pattern_on_line(lines, n, max_wrap_lines): match_line = match_line[1:].strip() if regex.match(match_line.strip()): return n + m, typ - return None, None + return None -def find_quote_position(lines, max_wrap_lines, limit=None): +def find_quote_position( + lines: list[str], max_wrap_lines: int, limit: int | None = None +) -> int | None: """ Return the (ending) line number of a quoting pattern. If a limit is given and the limit is reached, the limit is returned. """ for n in range(len(lines)): - end, typ = find_pattern_on_line(lines, n, max_wrap_lines) - if typ: - return end + result = find_pattern_on_line(lines, n, max_wrap_lines) + if result: + return result[0] if limit is not None and n >= limit - 1: return n return None -def join_wrapped_lines(lines): +def join_wrapped_lines(lines: list[str]) -> str: """ Join one or multiple lines that wrapped. Returns the reconstructed line. Takes into account proper spacing between the lines (see @@ -68,7 +72,9 @@ def join_wrapped_lines(lines): return joined -def extract_headers(lines, max_wrap_lines): +def extract_headers( + lines: list[str], max_wrap_lines: int +) -> tuple[dict[str, str], int]: """ Extract email headers from the given lines. Returns a dict with the detected headers and the amount of lines that were processed. @@ -109,7 +115,7 @@ def extract_headers(lines, max_wrap_lines): return hdrs, lines_processed -def parse_reply(line): +def parse_reply(line: str) -> dict[str, str] | None: """ Parse the given reply line ("On DATE, USER wrote:") and returns a dictionary with the "Date" and "From" keys, or None, if couldn't parse. 
@@ -154,8 +160,11 @@ def parse_reply(line): def find_unwrap_start( - lines, max_wrap_lines, min_header_lines, min_quoted_lines -): + lines: list[str], + max_wrap_lines: int, + min_header_lines: int, + min_quoted_lines: int, +) -> tuple[int, int, str] | None: """ Find the starting point of a wrapped email. Returns a tuple containing (start_line_number, end_line_number, type), where type can be one of the @@ -164,7 +173,7 @@ def find_unwrap_start( * 'forward': A matching forwarding pattern was found * 'reply': A matching reply pattern was found * 'headers': Headers were found (usually a forwarded email) - * 'quote': A quote was found + * 'quoted': A quote was found start_line_number corresponds to the line number where the forwarding/reply pattern starts, or where the headers/quote starts. end_line_number is only @@ -172,7 +181,7 @@ def find_unwrap_start( multiple lines (it does not extend to the end of the headers or of the quoted section). - Returns (None, None, None) if nothing was found. + Returns None if nothing was found. 
""" for n, line in enumerate(lines): if not line.strip(): @@ -180,8 +189,9 @@ def find_unwrap_start( # Find a forward / reply start pattern - end, typ = find_pattern_on_line(lines, n, max_wrap_lines) - if typ: + result = find_pattern_on_line(lines, n, max_wrap_lines) + if result: + end, typ = result return n, end, typ # Find a quote @@ -210,10 +220,10 @@ def find_unwrap_start( ): return n, n, "headers" - return None, None, None + return None -def unindent_lines(lines): +def unindent_lines(lines: list[str]) -> list[str]: unquoted = [] for line in lines: if line.startswith("> "): @@ -226,7 +236,22 @@ def unindent_lines(lines): return unquoted -def unwrap(lines, max_wrap_lines, min_header_lines, min_quoted_lines): +def unwrap( + lines: list[str], + max_wrap_lines: int, + min_header_lines: int, + min_quoted_lines: int, +) -> ( + tuple[ + str, + tuple[int | None, int | None], + dict[str, str] | None, + tuple[int | None, int | None] | None, + tuple[int | None, int | None] | None, + bool, + ] + | None +): """ Return a tuple of: - Type ('forward', 'reply', 'headers', 'quoted') @@ -239,9 +264,13 @@ def unwrap(lines, max_wrap_lines, min_header_lines, min_quoted_lines): headers = {} # Get line number and wrapping type. - start, end, typ = find_unwrap_start( + result = find_unwrap_start( lines, max_wrap_lines, min_header_lines, min_quoted_lines ) + if not result: + return None + + start, end, typ = result # We found a line indicating that it's a forward/reply. if typ in ("forward", "reply"): @@ -256,19 +285,23 @@ def unwrap(lines, max_wrap_lines, min_header_lines, min_quoted_lines): # Find where the headers or the quoted section starts. # We can set min_quoted_lines to 1 because we expect a quoted section. - start2, end2, typ = find_unwrap_start( + result = find_unwrap_start( lines[end + 1 :], max_wrap_lines, min_header_lines, 1 ) + start2 = result[0] if result else 0 + typ2 = result[2] if result else None - if typ == "quoted": + if typ2 == "quoted": # Quoted section starts. 
Unindent and check if there are headers. quoted_start = end + 1 + start2 unquoted = unindent_lines(lines[quoted_start:]) rest_start = quoted_start + len(unquoted) - start3, end3, typ = find_unwrap_start( + result = find_unwrap_start( unquoted, max_wrap_lines, min_header_lines, min_quoted_lines ) - if typ == "headers": + start3 = result[0] if result else 0 + typ3 = result[2] if result else None + if typ3 == "headers": hdrs, hdrs_length = extract_headers( unquoted[start3:], max_wrap_lines ) @@ -330,10 +363,12 @@ def unwrap(lines, max_wrap_lines, min_header_lines, min_quoted_lines): if typ == "quoted": unquoted = unindent_lines(lines[start:]) rest_start = start + len(unquoted) - start2, end2, typ = find_unwrap_start( + result = find_unwrap_start( unquoted, max_wrap_lines, min_header_lines, min_quoted_lines ) - if typ == "headers": + start2 = result[0] if result else 0 + typ2 = result[2] if result else None + if typ2 == "headers": main_type = "forward" hdrs, hdrs_length = extract_headers( unquoted[start2:], max_wrap_lines @@ -358,4 +393,4 @@ def unwrap(lines, max_wrap_lines, min_header_lines, min_quoted_lines): True, ) - return None + raise RuntimeError(f"invalid type: {typ}") diff --git a/quotequail/types.py b/quotequail/types.py new file mode 100644 index 0000000..af1278b --- /dev/null +++ b/quotequail/types.py @@ -0,0 +1,8 @@ +from typing import TYPE_CHECKING, TypeAlias + +if TYPE_CHECKING: + from lxml.html import HtmlElement + + Element: TypeAlias = HtmlElement + +ElementRef = tuple["Element", str] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 5332636..0000000 --- a/setup.cfg +++ /dev/null @@ -1,64 +0,0 @@ -[flake8] -ignore= - # !!! make sure you have a comma at the end of each line EXCEPT the LAST one - # line length, already enforced by black - E501, - # https://pypi.org/project/flake8-future-import/ - FI1 - # Missing docstrings - D1, - # One-line docstring should fit on one line with quotes. 
- # We ignore this because it's OK to buy yourself a few extra characters - # for the summary line even if the summary line is *the only* line. - D200, - # 1 blank line required between summary line and description - D205, - # Multi-line docstring summary should start at the first line. - # We ignore this because we agreed in #20553 that we we want to put the - # summary line below """ for multi-line docstrings. - D212, - # First line should end with a period - D400, - # This is not PEP8-compliant and conflicts with black - W503, - W504, - # This is not PEP8-compliant and conflicts with black - E203, - # Too intrusive, sometimes makes code less readable - SIM106 - # Allow f-strings - SFS301, - # Allow .format - SFS201 -exclude=venv -#max-complexity=2 -banned-modules= - typing.Text = use str -require-code=True - -[isort] -skip=venv,src -known_first_party=quotequail -known_tests=tests -sections=FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,TESTS,LOCALFOLDER -default_section=THIRDPARTY -use_parentheses=true -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -line_length=87 - -[mypy] -python_version = 3.7 -ignore_missing_imports = True -no_implicit_optional = True -strict_equality = True -follow_imports = normal -warn_unreachable = True -show_error_context = True -pretty = True -files = quotequail - -[tool:pytest] -norecursedirs=venv