1
1
# HTML utils
2
+ from collections .abc import Iterator
2
3
3
4
import lxml .etree
4
5
import lxml .html
5
6
6
7
from ._patterns import FORWARD_LINE , FORWARD_STYLES , MULTIPLE_WHITESPACE_RE
8
+ from .types import Element , ElementRef
7
9
8
10
INLINE_TAGS = [
9
11
"a" ,
27
29
END = "end"
28
30
29
31
30
- def trim_tree_after (element , include_element = True ):
32
+ def trim_tree_after (element : Element , include_element : bool = True ):
31
33
"""
32
34
Remove the document tree following the given element. If include_element
33
35
is True, the given element is kept in the tree, otherwise it is removed.
@@ -44,7 +46,9 @@ def trim_tree_after(element, include_element=True):
44
46
el = parent_el
45
47
46
48
47
- def trim_tree_before (element , include_element = True , keep_head = True ):
49
+ def trim_tree_before (
50
+ element : Element , include_element : bool = True , keep_head : bool = True
51
+ ) -> None :
48
52
"""
49
53
Remove the document tree preceding the given element. If include_element
50
54
is True, the given element is kept in the tree, otherwise it is removed.
@@ -66,7 +70,9 @@ def trim_tree_before(element, include_element=True, keep_head=True):
66
70
el = parent_el
67
71
68
72
69
- def trim_slice (lines , slice_tuple ):
73
+ def trim_slice (
74
+ lines : list [str ], slice_tuple : tuple [int | None , int | None ] | None
75
+ ) -> tuple [int , int ] | None :
70
76
"""
71
77
Trim a slice tuple (begin, end) so it starts at the first non-empty line
72
78
(obtained via indented_tree_line_generator / get_line_info) and ends at the
@@ -97,7 +103,7 @@ def _empty(line):
97
103
return (slice_start , slice_end )
98
104
99
105
100
- def unindent_tree (element ) :
106
+ def unindent_tree (element : Element ) -> None :
101
107
"""
102
108
Remove the outermost indent. For example, the tree
103
109
"<div>A<blockquote>B<div>C<blockquote>D</blockquote>E</div>F</blockquote>G</div>"
@@ -111,7 +117,13 @@ def unindent_tree(element):
111
117
return
112
118
113
119
114
- def slice_tree (tree , start_refs , end_refs , slice_tuple , html_copy = None ):
120
+ def slice_tree (
121
+ tree : Element ,
122
+ start_refs : list [ElementRef | None ],
123
+ end_refs : list [ElementRef | None ],
124
+ slice_tuple : tuple [int | None , int | None ] | None ,
125
+ html_copy : str | None = None ,
126
+ ):
115
127
"""
116
128
Slice the HTML tree with the given start_refs and end_refs (obtained via
117
129
get_line_info) at the given slice_tuple, a tuple (start, end) containing
@@ -190,27 +202,27 @@ def slice_tree(tree, start_refs, end_refs, slice_tuple, html_copy=None):
190
202
return new_tree
191
203
192
204
193
- def get_html_tree (html ) :
205
+ def get_html_tree (html : str ) -> Element :
194
206
"""
195
207
Given the HTML string, returns a LXML tree object. The tree is wrapped in
196
208
<div> elements if it doesn't have a top level tag or parsing would
197
209
otherwise result in an error. The wrapping can be later removed with
198
210
strip_wrapping().
199
211
"""
200
212
parser = lxml .html .HTMLParser (encoding = "utf-8" )
201
- html = html .encode ("utf8" )
213
+ htmlb = html .encode ("utf8" )
202
214
203
215
try :
204
- tree = lxml .html .fromstring (html , parser = parser )
216
+ tree = lxml .html .fromstring (htmlb , parser = parser )
205
217
except lxml .etree .Error :
206
218
# E.g. empty document. Use dummy <div>
207
219
tree = lxml .html .fromstring ("<div></div>" )
208
220
209
221
# If the document doesn't start with a top level tag, wrap it with a <div>
210
222
# that will be later stripped out for consistent behavior.
211
223
if tree .tag not in lxml .html .defs .top_level_tags :
212
- html = b"<div>" + html + b"</div>"
213
- tree = lxml .html .fromstring (html , parser = parser )
224
+ htmlb = b"<div>" + htmlb + b"</div>"
225
+ tree = lxml .html .fromstring (htmlb , parser = parser )
214
226
215
227
# HACK for Outlook emails, where tags like <o:p> are rendered as <p>. We
216
228
# can generally ignore these tags so we replace them with <span>, which
@@ -229,7 +241,7 @@ def get_html_tree(html):
229
241
return tree
230
242
231
243
232
- def strip_wrapping (html ) :
244
+ def strip_wrapping (html : str ) -> str :
233
245
"""
234
246
Remove the wrapping that might have resulted when using get_html_tree().
235
247
"""
@@ -238,7 +250,7 @@ def strip_wrapping(html):
238
250
return html .strip ()
239
251
240
252
241
- def render_html_tree (tree ) :
253
+ def render_html_tree (tree : Element ) -> str :
242
254
"""
243
255
Render the given HTML tree, and strip any wrapping that was applied in
244
256
get_html_tree().
@@ -257,13 +269,15 @@ def render_html_tree(tree):
257
269
return strip_wrapping (html )
258
270
259
271
260
def is_indentation_element(element: Element) -> bool:
    """
    Tell whether the given element introduces a level of quote indentation.

    Returns True only when the element's tag is a string that equals
    "blockquote" when lowercased. Elements whose tag is not a string are
    never treated as indentation elements.
    """
    tag = element.tag
    # NOTE(review): non-string tags presumably come from lxml comment /
    # processing-instruction nodes -- confirm against the lxml docs.
    return isinstance(tag, str) and tag.lower() == "blockquote"
264
276
265
277
266
- def tree_token_generator (el , indentation_level = 0 ):
278
+ def tree_token_generator (
279
+ el : Element , indentation_level : int = 0
280
+ ) -> Iterator [None | tuple [Element , str , int ] | str ]:
267
281
"""
268
282
Yield tokens for the given HTML element as follows:
269
283
@@ -296,7 +310,13 @@ def tree_token_generator(el, indentation_level=0):
296
310
yield el .tail
297
311
298
312
299
- def tree_line_generator (el , max_lines = None ):
313
+ def tree_line_generator (
314
+ el : Element , max_lines : int | None = None
315
+ ) -> Iterator [
316
+ tuple [
317
+ tuple [ElementRef , str ] | None , tuple [ElementRef , str ] | None , int , str
318
+ ]
319
+ ]:
300
320
"""
301
321
Iterate through an LXML tree and yield a tuple per line.
302
322
@@ -327,7 +347,7 @@ def tree_line_generator(el, max_lines=None):
327
347
- ((<Element blockquote>, 'end'), (<Element div>, 'end'), 0, 'world')
328
348
"""
329
349
330
- def _trim_spaces (text ) :
350
+ def _trim_spaces (text : str ) -> str :
331
351
return MULTIPLE_WHITESPACE_RE .sub (" " , text ).strip ()
332
352
333
353
counter = 1
@@ -341,7 +361,7 @@ def _trim_spaces(text):
341
361
start_ref = None
342
362
343
363
# The indentation level at the start of the line.
344
- start_indentation_level = None
364
+ start_indentation_level = 0
345
365
346
366
for token in tree_token_generator (el ):
347
367
if token is None :
@@ -393,12 +413,17 @@ def _trim_spaces(text):
393
413
else :
394
414
raise RuntimeError (f"invalid token: { token } " )
395
415
416
+ """
417
+ TODO: wrong type, would trigger error if reached.
396
418
line = _trim_spaces(line)
397
419
if line:
398
420
yield line
421
+ """
399
422
400
423
401
- def indented_tree_line_generator (el , max_lines = None ):
424
+ def indented_tree_line_generator (
425
+ el : Element , max_lines : int | None = None
426
+ ) -> Iterator [tuple [ElementRef | None , ElementRef | None , str ]]:
402
427
r"""
403
428
Like tree_line_generator, but yields tuples (start_ref, end_ref, line),
404
429
where the line already takes the indentation into account by having "> "
@@ -413,14 +438,19 @@ def indented_tree_line_generator(el, max_lines=None):
413
438
yield start_ref , end_ref , "> " * indentation_level + full_line
414
439
415
440
416
def get_line_info(
    tree: Element, max_lines: int | None = None
) -> tuple[list[ElementRef | None], list[ElementRef | None], list[str]]:
    """
    Shortcut for indented_tree_line_generator() that returns an array of
    start references, an array of corresponding end references (see
    tree_line_generator() docs), and an array of corresponding lines.

    The three returned lists are parallel: index i of each list describes
    the i-th line yielded by indented_tree_line_generator(). max_lines is
    passed through to the generator unchanged. If the generator yields
    nothing, three empty lists are returned.
    """
    start_refs: list[ElementRef | None] = []
    end_refs: list[ElementRef | None] = []
    lines: list[str] = []

    # Build real lists rather than transposing with zip(*...): zip() yields
    # tuples, which would not match the declared return type and would be
    # inconsistent with the lists returned for an empty document.
    for start_ref, end_ref, line in indented_tree_line_generator(
        tree, max_lines=max_lines
    ):
        start_refs.append(start_ref)
        end_refs.append(end_ref)
        lines.append(line)

    return start_refs, end_refs, lines
0 commit comments