1
1
# HTML utils
2
+ import enum
2
3
from collections .abc import Iterator
3
4
from typing import TYPE_CHECKING , TypeAlias
4
5
10
11
11
12
from ._patterns import FORWARD_LINE , FORWARD_STYLES , MULTIPLE_WHITESPACE_RE
12
13
14
+
15
+ class Position (enum .Enum ):
16
+ Begin = "begin"
17
+ End = "end"
18
+
19
+
13
20
Element : TypeAlias = "HtmlElement"
14
- ElementRef = tuple ["Element" , str ]
21
+ ElementRef = tuple ["Element" , Position ]
15
22
16
23
INLINE_TAGS = [
17
24
"a" ,
31
38
"th" ,
32
39
]
33
40
34
- BEGIN = "begin"
35
- END = "end"
36
-
37
41
38
42
def trim_tree_after (element : Element , include_element : bool = True ):
39
43
"""
@@ -184,9 +188,9 @@ def slice_tree(
184
188
new_tree = tree
185
189
186
190
if start_ref :
187
- include_start = start_ref [1 ] == BEGIN
191
+ include_start = start_ref [1 ] is Position . Begin
188
192
if end_ref :
189
- include_end = end_ref [1 ] == END
193
+ include_end = end_ref [1 ] is Position . End
190
194
191
195
# If start_ref is the same as end_ref, and we don't include the element,
192
196
# we are removing the entire tree. We need to handle this separately,
@@ -283,14 +287,14 @@ def is_indentation_element(element: Element) -> bool:
283
287
284
288
def tree_token_generator (
285
289
el : Element , indentation_level : int = 0
286
- ) -> Iterator [None | tuple [Element , str , int ] | str ]:
290
+ ) -> Iterator [None | tuple [Element , Position , int ] | str ]:
287
291
"""
288
292
Yield tokens for the given HTML element as follows:
289
293
290
- - A tuple (LXML element, BEGIN , indentation_level)
294
+ - A tuple (LXML element, Begin , indentation_level)
291
295
- Text right after the start of the tag, or None.
292
296
- Recursively calls the token generator for all child objects
293
- - A tuple (LXML element, END , indentation_level)
297
+ - A tuple (LXML element, End , indentation_level)
294
298
- Text right after the end of the tag, or None.
295
299
"""
296
300
if not isinstance (el .tag , str ):
@@ -301,7 +305,7 @@ def tree_token_generator(
301
305
if is_indentation :
302
306
indentation_level += 1
303
307
304
- yield (el , BEGIN , indentation_level )
308
+ yield (el , Position . Begin , indentation_level )
305
309
306
310
yield el .text
307
311
@@ -311,7 +315,7 @@ def tree_token_generator(
311
315
if is_indentation :
312
316
indentation_level -= 1
313
317
314
- yield (el , END , indentation_level )
318
+ yield (el , Position . End , indentation_level )
315
319
316
320
yield el .tail
317
321
@@ -320,7 +324,10 @@ def tree_line_generator(
320
324
el : Element , max_lines : int | None = None
321
325
) -> Iterator [
322
326
tuple [
323
- tuple [ElementRef , str ] | None , tuple [ElementRef , str ] | None , int , str
327
+ tuple [ElementRef , Position ] | None ,
328
+ tuple [ElementRef , Position ] | None ,
329
+ int ,
330
+ str ,
324
331
]
325
332
]:
326
333
"""
@@ -343,14 +350,14 @@ def tree_line_generator(
343
350
344
351
For example, the HTML tree "<div>foo <span>bar</span><br>baz</div>" yields:
345
352
346
- - ((<Element div>, 'begin' ), (<Element br>, 'begin' ), 0, 'foo bar')
347
- - ((<Element br>, 'end' ), (<Element div>, 'end' ), 0, 'baz').
353
+ - ((<Element div>, Begin ), (<Element br>, Begin ), 0, 'foo bar')
354
+ - ((<Element br>, End ), (<Element div>, End ), 0, 'baz').
348
355
349
356
To illustrate the indentation level, the HTML tree
350
357
'<div><blockquote>hi</blockquote>world</div>' yields:
351
358
352
- - ((<Element blockquote>, 'begin' ), (<Element blockquote>, 'end' ), 1, 'hi')
353
- - ((<Element blockquote>, 'end' ), (<Element div>, 'end' ), 0, 'world')
359
+ - ((<Element blockquote>, Begin ), (<Element blockquote>, End ), 1, 'hi')
360
+ - ((<Element blockquote>, End ), (<Element div>, End ), 0, 'world')
354
361
"""
355
362
356
363
def _trim_spaces (text : str ) -> str :
@@ -378,11 +385,11 @@ def _trim_spaces(text: str) -> str:
378
385
379
386
tag_name = el .tag .lower ()
380
387
381
- line_break = tag_name == "br" and state == BEGIN
388
+ line_break = tag_name == "br" and state is Position . Begin
382
389
is_block = tag_name not in INLINE_TAGS
383
390
is_forward = (
384
391
is_block
385
- and state == BEGIN
392
+ and state is Position . Begin
386
393
and (style := el .attrib .get ("style" ))
387
394
and any (style_re .match (style ) for style_re in FORWARD_STYLES )
388
395
)
0 commit comments