Skip to content

Commit d9b385c

Browse files
committed
TLDR-748 fix tests
1 parent 11f1a3a commit d9b385c

25 files changed

+123
-85
lines changed

dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ def __get_tag(self, line: LineWithMeta, line_type: str) -> HierarchyLevel:
277277
if line_type == "litem": # TODO automatic list depth and merge list items from multiple lines
278278
return HierarchyLevel(None, None, False, HierarchyLevel.list_item)
279279

280-
return HierarchyLevel(None, None, True, line_type)
280+
return HierarchyLevel.create_unknown()
281281

282282
def __jar_path(self) -> str:
283283
import os

dedoc/readers/txt_reader/raw_text_reader.py

+1-7
Original file line numberDiff line numberDiff line change
@@ -108,15 +108,9 @@ def __get_starting_spacing(self, line: Optional[LineWithMeta]) -> int:
108108
return space_this.end() - space_this.start()
109109

110110
def __is_paragraph(self, line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> bool:
111-
from dedoc.data_structures.hierarchy_level import HierarchyLevel
112-
113-
if not line.metadata.tag_hierarchy_level.can_be_multiline and \
114-
line.metadata.tag_hierarchy_level.line_type not in (HierarchyLevel.raw_text, HierarchyLevel.unknown):
115-
return True
116111
space_this = self.__get_starting_spacing(line)
117112
space_prev = self.__get_starting_spacing(previous_line)
118-
return line.metadata.tag_hierarchy_level.line_type in (HierarchyLevel.raw_text, HierarchyLevel.unknown) \
119-
and not line.line.isspace() and space_this - space_prev >= 2
113+
return not line.line.isspace() and space_this - space_prev >= 2
120114

121115
def _postprocess(self, document: UnstructuredDocument) -> UnstructuredDocument:
122116
previous_line = None

dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,16 @@ def __get_patterns(self, parameters: dict) -> List[AbstractPattern]:
4343
from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern
4444
from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern
4545
from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern
46+
from dedoc.structure_extractors.patterns.tag_pattern import TagPattern
4647

4748
patterns = [
48-
TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1),
49-
TagListPattern(line_type=HierarchyLevel.list_item, level_1=2),
50-
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2),
51-
BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1),
52-
LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1),
53-
BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1),
49+
TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1, can_be_multiline=False),
50+
TagListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
51+
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
52+
BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False),
53+
LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False),
54+
BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False),
55+
TagPattern(line_type=HierarchyLevel.raw_text)
5456
]
5557
else:
5658
import json

dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py

+10-10
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,22 @@
88
from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.abstract_body_hierarchy_level_builder import \
99
AbstractBodyHierarchyLevelBuilder
1010
from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_digits_with_dots
11-
from dedoc.structure_extractors.patterns import BracketListPattern, BulletListPattern, DottedListPattern, LetterListPattern, TagHeaderPattern, TagListPattern
11+
from dedoc.structure_extractors.patterns import BracketListPattern, BulletListPattern, DottedListPattern, LetterListPattern, TagListPattern, TagPattern
1212

1313

1414
class DiplomaBodyBuilder(AbstractHierarchyLevelBuilder):
1515
named_item_keywords = ("введение", "заключение", "библиографический список", "список литературы", "глава", "приложение", "приложения")
1616

17-
def __int__(self) -> None:
17+
def __init__(self) -> None:
1818
super().__init__()
1919
self.digits_with_dots_regexp = regexps_digits_with_dots
2020
self.patterns = [
21-
TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1),
22-
TagListPattern(line_type=HierarchyLevel.list_item, level_1=2),
23-
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2),
24-
BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1),
25-
LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1),
26-
BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1),
21+
TagListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
22+
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
23+
BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False),
24+
LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False),
25+
BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False),
26+
TagPattern(line_type=HierarchyLevel.raw_text)
2727
]
2828

2929
def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, str]], init_hl_depth: int) -> List[LineWithMeta]:
@@ -66,10 +66,10 @@ def __handle_named_item(self, init_hl_depth: int, line: LineWithMeta, prediction
6666
if text.startswith(self.named_item_keywords):
6767
hierarchy_level = HierarchyLevel(init_hl_depth, 0, True, prediction)
6868
elif item_depth == -1:
69-
if previous_named_item_line and previous_named_item_line.metadata.hierarchy_level.line_type == "named_item":
69+
if previous_named_item_line:
7070
hierarchy_level = previous_named_item_line.metadata.hierarchy_level
7171
else:
72-
hierarchy_level = HierarchyLevel(init_hl_depth + 1, 0, True, prediction)
72+
hierarchy_level = HierarchyLevel(init_hl_depth, 0, True, prediction)
7373
else:
7474
hierarchy_level = HierarchyLevel(init_hl_depth, item_depth - 1, True, prediction)
7575
line.metadata.hierarchy_level = hierarchy_level
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern
2+
from dedoc.structure_extractors.patterns.bracket_roman_list_pattern import BracketRomanListPattern
23
from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern
34
from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern
45
from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern
6+
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern
7+
from dedoc.structure_extractors.patterns.roman_list_pattern import RomanListPattern
8+
from dedoc.structure_extractors.patterns.start_word_pattern import StartWordPattern
59
from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern
610
from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern
11+
from dedoc.structure_extractors.patterns.tag_pattern import TagPattern
712

8-
__all__ = ["BracketListPattern", "BulletListPattern", "DottedListPattern", "LetterListPattern", "TagHeaderPattern", "TagListPattern"]
13+
__all__ = ["BracketListPattern", "BracketRomanListPattern", "BulletListPattern", "DottedListPattern", "LetterListPattern", "RegexpPattern", "RomanListPattern",
14+
"StartWordPattern", "TagHeaderPattern", "TagListPattern", "TagPattern"]

dedoc/structure_extractors/patterns/abstract_pattern.py

+39-4
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,14 @@
88
class AbstractPattern(ABC):
99
__name = ""
1010

11-
def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
11+
def __init__(self,
12+
line_type: Optional[str] = None,
13+
level_1: Optional[int] = None,
14+
level_2: Optional[int] = None,
15+
can_be_multiline: Optional[bool] = None) -> None:
1216
self._line_type = line_type
1317
self._level_1 = level_1
14-
self._level_2 = level_2 if level_2 else 1
18+
self._level_2 = level_2
1519
self._can_be_multiline = can_be_multiline
1620

1721
@classmethod
@@ -22,6 +26,37 @@ def name(cls: "AbstractPattern") -> str:
2226
def match(self, line: LineWithMeta) -> bool:
2327
pass
2428

25-
@abstractmethod
2629
def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
27-
pass
30+
return HierarchyLevel(
31+
line_type=self._get_line_type(line),
32+
level_1=self._get_level_1(line),
33+
level_2=self._get_level_2(line),
34+
can_be_multiline=self._get_can_be_multiline(line)
35+
)
36+
37+
def _get_line_type(self, line: LineWithMeta) -> str:
38+
if self._line_type is not None:
39+
return self._line_type
40+
41+
if line.metadata.tag_hierarchy_level is None:
42+
raise ValueError(f"Cannot resolve line type: tag_hierarchy_level is missing and {self.__name} line_type isn't configured")
43+
44+
return line.metadata.tag_hierarchy_level.line_type
45+
46+
def _get_level_1(self, line: LineWithMeta) -> Optional[int]:
47+
if self._level_1 is not None:
48+
return self._level_1
49+
50+
return line.metadata.tag_hierarchy_level.level_1 if line.metadata.tag_hierarchy_level else None
51+
52+
def _get_level_2(self, line: LineWithMeta) -> Optional[int]:
53+
if self._level_2 is not None:
54+
return self._level_2
55+
56+
return line.metadata.tag_hierarchy_level.level_2 if line.metadata.tag_hierarchy_level else None
57+
58+
def _get_can_be_multiline(self, line: LineWithMeta) -> bool:
59+
if self._can_be_multiline is not None:
60+
return self._can_be_multiline
61+
62+
return line.metadata.tag_hierarchy_level.can_be_multiline if line.metadata.tag_hierarchy_level else True

dedoc/structure_extractors/patterns/bracket_list_pattern.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,9 @@
77
class BracketListPattern(RegexpPattern):
88
__name = "bracket_list"
99

10-
def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
10+
def __init__(self,
11+
line_type: Optional[str] = None,
12+
level_1: Optional[int] = None,
13+
level_2: Optional[int] = None,
14+
can_be_multiline: Optional[bool] = None) -> None:
1115
super().__init__(regexp=BracketPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)

dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,9 @@
77
class BracketRomanListPattern(RegexpPattern):
88
__name = "bracket_roman_list"
99

10-
def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
10+
def __init__(self,
11+
line_type: Optional[str] = None,
12+
level_1: Optional[int] = None,
13+
level_2: Optional[int] = None,
14+
can_be_multiline: Optional[bool] = None) -> None:
1115
super().__init__(regexp=BracketRomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)

dedoc/structure_extractors/patterns/bullet_list_pattern.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,9 @@
77
class BulletListPattern(RegexpPattern):
88
__name = "bullet_list"
99

10-
def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
10+
def __init__(self,
11+
line_type: Optional[str] = None,
12+
level_1: Optional[int] = None,
13+
level_2: Optional[int] = None,
14+
can_be_multiline: Optional[bool] = None) -> None:
1115
super().__init__(regexp=BulletPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)

dedoc/structure_extractors/patterns/dotted_list_pattern.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,20 @@
99
class DottedListPattern(RegexpPattern):
1010
__name = "dotted_list"
1111

12-
def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
12+
def __init__(self,
13+
line_type: Optional[str] = None,
14+
level_1: Optional[int] = None,
15+
level_2: Optional[int] = None,
16+
can_be_multiline: Optional[bool] = None) -> None:
1317
super().__init__(regexp=DottedPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
1418

1519
def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
16-
level_2 = self.__get_list_depth(line=line)
17-
return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=level_2, can_be_multiline=self._can_be_multiline)
20+
return HierarchyLevel(
21+
line_type=self._get_line_type(line),
22+
level_1=self._get_level_1(line),
23+
level_2=self.__get_list_depth(line=line),
24+
can_be_multiline=self._get_can_be_multiline(line)
25+
)
1826

1927
def __get_list_depth(self, line: LineWithMeta) -> int:
2028
text = line.line.strip().lower()

dedoc/structure_extractors/patterns/letter_list_pattern.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,9 @@
77
class LetterListPattern(RegexpPattern):
88
__name = "letter_list"
99

10-
def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
10+
def __init__(self,
11+
line_type: Optional[str] = None,
12+
level_1: Optional[int] = None,
13+
level_2: Optional[int] = None,
14+
can_be_multiline: Optional[bool] = None) -> None:
1115
super().__init__(regexp=AnyLetterPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)

dedoc/structure_extractors/patterns/regexp_pattern.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,23 @@
11
import re
22
from typing import Optional
33

4-
from dedoc.data_structures.hierarchy_level import HierarchyLevel
54
from dedoc.data_structures.line_with_meta import LineWithMeta
65
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern
76

87

98
class RegexpPattern(AbstractPattern):
109
__name = "regexp"
1110

12-
def __init__(self, regexp: str or re.Pattern, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
11+
def __init__(self,
12+
regexp: str or re.Pattern,
13+
line_type: Optional[str] = None,
14+
level_1: Optional[int] = None,
15+
level_2: Optional[int] = None,
16+
can_be_multiline: Optional[bool] = None) -> None:
1317
super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
1418
self._regexp = re.compile(regexp) if isinstance(regexp, str) else regexp
1519

1620
def match(self, line: LineWithMeta) -> bool:
1721
text = line.line.strip().lower()
1822
match = self._regexp.match(text)
1923
return match is not None
20-
21-
def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
22-
return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline)

dedoc/structure_extractors/patterns/roman_list_pattern.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,9 @@
77
class RomanListPattern(RegexpPattern):
88
__name = "roman_list"
99

10-
def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
10+
def __init__(self,
11+
line_type: Optional[str] = None,
12+
level_1: Optional[int] = None,
13+
level_2: Optional[int] = None,
14+
can_be_multiline: Optional[bool] = None) -> None:
1115
super().__init__(regexp=RomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)

dedoc/structure_extractors/patterns/start_word_pattern.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,21 @@
11
from typing import Optional
22

3-
from dedoc.data_structures.hierarchy_level import HierarchyLevel
43
from dedoc.data_structures.line_with_meta import LineWithMeta
54
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern
65

76

87
class StartWordPattern(AbstractPattern):
98
__name = "start_word"
109

11-
def __init__(self, start_word: str, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None:
10+
def __init__(self,
11+
start_word: str,
12+
line_type: Optional[str] = None,
13+
level_1: Optional[int] = None,
14+
level_2: Optional[int] = None,
15+
can_be_multiline: Optional[bool] = None) -> None:
1216
super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
1317
self.__start_word = start_word.strip().lower()
1418

1519
def match(self, line: LineWithMeta) -> bool:
1620
text = line.line.strip().lower()
1721
return text.startswith(self.__start_word)
18-
19-
def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
20-
return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline)

dedoc/structure_extractors/patterns/tag_header_pattern.py

-4
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,3 @@ def match(self, line: LineWithMeta) -> bool:
1212

1313
level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2
1414
return level_1 is not None and level_2 is not None
15-
16-
def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
17-
level_2 = line.metadata.tag_hierarchy_level.level_2
18-
return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=level_2, can_be_multiline=self._can_be_multiline)

dedoc/structure_extractors/patterns/tag_list_pattern.py

-4
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,3 @@ def match(self, line: LineWithMeta) -> bool:
1212

1313
level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2
1414
return level_1 is not None and level_2 is not None
15-
16-
def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
17-
level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2
18-
return HierarchyLevel(line_type=self._line_type, level_1=level_1, level_2=level_2, can_be_multiline=self._can_be_multiline)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from dedoc.data_structures.line_with_meta import LineWithMeta
2+
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern
3+
4+
5+
class TagPattern(AbstractPattern):
6+
__name = "tag"
7+
8+
def match(self, line: LineWithMeta) -> bool:
9+
return line.metadata.tag_hierarchy_level is not None

dedoc/structure_extractors/patterns/utils.py

-2
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ def get_pattern(pattern_parameters: dict) -> AbstractPattern:
66

77
assert isinstance(pattern_parameters, dict)
88
assert "name" in pattern_parameters, "Pattern parameter missing 'name'"
9-
assert "line_type" in pattern_parameters, "Pattern parameter missing 'line_type'"
10-
assert "level_1" in pattern_parameters, "Pattern parameter missing 'level_1'"
119

1210
supported_patterns = {pattern.name: pattern for pattern in patterns_module.__all__}
1311
pattern_class = supported_patterns.get(pattern_parameters["name"])

tests/api_tests/test_api_doctype_diploma.py

-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import os
2-
import unittest
32

43
from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
54

@@ -9,7 +8,6 @@ class TestApiDiploma(AbstractTestApiDocReader):
98
def _get_abs_path(self, file_name: str) -> str:
109
return os.path.join(self.data_directory_path, "diplomas", file_name)
1110

12-
@unittest.skip("TLDR-748")
1311
def test_diploma_pdf(self) -> None:
1412
file_name = "diploma.pdf"
1513
result = self._send_request(file_name, dict(document_type="diploma", pdf_with_text_layer="tabby"))
@@ -53,7 +51,6 @@ def test_diploma_pdf(self) -> None:
5351
self.assertEqual("БИБЛИОГРАФИЧЕСКИЙ СПИСОК", node["text"].strip())
5452
self.assertEqual("named_item", node["metadata"]["paragraph_type"])
5553

56-
@unittest.skip("TLDR-748")
5754
def test_diploma_docx(self) -> None:
5855
file_name = "diploma.docx"
5956
result = self._send_request(file_name, dict(document_type="diploma"))

0 commit comments

Comments
 (0)