Skip to content

Commit 297dec8

Browse files
NastyBoget, Travvy88, Alexander Golodkov, dronperminov, oksidgy
authored
update master (#408)
* TLDR-549 delete custom loggers (#393)
* changed dedoc-utils version (#394)
* remove PdfTxtlayerReader from TxtLayerDetector (#395)
* Make train dataset API separated (#396)
* TLDR-584 words boldness for images (#397)
* TLDR-584 text boldness for words in images
* TLDR-582 fix pdf_txtlayer_reader bboxes for lines (labeling_mode="true") (#399)
* TLDR-585 added TEDS table benchmark (#398)
* TLDR-538 tesseract postprocessing (#388)
* TLDR-590 fix code style in scripts directory (#400)
* Add job to meet requirements of the develop branch (#401)
* TLDR-602 some fixes of web form (#402)
* Translate labeling web pages into English (#403)
* TLDR-556 tutorial how to add a new structure type (#405)
* TLDR-182 eml reader bug fix (#406)
* new version (#407)
---------
Co-authored-by: Nikita Shevtsov <[email protected]>
Co-authored-by: Alexander Golodkov <[email protected]>
Co-authored-by: Andrew Perminov <[email protected]>
Co-authored-by: Oksana Belyaeva <[email protected]>
1 parent 1888659 commit 297dec8

File tree

234 files changed

+64121
-1487
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

234 files changed

+64121
-1487
lines changed

Diff for: .flake8

+3-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ max-line-length = 160
44
max-complexity = 13
55
inline-quotes = "
66
7-
application-import-names = dedoc, tests
7+
application-import-names = dedoc, tests, scripts, train_dataset
88
import-order-style = pycharm
99
1010
exclude =
@@ -14,8 +14,6 @@ exclude =
1414
.github,
1515
*__init__.py,
1616
resources,
17-
dedoc/scripts,
18-
examples,
1917
venv,
2018
build,
2119
dedoc.egg-info
@@ -24,3 +22,5 @@ exclude =
2422
# ANN101 - type annotations for self
2523
ignore =
2624
ANN101
25+
per-file-ignores =
26+
scripts/*:T201

Diff for: .github/workflows/docs.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,4 @@ jobs:
3232
cd docs/source/_static/code_examples
3333
python dedoc_usage_tutorial.py
3434
python dedoc_add_new_doc_type_tutorial.py
35+
python dedoc_add_new_structure_type_tutorial.py

Diff for: .github/workflows/test_labeling.yaml

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
name: CI
2+
3+
# Controls when the action will run.
4+
on:
5+
pull_request:
6+
branches:
7+
- develop
8+
- master
9+
paths-ignore:
10+
- 'VERSION'
11+
- 'docs/source/changelog.rst'
12+
push:
13+
branches:
14+
- develop
15+
- master
16+
paths-ignore:
17+
- 'VERSION'
18+
- 'docs/source/changelog.rst'
19+
# Allows you to run this workflow manually from the Actions tab
20+
workflow_dispatch:
21+
22+
jobs:
23+
labeling:
24+
runs-on: ubuntu-latest
25+
steps:
26+
- name: Checkout repo
27+
uses: actions/checkout@v2
28+
- name: Set up Python ${{ matrix.python-version }}
29+
uses: actions/setup-python@v2
30+
with:
31+
python-version: '3.9'
32+
- name: Run tests for labeling
33+
run: |
34+
test="true" docker-compose -f labeling/docker-compose.yml up --build --exit-code-from test

Diff for: .github/workflows/test_skip.yaml

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
name: Skip CI
2+
3+
on:
4+
push:
5+
branches:
6+
- new_version
7+
paths:
8+
- 'VERSION'
9+
- 'docs/source/changelog.rst'
10+
workflow_dispatch:
11+
12+
jobs:
13+
pipeline:
14+
runs-on: ubuntu-latest
15+
steps:
16+
- name: Skip tests (only VERSION and changelog have been changed)
17+
run: echo "This is used to meet the requirements of pull-request to the develop branch (pipeline should pass)"

Diff for: .gitignore

-1
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@ ENV/
9696
[Ll]ib
9797
[Ll]ib64
9898
[Ll]ocal
99-
[Ss]cripts
10099
pyvenv.cfg
101100
.venv
102101
pip-selfcheck.json

Diff for: .pre-commit-config.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ repos:
33
rev: 5.0.4
44
hooks:
55
- id: flake8
6-
exclude: \.github|.*__init__\.py|resources|dedoc/scripts|examples|docs|venv|build|dedoc\.egg-info
6+
exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info
77
args:
88
- "--config=.flake8"
99
additional_dependencies: [

Diff for: Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ RUN echo "__version__ = \"$(cat /dedoc_root/VERSION)\"" > /dedoc_root/dedoc/vers
2020
ADD tests /dedoc_root/tests
2121
ADD resources /dedoc_root/resources
2222

23-
CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"]
23+
CMD ["python3", "/dedoc_root/dedoc/main.py"]

Diff for: VERSION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.0
1+
2.1

Diff for: dedoc/api/api_utils.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def json2html(text: str, paragraph: TreeNode, tables: Optional[List[Table]], tab
133133
if tables is not None and len(tables) > 0:
134134
text += "<h3> Tables: </h3>"
135135
for table in tables:
136-
text += __table2html(table, table2id)
136+
text += table2html(table, table2id)
137137
text += "<p>&nbsp;</p>"
138138
return text
139139

@@ -201,7 +201,7 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str:
201201
return text.replace("\n", "<br>")
202202

203203

204-
def __table2html(table: Table, table2id: Dict[str, int]) -> str:
204+
def table2html(table: Table, table2id: Dict[str, int]) -> str:
205205
uid = table.metadata.uid
206206
text = f"<h4> table {table2id[uid]}:</h4>"
207207
text += f'<table border="1" id={uid} style="border-collapse: collapse; width: 100%;">\n<tbody>\n'

Diff for: dedoc/api/web/index.html

+10-6
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
<style>
1111
.parameters {padding: 5px; box-shadow: 1px 1px 2px #bbbbbb; width: 70%}
1212
.body {margin-left: 20%}
13-
details > summary {font-style: italic; cursor: pointer}
13+
details > summary {font-style: italic; cursor: pointer; display: list-item;}
14+
.child.max {padding-left: 5px; flex: 1}
15+
.parent {display: flex}
1416
</style>
1517
</head>
1618

@@ -92,11 +94,13 @@ <h4>Attachments handling</h4>
9294

9395

9496
<div class="parameters">
95-
<h4>Tables handling</h4>
97+
<h4>Tables handling </h4>
9698
<details><summary>need_pdf_table_analysis, orient_analysis_cells, orient_cell_angle</summary>
9799
<br>
98100
<p>
99-
<label><input name="need_pdf_table_analysis" type="checkbox" value="true" checked> need_pdf_table_analysis</label>
101+
<label>
102+
<input type="hidden" name="need_pdf_table_analysis" value="false">
103+
<input type="checkbox" name="need_pdf_table_analysis" value="true" checked> need_pdf_table_analysis</label>
100104
</p>
101105

102106
<p>
@@ -188,9 +192,9 @@ <h4>Other formats handling</h4>
188192
</div>
189193

190194
<br>
191-
<div class="row">
192-
<div class="col-md-3"><input type=file name=file class="btn btn-default" data-buttonText="Choose file"></div>
193-
<div class="col-md-2"><input type=submit value=Upload class="btn btn-default"></div>
195+
<div class="parent">
196+
<div class="child"><input type=file name=file class="btn btn-default" data-buttonText="Choose file"></div>
197+
<div class="child max"><input type=submit value=Upload class="btn btn-default"></div>
194198
</div>
195199

196200
</form>

Diff for: dedoc/api/web/train_dataset/info_labeling_mode.html

-38
This file was deleted.

Diff for: dedoc/config.py

+5-15
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
1-
import importlib.util
21
import logging
32
import os
43
import sys
5-
from typing import Any, Optional
64

75
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s")
86

@@ -13,6 +11,7 @@
1311
# -----------------------------------------RESOURCES PATH SETTINGS----------------------------------------------------
1412
resources_path=RESOURCES_PATH,
1513
intermediate_data_path=os.path.join(RESOURCES_PATH, "datasets"),
14+
table_path="/tmp/tables",
1615

1716
# -----------------------------------------COMMON DEBUG SETTINGS----------------------------------------------------
1817
debug_mode=DEBUG_MODE,
@@ -66,20 +65,11 @@ def get_instance(cls: "Configuration") -> "Configuration":
6665

6766
return cls.__instance
6867

69-
def __init_config(self, args: Optional[Any] = None) -> None:
70-
if args is not None and args.config_path is not None:
71-
spec = importlib.util.spec_from_file_location("config_module", args.config_path)
72-
config_module = importlib.util.module_from_spec(spec)
73-
spec.loader.exec_module(config_module)
74-
self.__config = config_module._config
75-
else:
68+
def get_config(self) -> dict:
69+
if self.__config is None:
7670
self.__config = _config
77-
78-
def get_config(self, args: Optional[Any] = None) -> dict:
79-
if self.__config is None or args is not None:
80-
self.__init_config(args)
8171
return self.__config
8272

8373

84-
def get_config(args: Optional[Any] = None) -> dict:
85-
return Configuration.get_instance().get_config(args)
74+
def get_config() -> dict:
75+
return Configuration.get_instance().get_config()

Diff for: dedoc/data_structures/hierarchy_level.py

+27-19
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from functools import total_ordering
22
from typing import Optional
33

4+
import numpy as np
5+
46

57
@total_ordering
68
class HierarchyLevel:
@@ -12,7 +14,9 @@ class HierarchyLevel:
1214
- level_1 defines primary importance (e.g. root - level_1=0, header - level_1=1, etc.);
1315
- level_2 defines the level inside lines of equal type (e.g. for list items - "1." - level_2=1, "1.1." - level_2=2, etc.).
1416
15-
For the least important lines like raw_text both levels are None.
17+
For the least important lines (line_type=raw_text) both levels are None.
18+
19+
See the :ref:`hierarchy level description <add_structure_type_hierarchy_level>` for more details.
1620
"""
1721
root = "root"
1822
toc = "toc"
@@ -46,43 +50,47 @@ def __is_defined(self, other: "HierarchyLevel") -> bool:
4650
def __eq__(self, other: "HierarchyLevel") -> bool:
4751
"""
4852
Defines the equality of two hierarchy levels:
49-
- two raw text lines or lines with unknown type are equal;
5053
- two lines with equal level_1, level_2 are equal.
54+
- if any of the levels is None, its value is treated as +inf (all such infinities compare equal).
55+
56+
:param other: other hierarchy level
57+
:return: whether current hierarchy level == other hierarchy level
5158
"""
5259
if not isinstance(other, HierarchyLevel):
5360
return False
5461

55-
if self.__is_defined(other) and (self.level_1, self.level_2) == (other.level_1, other.level_2):
56-
return True
57-
if self.line_type == HierarchyLevel.raw_text and other.line_type == HierarchyLevel.raw_text:
58-
return True
59-
if self.line_type == HierarchyLevel.unknown and other.line_type == HierarchyLevel.unknown:
60-
return True
61-
return False
62+
level_1, level_2 = self.__to_number(self.level_1), self.__to_number(self.level_2)
63+
other_level_1, other_level_2 = self.__to_number(other.level_1), self.__to_number(other.level_2)
64+
return (level_1, level_2) == (other_level_1, other_level_2)
6265

6366
def __lt__(self, other: "HierarchyLevel") -> bool:
6467
"""
6568
Defines the comparison of hierarchy levels:
66-
- line1 < line2 if (level_1, level_2) of line1 <= (level_1, level_2) of line2;
67-
- line1 < line2 if line2 is raw text or unknown, and line1 has another type.
69+
- current level < other level if (level_1, level_2) < other (level_1, level_2);
70+
- if any of the levels is None, its value is treated as +inf (all such infinities compare equal).
6871
69-
Else line1 >= line2.
70-
71-
:param other: hierarchy level of the line2
72+
:param other: other hierarchy level
73+
:return: whether current hierarchy level < other hierarchy level
7274
"""
75+
# all not None
7376
if self.__is_defined(other):
7477
return (self.level_1, self.level_2) < (other.level_1, other.level_2)
78+
79+
# all None
7580
if self.level_1 is None and self.level_2 is None and other.level_1 is None and other.level_2 is None:
7681
return False
77-
if (self.level_1 is None or self.level_2 is None) and (other.level_1 is not None or other.level_2 is not None):
78-
return False
79-
if (self.level_1 is not None or self.level_2 is not None) and (other.level_1 is None or other.level_2 is None):
80-
return True
81-
return (self.level_1, self.level_2) < (other.level_1, other.level_2)
82+
83+
level_1, level_2 = self.__to_number(self.level_1), self.__to_number(self.level_2)
84+
other_level_1, other_level_2 = self.__to_number(other.level_1), self.__to_number(other.level_2)
85+
86+
return (level_1, level_2) < (other_level_1, other_level_2)
8287

8388
def __str__(self) -> str:
8489
return f"HierarchyLevel(level_1={self.level_1}, level_2={self.level_2}, can_be_multiline={self.can_be_multiline}, line_type={self.line_type})"
8590

91+
def __to_number(self, x: Optional[int]) -> int:
92+
return np.inf if x is None else x
93+
8694
def is_raw_text(self) -> bool:
8795
"""
8896
Check if the line is raw text.

Diff for: dedoc/dedoc_manager.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from dedoc.data_structures import ParsedDocument, UnstructuredDocument
1111
from dedoc.manager_config import get_manager_config
1212
from dedoc.metadata_extractors import BaseMetadataExtractor
13-
from dedoc.train_dataset.train_dataset_utils import get_path_original_documents, save_line_with_meta
13+
from dedoc.utils.train_dataset_utils import get_path_original_documents, save_line_with_meta
1414
from dedoc.utils.utils import get_unique_name
1515

1616

@@ -114,7 +114,7 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str])
114114
self.logger.info(f"Extract structure from file {file_name}")
115115

116116
if self.config.get("labeling_mode", False):
117-
self.__save(os.path.join(tmp_dir, unique_filename), unstructured_document)
117+
self.__save(converted_file_path, unstructured_document)
118118

119119
# Step 5 - Form the output structure
120120
parsed_document = self.structure_constructor.construct(document=unstructured_document, parameters=parameters)
@@ -141,5 +141,6 @@ def __init_parameters(self, file_path: str, parameters: Optional[dict]) -> dict:
141141
return result_parameters
142142

143143
def __save(self, file_path: str, classified_document: UnstructuredDocument) -> None:
144+
self.logger.info(f'Save document lines to {self.config["intermediate_data_path"]}')
144145
save_line_with_meta(lines=classified_document.lines, config=self.config, original_document=os.path.basename(file_path))
145146
shutil.copy(file_path, os.path.join(get_path_original_documents(self.config), os.path.basename(file_path)))

Diff for: dedoc/main.py

+3-22
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,7 @@
1-
import argparse
2-
31
from dedoc.api.dedoc_api import get_api, run_api # noqa
4-
from dedoc.config import Configuration, get_config
5-
6-
7-
def main() -> None:
8-
run_api(get_api())
2+
from dedoc.config import Configuration
93

104

115
if __name__ == "__main__":
12-
parser_config = argparse.ArgumentParser()
13-
parser_config.add_argument("-c", "--config_path", help="path to configuration file")
14-
parser_config.add_argument("-m", "--module", help="Only for tests")
15-
parser_config.add_argument("-f", "--test_files", metavar="VALUE", nargs="*", help="Only for tests")
16-
parser_config.add_argument("-v", "--unitest_verbose_mode", nargs="?", help="to enable verbose mode of unittest. Only for tests")
17-
18-
args_config = parser_config.parse_args()
19-
Configuration.get_instance().get_config(args_config)
20-
config = get_config()
21-
22-
if config.get("labeling_mode", False):
23-
from api.train_dataset.train_dataset_api import run_special_api # noqa
24-
run_special_api()
25-
else:
26-
main()
6+
Configuration.get_instance().get_config()
7+
run_api(get_api())

0 commit comments

Comments
 (0)