1
+ import os .path
1
2
from typing import List , Optional , Tuple
2
3
3
4
from dedocutils .data_structures import BBox
@@ -62,13 +63,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
62
63
warnings = []
63
64
64
65
with tempfile .TemporaryDirectory () as tmp_dir :
65
- lines , tables , tables_on_images , attachments , document_metadata = self .__extract (
66
- path = file_path ,
67
- parameters = parameters ,
68
- warnings = warnings ,
69
- tmp_dir = tmp_dir
70
- )
71
- lines = self .linker .link_objects (lines = lines , tables = tables_on_images , images = attachments )
66
+ lines , tables , attachments , document_metadata = self .__extract (path = file_path , parameters = parameters , warnings = warnings , tmp_dir = tmp_dir )
72
67
73
68
if get_param_with_attachments (parameters ) and self .attachment_extractor .can_extract (file_path ):
74
69
attachments += self .attachment_extractor .extract (file_path = file_path , parameters = parameters )
@@ -79,14 +74,15 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
79
74
80
75
return self ._postprocess (result )
81
76
82
- def __extract (self , path : str , parameters : dict , warnings : list , tmp_dir : str )\
83
- -> Tuple [List [LineWithMeta ], List [Table ], List [ScanTable ], List [ PdfImageAttachment ], Optional [dict ]]:
77
+ def __extract (self , path : str , parameters : dict , warnings : List [ str ] , tmp_dir : str )\
78
+ -> Tuple [List [LineWithMeta ], List [Table ], List [PdfImageAttachment ], Optional [dict ]]:
84
79
import math
85
80
from dedoc .utils .pdf_utils import get_pdf_page_count
86
81
from dedoc .utils .utils import calculate_file_hash
87
82
from dedoc .utils .parameter_utils import get_param_page_slice , get_param_with_attachments
83
+ from dedoc .utils .parameter_utils import get_param_need_gost_frame_analysis
88
84
89
- all_lines , all_tables , all_tables_on_images , all_attached_images = [], [], [], []
85
+ all_lines , all_tables , all_scan_tables , all_attached_images = [], [], [], []
90
86
with_attachments = get_param_with_attachments (parameters )
91
87
document_metadata = None
92
88
@@ -104,40 +100,70 @@ def __extract(self, path: str, parameters: dict, warnings: list, tmp_dir: str)\
104
100
document_metadata ["last_page" ] = last_page
105
101
106
102
if empty_page_limit :
107
- return all_lines , all_tables , all_tables_on_images , all_attached_images , document_metadata
103
+ return all_lines , all_tables , all_attached_images , document_metadata
104
+
105
+ remove_gost_frame = get_param_need_gost_frame_analysis (parameters )
106
+ gost_json_path = self .__save_gost_frame_boxes_to_json (first_page = first_page , last_page = last_page , page_count = page_count , tmp_dir = tmp_dir , path = path ) \
107
+ if remove_gost_frame else ""
108
108
109
109
# in java tabby reader page numeration starts with 1, end_page is included
110
110
first_tabby_page = first_page + 1 if first_page is not None else 1
111
111
last_tabby_page = page_count if (last_page is None ) or (last_page is not None and last_page > page_count ) else last_page
112
112
self .logger .info (f"Reading PDF pages from { first_tabby_page } to { last_tabby_page } " )
113
- document = self .__process_pdf (path = path , start_page = first_tabby_page , end_page = last_tabby_page , tmp_dir = tmp_dir )
113
+ document = self .__process_pdf (path = path ,
114
+ start_page = first_tabby_page ,
115
+ end_page = last_tabby_page ,
116
+ tmp_dir = tmp_dir ,
117
+ gost_json_path = gost_json_path ,
118
+ remove_frame = remove_gost_frame )
114
119
115
120
pages = document .get ("pages" , [])
116
121
for page in pages :
117
122
page_lines = self .__get_lines_with_location (page , file_hash )
118
123
if page_lines :
119
124
all_lines .extend (page_lines )
120
- page_tables , table_on_images = self .__get_tables (page )
121
- assert len (page_tables ) == len (table_on_images )
122
- if page_tables :
123
- all_tables .extend (page_tables )
124
- all_tables_on_images .extend (table_on_images )
125
+ scan_tables = self .__get_tables (page )
126
+ all_scan_tables .extend (scan_tables )
125
127
126
128
attached_images = self .__get_attached_images (page = page , parameters = parameters , path = path ) if with_attachments else []
127
129
if attached_images :
128
130
all_attached_images .extend (attached_images )
129
131
130
- return all_lines , all_tables , all_tables_on_images , all_attached_images , document_metadata
132
+ mp_tables = self .table_recognizer .convert_to_multipages_tables (all_scan_tables , lines_with_meta = all_lines )
133
+ all_lines = self .linker .link_objects (lines = all_lines , tables = mp_tables , images = all_attached_images )
134
+
135
+ tables = [scan_table .to_table () for scan_table in mp_tables ]
136
+
137
+ return all_lines , tables , all_attached_images , document_metadata
138
+
139
+ def __save_gost_frame_boxes_to_json (self , first_page : Optional [int ], last_page : Optional [int ], page_count : int , path : str , tmp_dir : str ) -> str :
140
+ from joblib import Parallel , delayed
141
+ import json
142
+
143
+ first_page = 0 if first_page is None or first_page < 0 else first_page
144
+ last_page = page_count if (last_page is None ) or (last_page is not None and last_page > page_count ) else last_page
145
+ images = self ._get_images (path , first_page , last_page )
146
+
147
+ gost_analyzed_images = Parallel (n_jobs = self .config ["n_jobs" ])(delayed (self .gost_frame_recognizer .rec_and_clean_frame )(image ) for image in images )
148
+
149
+ result_dict = {
150
+ page_number : {** page_data [1 ].to_dict (), ** {"original_image_width" : page_data [2 ][1 ], "original_image_height" : page_data [2 ][0 ]}}
151
+ for page_number , page_data in enumerate (gost_analyzed_images , start = first_page )
152
+ }
131
153
132
- def __get_tables (self , page : dict ) -> Tuple [List [Table ], List [ScanTable ]]:
154
+ result_json_path = os .path .join (tmp_dir , "gost_frame_bboxes.json" )
155
+ with open (result_json_path , "w" ) as f :
156
+ json .dump (result_dict , f )
157
+
158
+ return result_json_path
159
+
160
+ def __get_tables (self , page : dict ) -> List [ScanTable ]:
133
161
import uuid
134
162
from dedoc .data_structures .cell_with_meta import CellWithMeta
135
163
from dedoc .data_structures .concrete_annotations .bbox_annotation import BBoxAnnotation
136
164
from dedoc .data_structures .line_metadata import LineMetadata
137
- from dedoc .data_structures .table_metadata import TableMetadata
138
165
139
- tables = []
140
- tables_on_image = []
166
+ scan_tables = []
141
167
page_number = page ["number" ]
142
168
page_width = int (page ["width" ])
143
169
page_height = int (page ["height" ])
@@ -149,7 +175,7 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
149
175
cell_properties = table ["cell_properties" ]
150
176
assert len (rows ) == len (cell_properties )
151
177
152
- result_cells = []
178
+ cells = []
153
179
for num_row , row in enumerate (rows ):
154
180
assert len (row ) == len (cell_properties [num_row ])
155
181
@@ -161,20 +187,22 @@ def __get_tables(self, page: dict) -> Tuple[List[Table], List[ScanTable]]:
161
187
for c in cell_blocks :
162
188
cell_bbox = BBox (x_top_left = int (c ["x_top_left" ]), y_top_left = int (c ["y_top_left" ]), width = int (c ["width" ]), height = int (c ["height" ]))
163
189
annotations .append (BBoxAnnotation (c ["start" ], c ["end" ], cell_bbox , page_width = page_width , page_height = page_height ))
190
+ """
191
+ TODO: change to Cell class after tabby can return cell coordinates. Then set type Cell in class "ScanTable"
192
+ https://jira.intra.ispras.ru/browse/TLDR-851
193
+ """
164
194
165
195
result_row .append (CellWithMeta (
166
196
lines = [LineWithMeta (line = cell ["text" ], metadata = LineMetadata (page_id = page_number , line_id = 0 ), annotations = annotations )],
167
197
colspan = cell_properties [num_row ][num_col ]["col_span" ],
168
198
rowspan = cell_properties [num_row ][num_col ]["row_span" ],
169
199
invisible = bool (cell_properties [num_row ][num_col ]["invisible" ])
170
200
))
171
- result_cells .append (result_row )
201
+ cells .append (result_row )
172
202
173
- table_name = str (uuid .uuid4 ())
174
- tables .append (Table (cells = result_cells , metadata = TableMetadata (page_id = page_number , uid = table_name )))
175
- tables_on_image .append (ScanTable (page_number = page_number , matrix_cells = None , bbox = table_bbox , name = table_name , order = order ))
203
+ scan_tables .append (ScanTable (page_number = page_number , matrix_cells = cells , bbox = table_bbox , name = str (uuid .uuid4 ()), order = order ))
176
204
177
- return tables , tables_on_image
205
+ return scan_tables
178
206
179
207
def __get_attached_images (self , page : dict , parameters : dict , path : str ) -> List [PdfImageAttachment ]:
180
208
import os
@@ -291,10 +319,20 @@ def __jar_path(self) -> str:
291
319
import os
292
320
return os .environ .get ("TABBY_JAR" , self .default_config ["JAR_PATH" ])
293
321
294
- def __run (self , path : str , tmp_dir : str , encoding : str = "utf-8" , start_page : int = None , end_page : int = None ) -> bytes :
322
+ def __run (self ,
323
+ path : str ,
324
+ tmp_dir : str ,
325
+ encoding : str = "utf-8" ,
326
+ start_page : int = None ,
327
+ end_page : int = None ,
328
+ remove_frame : bool = False ,
329
+ gost_json_path : str = ""
330
+ ) -> bytes :
295
331
import subprocess
296
332
297
333
args = ["java" ] + ["-jar" , self .__jar_path (), "-i" , path , "-tmp" , f"{ tmp_dir } /" ]
334
+ if remove_frame :
335
+ args += ["-rf" , gost_json_path ]
298
336
if start_page is not None and end_page is not None :
299
337
args += ["-sp" , str (start_page ), "-ep" , str (end_page )]
300
338
try :
@@ -307,11 +345,18 @@ def __run(self, path: str, tmp_dir: str, encoding: str = "utf-8", start_page: in
307
345
except subprocess .CalledProcessError as e :
308
346
raise TabbyPdfError (e .stderr .decode (encoding ))
309
347
310
- def __process_pdf (self , path : str , tmp_dir : str , start_page : int = None , end_page : int = None ) -> dict :
348
+ def __process_pdf (self ,
349
+ path : str ,
350
+ tmp_dir : str ,
351
+ start_page : int = None ,
352
+ end_page : int = None ,
353
+ gost_json_path : str = "" ,
354
+ remove_frame : bool = False ) -> dict :
311
355
import json
312
356
import os
313
357
314
- self .__run (path = path , start_page = start_page , end_page = end_page , tmp_dir = tmp_dir )
358
+ self .__run (path = path , start_page = start_page , end_page = end_page , tmp_dir = tmp_dir , remove_frame = remove_frame , gost_json_path = gost_json_path )
359
+
315
360
with open (os .path .join (tmp_dir , "data.json" ), "r" ) as response :
316
361
document = json .load (response )
317
362
0 commit comments