From 2a3d0e6b6f1b0aea5b233d565d3784babe307eb4 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Thu, 12 Dec 2024 15:19:33 +0300 Subject: [PATCH] TLDR-861 remove orient cell params --- dedoc/api/api_args.py | 3 --- dedoc/api/web/index.html | 10 +--------- .../pdf_reader/data_classes/tables/scantable.py | 3 --- dedoc/readers/pdf_reader/pdf_base_reader.py | 4 ++-- .../pdf_txtlayer_reader/pdf_tabby_reader.py | 4 +--- dedoc/utils/parameter_utils.py | 17 ----------------- 6 files changed, 4 insertions(+), 37 deletions(-) diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index d1f7d5cf..f2b9e7c4 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -22,9 +22,6 @@ class QueryParameters: # tables handling need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf") table_type: str = Form("", description="Pipeline mode for table recognition") - orient_analysis_cells: str = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers") - orient_cell_angle: str = Form("90", enum=["90", "270"], - description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation') # pdf handling pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"], diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html index ede62117..c68963b6 100644 --- a/dedoc/api/web/index.html +++ b/dedoc/api/web/index.html @@ -101,21 +101,13 @@

Attachments handling

Tables handling

-
need_pdf_table_analysis, orient_analysis_cells, orient_cell_angle +
need_pdf_table_analysis

- -

- -

- -

- -

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py index e8010886..fa60aaeb 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py @@ -33,9 +33,6 @@ def check_on_cell_instance(self) -> bool: return False return True - def to_table(self) -> Table: - return super() - @staticmethod def get_cells_text(cells: List[List[CellWithMeta]]) -> List[List[str]]: return [[cell.get_text() for cell in row] for row in cells] diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 60ccd865..41e2990f 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -87,12 +87,12 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure ) lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse) - tables = [scan_table.to_table() for scan_table in scan_tables] + # tables = [scan_table.to_table() for scan_table in scan_tables] if params_for_parse.with_attachments and self.attachment_extractor.can_extract(file_path): attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters) - result = UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=warnings, metadata=metadata) + result = UnstructuredDocument(lines=lines, tables=scan_tables, attachments=attachments, warnings=warnings, metadata=metadata) return self._postprocess(result) def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index c927ab0e..ef47db28 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -136,9 +136,7 @@ def __extract(self, path: str, parameters: dict, warnings: List[str], tmp_dir: s mp_tables = self.table_recognizer.convert_to_multipages_tables(all_scan_tables, lines_with_meta=all_lines) all_lines = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=all_attached_images) - tables = [scan_table.to_table() for scan_table in mp_tables] - - return all_lines, tables, all_attached_images, document_metadata + return all_lines, mp_tables, all_attached_images, document_metadata def __save_gost_frame_boxes_to_json(self, first_page: Optional[int], last_page: Optional[int], page_count: int, path: str, tmp_dir: str) -> str: from joblib import Parallel, delayed diff --git a/dedoc/utils/parameter_utils.py b/dedoc/utils/parameter_utils.py index 3df9f6ca..993b6b8a 100644 --- a/dedoc/utils/parameter_utils.py +++ b/dedoc/utils/parameter_utils.py @@ -33,13 +33,6 @@ def get_param_document_type(parameters: Optional[dict]) -> str: return document_type -def get_param_orient_analysis_cells(parameters: Optional[dict]) -> bool: - if parameters is None: - return False - orient_analysis_cells = str(parameters.get("orient_analysis_cells", "False")).lower() == "true" - return orient_analysis_cells - - def get_param_with_attachments(parameters: Optional[dict]) -> bool: if parameters is None: return False @@ -80,16 +73,6 @@ def get_param_need_binarization(parameters: Optional[dict]) -> bool: return need_binarization -def get_param_orient_cell_angle(parameters: Optional[dict]) -> int: - if parameters is None: - return 90 - - orient_cell_angle = str(parameters.get("orient_cell_angle", "90")) - if orient_cell_angle == "": - orient_cell_angle = "90" - return int(orient_cell_angle) - - def get_param_is_one_column_document(parameters: Optional[dict]) -> Optional[bool]: if parameters is None: return None