TLDR-861 added BBox to Cell; rename TableHeaderExtractor; refactor table recognizer; added tests
ispras · Dec 13, 2024 · d5b1cc0 · d5b1cc0
1 parent 2a3d0e6
commit d5b1cc0
Show file tree

Hide file tree

Showing 12 changed files with 220 additions and 276 deletions.
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py
@@ -1,3 +1,4 @@
+import copy
 from typing import List, Optional
 
 from dedocutils.data_structures import BBox
@@ -9,64 +10,33 @@
 class Cell(CellWithMeta):
 
     @staticmethod
-    def copy_from(cell: "Cell",
-                  x_top_left: Optional[int] = None,
-                  x_bottom_right: Optional[int] = None,
-                  y_top_left: Optional[int] = None,
-                  y_bottom_right: Optional[int] = None) -> "Cell":
-        x_top_left = cell.x_top_left if x_top_left is None else x_top_left
-        x_bottom_right = cell.x_bottom_right if x_bottom_right is None else x_bottom_right
-        y_top_left = cell.y_top_left if y_top_left is None else y_top_left
-        y_bottom_right = cell.y_bottom_right if y_bottom_right is None else y_bottom_right
-
-        # TODO change x_top_left ... y_bottom_right to BBox
-
-        return Cell(x_top_left=x_top_left,
-                    x_bottom_right=x_bottom_right,
-                    y_top_left=y_top_left,
-                    y_bottom_right=y_bottom_right,
-                    id_con=cell.id_con,
-                    lines=cell.lines,
-                    colspan=cell.colspan,
-                    rowspan=cell.rowspan,
-                    invisible=cell.invisible,
-                    is_attribute=cell.is_attribute,
-                    is_attribute_required=cell.is_attribute_required,
-                    rotated_angle=cell.rotated_angle,
-                    uid=cell.uuid,
-                    contour_coord=cell.con_coord)
+    def copy_from(cell: "Cell", bbox: Optional[BBox] = None) -> "Cell":
+        copy_cell = copy.deepcopy(cell)
+        if bbox:
+            copy_cell.bbox = bbox
+
+        return copy_cell
 
     def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
         if self.lines:
             for line in self.lines:
                 line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
-        self.x_top_left += shift_x
-        self.x_bottom_right += shift_x
-        self.y_top_left += shift_y
-        self.y_bottom_right += shift_y
+
+        self.bbox.shift(shift_x=shift_x, shift_y=shift_y)
         if self.con_coord:
             self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)
 
-    def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
+    def __init__(self, bbox: BBox, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
                  is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = Optional[None],
                  contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
 
         import uuid
 
-        assert x_top_left <= x_bottom_right
-        assert y_top_left <= y_bottom_right
-
         self.lines = [] if lines is None else lines
         super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible)
 
-        # TODO change to BBox
-        self.x_top_left = x_top_left
-        self.x_bottom_right = x_bottom_right
-        self.y_top_left = y_top_left
-        self.y_bottom_right = y_bottom_right
-
+        self.bbox = bbox
         self.id_con = id_con
-
         self.is_attribute = is_attribute
         self.is_attribute_required = is_attribute_required
         self.rotated_angle = rotated_angle
@@ -96,11 +66,3 @@ def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_hei
 
     def __repr__(self) -> str:
         return self.__str__()
-
-    @property
-    def width(self) -> int:
-        return self.x_bottom_right - self.x_top_left
-
-    @property
-    def height(self) -> int:
-        return self.y_bottom_right - self.y_top_left
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py
@@ -1,6 +1,7 @@
 from typing import Dict, List, Optional, Tuple
 
 import numpy as np
+from dedocutils.data_structures import BBox
 
 from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
 from dedoc.utils.utils import flatten
@@ -55,25 +56,26 @@ def split(self, cells: List[List[Cell]]) -> List[List[Cell]]:
         for row_id, row in enumerate(result_matrix):
             for col_id, cell in enumerate(row):
                 if cell is None:
-                    result_matrix[row_id][col_id] = Cell(x_top_left=horizontal_borders[row_id],
-                                                         x_bottom_right=horizontal_borders[row_id + 1],
-                                                         y_top_left=vertical_borders[col_id],
-                                                         y_bottom_right=vertical_borders[col_id + 1])
+                    bbox = BBox(x_top_left=int(horizontal_borders[row_id]),
+                                y_top_left=int(vertical_borders[col_id]),
+                                width=int(horizontal_borders[row_id + 1] - horizontal_borders[row_id]),
+                                height=int(vertical_borders[col_id + 1] - vertical_borders[col_id]))
+                    result_matrix[row_id][col_id] = Cell(bbox=bbox)
         return result_matrix
 
     @staticmethod
     def __split_one_cell(cell: Cell, horizontal_borders: np.ndarray, vertical_borders: np.ndarray, result_matrix: List[List[Cell]]) -> None:
-        left_id, right_id = np.searchsorted(vertical_borders, [cell.x_top_left, cell.x_bottom_right])
-        top_id, bottom_id = np.searchsorted(horizontal_borders, [cell.y_top_left, cell.y_bottom_right])
+        left_id, right_id = np.searchsorted(vertical_borders, [cell.bbox.x_top_left, cell.bbox.x_bottom_right])
+        top_id, bottom_id = np.searchsorted(horizontal_borders, [cell.bbox.y_top_left, cell.bbox.y_bottom_right])
         colspan = right_id - left_id
         rowspan = bottom_id - top_id
         for row_id in range(top_id, bottom_id):
             for column_id in range(left_id, right_id):
-                new_cell = Cell.copy_from(cell,
-                                          x_top_left=vertical_borders[column_id],
-                                          x_bottom_right=vertical_borders[column_id + 1],
-                                          y_top_left=horizontal_borders[row_id],
-                                          y_bottom_right=horizontal_borders[row_id + 1])
+                bbox = BBox(x_top_left=int(vertical_borders[column_id]),
+                            y_top_left=int(horizontal_borders[row_id]),
+                            width=int(vertical_borders[column_id + 1] - vertical_borders[column_id]),
+                            height=int(horizontal_borders[row_id + 1] - horizontal_borders[row_id]))
+                new_cell = Cell.copy_from(cell, bbox)
                 new_cell.invisible = True
                 result_matrix[row_id][column_id] = new_cell
 
@@ -106,20 +108,21 @@ def _merge_close_borders(self, cells: List[List[Cell]]) -> List[List[Cell]]:
         @return: cells with merged borders
         """
         horizontal_borders, vertical_borders = self.__get_borders(cells)
-        eps_vertical = self.eps * min((cell.width for cell in flatten(cells)), default=0)
-        eps_horizontal = self.eps * min((cell.height for cell in flatten(cells)), default=0)
+        eps_vertical = self.eps * min((cell.bbox.width for cell in flatten(cells)), default=0)
+        eps_horizontal = self.eps * min((cell.bbox.height for cell in flatten(cells)), default=0)
         horizontal_dict = self.__get_border_dict(borders=horizontal_borders, threshold=eps_horizontal)
         vertical_dict = self.__get_border_dict(borders=vertical_borders, threshold=eps_vertical)
         result = []
         for row in cells:
             new_row = []
             for cell in row:
-                x_top_left = vertical_dict[cell.x_top_left]
-                x_bottom_right = vertical_dict[cell.x_bottom_right]
-                y_top_left = horizontal_dict[cell.y_top_left]
-                y_bottom_right = horizontal_dict[cell.y_bottom_right]
+                x_top_left = vertical_dict[cell.bbox.x_top_left]
+                x_bottom_right = vertical_dict[cell.bbox.x_bottom_right]
+                y_top_left = horizontal_dict[cell.bbox.y_top_left]
+                y_bottom_right = horizontal_dict[cell.bbox.y_bottom_right]
                 if y_top_left < y_bottom_right and x_top_left < x_bottom_right:
-                    new_cell = Cell.copy_from(cell, x_top_left=x_top_left, x_bottom_right=x_bottom_right, y_top_left=y_top_left, y_bottom_right=y_bottom_right)
+                    bbox = BBox(x_top_left=x_top_left, y_top_left=y_top_left, width=x_bottom_right - x_top_left, height=y_bottom_right - y_top_left)
+                    new_cell = Cell.copy_from(cell, bbox)
                     new_row.append(new_cell)
             result.append(new_row)
         return result
@@ -130,8 +133,8 @@ def __get_borders(cells: List[List[Cell]]) -> Tuple[List[int], List[int]]:
         vertical_borders = []
         for row in cells:
             for cell in row:
-                horizontal_borders.append(cell.y_top_left)
-                horizontal_borders.append(cell.y_bottom_right)
-                vertical_borders.append(cell.x_top_left)
-                vertical_borders.append(cell.x_bottom_right)
+                horizontal_borders.append(cell.bbox.y_top_left)
+                horizontal_borders.append(cell.bbox.y_bottom_right)
+                vertical_borders.append(cell.bbox.x_top_left)
+                vertical_borders.append(cell.bbox.x_bottom_right)
         return horizontal_borders, vertical_borders
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py
@@ -127,8 +127,8 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image
 
     # Get width of all union cell
     eps = len(union_cell)
-    x_left = union_cell[0].x_top_left + eps
-    x_right = union_cell[-1].x_bottom_right
+    x_left = union_cell[0].bbox.x_top_left + eps
+    x_right = union_cell[-1].bbox.x_bottom_right
     # get y coordinate from cell before union cell
     y_top_split = cell_splitter.con_coord.y_top_left
     y_bottom_split = cell_splitter.con_coord.y_top_left + cell_splitter.con_coord.height
@@ -141,8 +141,8 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image
     col_id = len(union_cell) - 1
     result_row = copy.deepcopy(union_cell)
     while col_id >= 0:
-        union_cell[col_id].y_top_left = y_top_split
-        union_cell[col_id].y_bottom_right = y_bottom_split
+        union_cell[col_id].bbox.y_top_left = y_top_split
+        union_cell[col_id].bbox.height = y_bottom_split - union_cell[col_id].bbox.y_top_left
 
         cell_image, padding_value = OCRCellExtractor.upscale(image[y_top_split:y_bottom_split, x_left:x_right])
         result_row[col_id].lines = __get_ocr_lines(cell_image, language, page_image=image,
@@ -163,10 +163,8 @@ def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarra
         text_line = OCRCellExtractor.get_line_with_meta("")
         for word in line.words:
             # do absolute coordinate on src_image (inside src_image)
-            word.bbox.y_top_left -= padding_cell_value
-            word.bbox.x_top_left -= padding_cell_value
-            word.bbox.y_top_left += cell_bbox.y_top_left
-            word.bbox.x_top_left += cell_bbox.x_top_left
+            word.bbox.shift(shift_x=-padding_cell_value, shift_y=-padding_cell_value)
+            word.bbox.shift(shift_x=cell_bbox.x_top_left, shift_y=cell_bbox.y_top_left)
 
             # add space between words
             if len(text_line) != 0:

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py
@@ -7,7 +7,7 @@
 from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.base_table_extractor import BaseTableExtractor
-from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor
+from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableHeaderExtractor
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import equal_with_eps
 
 
@@ -117,12 +117,12 @@ def __get_width_cell_wo_separating(row: List[Cell]) -> List[int]:
         end = None
         for cell_id, cell in enumerate(row):
             if prev_uid is None:
-                start = cell.x_top_left
+                start = cell.bbox.x_top_left
                 prev_uid = cell.uuid
             elif prev_uid != cell.uuid:
                 widths.append(end - start)
-                start = cell.x_top_left
-            end = cell.x_bottom_right
+                start = cell.bbox.x_top_left
+            end = cell.bbox.x_bottom_right
             if cell_id == len(row) - 1:
                 widths.append(end - start)
         return widths
@@ -154,16 +154,16 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool:
             return False
 
         # condition 2. Exclusion of the duplicated header (if any)
-        attr1 = TableAttributeExtractor.get_header_table(t1.cells)
-        attr2 = TableAttributeExtractor.get_header_table(t2.cells)
+        attr1 = TableHeaderExtractor.get_header_table(t1.cells)
+        attr2 = TableHeaderExtractor.get_header_table(t2.cells)
         t2_update = copy.deepcopy(t2)
-        if TableAttributeExtractor.is_equal_attributes(attr1, attr2):
+        if TableHeaderExtractor.is_equal_header(attr1, attr2):
             t2_update.cells = t2_update.cells[len(attr2):]
 
         if len(t2_update.cells) == 0 or len(t1.cells) == 0:
             return False
 
-        TableAttributeExtractor.clear_attributes(t2_update.cells)
+        TableHeaderExtractor.clear_attributes(t2_update.cells)
 
         # condition 3. Number of columns should be equal
         if len(t1.cells[-1]) != len(t2_update.cells[0]):

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
@@ -12,7 +12,7 @@
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.cell_splitter import CellSplitter
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.split_last_hor_union_cells import split_last_column
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.base_table_extractor import BaseTableExtractor
-from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor
+from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableHeaderExtractor
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_tables_by_contours
 
 
@@ -23,7 +23,7 @@ def __init__(self, *, config: dict, logger: logging.Logger) -> None:
 
         self.image = None
         self.page_number = 0
-        self.attribute_selector = TableAttributeExtractor(logger=self.logger)
+        self.table_header_selector = TableHeaderExtractor(logger=self.logger)
         self.count_vertical_extended = 0
         self.splitter = CellSplitter()
         self.table_options = TableTypeAdditionalOptions()
@@ -50,17 +50,6 @@ def extract_onepage_tables_from_image(self, image: np.ndarray, page_number: int,
                 location.bbox.rotate_coordinates(angle_rotate=-angle_rotate, image_shape=image.shape)
                 location.rotated_angle = angle_rotate
 
-        tables = self.__select_attributes_tables(tables=tables)
-
-        return tables
-
-    def __select_attributes_tables(self, tables: List[ScanTable]) -> List[ScanTable]:
-        for table in tables:
-            table = self.attribute_selector.set_attributes(table)
-
-            if self.config.get("debug_mode", False):
-                self._print_table_attr(table.cells)
-
         return tables
 
     def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable:
@@ -71,15 +60,12 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable:
         matrix = []
         line = []
         for cell in table_tree.children:
-            if len(line) != 0 and abs(cell.cell_box.y_top_left - line[-1].y_top_left) > 15:  # add eps
+            if len(line) != 0 and abs(cell.cell_box.y_top_left - line[-1].bbox.y_top_left) > 15:  # add eps
                 cpy_line = copy.deepcopy(line)
                 matrix.append(cpy_line)
                 line.clear()
 
-            cell_ = Cell(x_top_left=cell.cell_box.x_top_left,
-                         x_bottom_right=cell.cell_box.x_bottom_right,
-                         y_top_left=cell.cell_box.y_top_left,
-                         y_bottom_right=cell.cell_box.y_bottom_right,
+            cell_ = Cell(bbox=cell.cell_box,
                          id_con=cell.id_contours,
                          lines=cell.lines,
                          contour_coord=cell.cell_box)
@@ -88,7 +74,7 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable:
 
         # sorting column in each row
         for i in range(0, len(matrix)):
-            matrix[i] = sorted(matrix[i], key=lambda cell: cell.x_top_left, reverse=False)
+            matrix[i] = sorted(matrix[i], key=lambda cell: cell.bbox.x_top_left, reverse=False)
 
         matrix_table = ScanTable(cells=matrix, bbox=table_tree.cell_box, page_number=self.page_number)
 
@@ -125,4 +111,9 @@ def handle_cells(self, cells: List[List[Cell]], table_type: str = "") -> List[Li
         if self.table_options.split_last_column in table_type:
             cells = split_last_column(cells, language=self.language, image=self.image)
 
+        self.table_header_selector.set_header_cells(cells)
+
+        if self.config.get("debug_mode", False):
+            self._print_table_attr(cells)
+
         return cells