diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py index effd58c0..b2b28bf2 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/cell.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/cell.py @@ -1,3 +1,4 @@ +import copy from typing import List, Optional from dedocutils.data_structures import BBox @@ -9,64 +10,33 @@ class Cell(CellWithMeta): @staticmethod - def copy_from(cell: "Cell", - x_top_left: Optional[int] = None, - x_bottom_right: Optional[int] = None, - y_top_left: Optional[int] = None, - y_bottom_right: Optional[int] = None) -> "Cell": - x_top_left = cell.x_top_left if x_top_left is None else x_top_left - x_bottom_right = cell.x_bottom_right if x_bottom_right is None else x_bottom_right - y_top_left = cell.y_top_left if y_top_left is None else y_top_left - y_bottom_right = cell.y_bottom_right if y_bottom_right is None else y_bottom_right - - # TODO change x_top_left ... y_bottom_right to BBox - - return Cell(x_top_left=x_top_left, - x_bottom_right=x_bottom_right, - y_top_left=y_top_left, - y_bottom_right=y_bottom_right, - id_con=cell.id_con, - lines=cell.lines, - colspan=cell.colspan, - rowspan=cell.rowspan, - invisible=cell.invisible, - is_attribute=cell.is_attribute, - is_attribute_required=cell.is_attribute_required, - rotated_angle=cell.rotated_angle, - uid=cell.uuid, - contour_coord=cell.con_coord) + def copy_from(cell: "Cell", bbox: Optional[BBox] = None) -> "Cell": + copy_cell = copy.deepcopy(cell) + if bbox: + copy_cell.bbox = bbox + + return copy_cell def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None: if self.lines: for line in self.lines: line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height) - self.x_top_left += shift_x - self.x_bottom_right += shift_x - self.y_top_left += shift_y - self.y_bottom_right += shift_y + + self.bbox.shift(shift_x=shift_x, shift_y=shift_y) if self.con_coord: self.con_coord.shift(shift_x=shift_x, shift_y=shift_y) - def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None, + def __init__(self, bbox: BBox, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None, is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = Optional[None], contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None: import uuid - assert x_top_left <= x_bottom_right - assert y_top_left <= y_bottom_right - self.lines = [] if lines is None else lines super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible) - # TODO change to BBox - self.x_top_left = x_top_left - self.x_bottom_right = x_bottom_right - self.y_top_left = y_top_left - self.y_bottom_right = y_bottom_right - + self.bbox = bbox self.id_con = id_con - self.is_attribute = is_attribute self.is_attribute_required = is_attribute_required self.rotated_angle = rotated_angle @@ -96,11 +66,3 @@ def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_hei def __repr__(self) -> str: return self.__str__() - - @property - def width(self) -> int: - return self.x_bottom_right - self.x_top_left - - @property - def height(self) -> int: - return self.y_bottom_right - self.y_top_left diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py index 0e72128c..ab1c355d 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/cell_splitter.py @@ -1,6 +1,7 @@ from typing import Dict, List, Optional, Tuple import numpy as np +from dedocutils.data_structures import BBox from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.utils.utils import flatten @@ -55,25 +56,26 @@ def split(self, cells: List[List[Cell]]) -> List[List[Cell]]: for row_id, row in enumerate(result_matrix): for col_id, cell in enumerate(row): if cell is None: - result_matrix[row_id][col_id] = Cell(x_top_left=horizontal_borders[row_id], - x_bottom_right=horizontal_borders[row_id + 1], - y_top_left=vertical_borders[col_id], - y_bottom_right=vertical_borders[col_id + 1]) + bbox = BBox(x_top_left=int(horizontal_borders[row_id]), + y_top_left=int(vertical_borders[col_id]), + width=int(horizontal_borders[row_id + 1] - horizontal_borders[row_id]), + height=int(vertical_borders[col_id + 1] - vertical_borders[col_id])) + result_matrix[row_id][col_id] = Cell(bbox=bbox) return result_matrix @staticmethod def __split_one_cell(cell: Cell, horizontal_borders: np.ndarray, vertical_borders: np.ndarray, result_matrix: List[List[Cell]]) -> None: - left_id, right_id = np.searchsorted(vertical_borders, [cell.x_top_left, cell.x_bottom_right]) - top_id, bottom_id = np.searchsorted(horizontal_borders, [cell.y_top_left, cell.y_bottom_right]) + left_id, right_id = np.searchsorted(vertical_borders, [cell.bbox.x_top_left, cell.bbox.x_bottom_right]) + top_id, bottom_id = np.searchsorted(horizontal_borders, [cell.bbox.y_top_left, cell.bbox.y_bottom_right]) colspan = right_id - left_id rowspan = bottom_id - top_id for row_id in range(top_id, bottom_id): for column_id in range(left_id, right_id): - new_cell = Cell.copy_from(cell, - x_top_left=vertical_borders[column_id], - x_bottom_right=vertical_borders[column_id + 1], - y_top_left=horizontal_borders[row_id], - y_bottom_right=horizontal_borders[row_id + 1]) + bbox = BBox(x_top_left=int(vertical_borders[column_id]), + y_top_left=int(horizontal_borders[row_id]), + width=int(vertical_borders[column_id + 1] - vertical_borders[column_id]), + height=int(horizontal_borders[row_id + 1] - horizontal_borders[row_id])) + new_cell = Cell.copy_from(cell, bbox) new_cell.invisible = True result_matrix[row_id][column_id] = new_cell @@ -106,20 +108,21 @@ def _merge_close_borders(self, cells: List[List[Cell]]) -> List[List[Cell]]: @return: cells with merged borders """ horizontal_borders, vertical_borders = self.__get_borders(cells) - eps_vertical = self.eps * min((cell.width for cell in flatten(cells)), default=0) - eps_horizontal = self.eps * min((cell.height for cell in flatten(cells)), default=0) + eps_vertical = self.eps * min((cell.bbox.width for cell in flatten(cells)), default=0) + eps_horizontal = self.eps * min((cell.bbox.height for cell in flatten(cells)), default=0) horizontal_dict = self.__get_border_dict(borders=horizontal_borders, threshold=eps_horizontal) vertical_dict = self.__get_border_dict(borders=vertical_borders, threshold=eps_vertical) result = [] for row in cells: new_row = [] for cell in row: - x_top_left = vertical_dict[cell.x_top_left] - x_bottom_right = vertical_dict[cell.x_bottom_right] - y_top_left = horizontal_dict[cell.y_top_left] - y_bottom_right = horizontal_dict[cell.y_bottom_right] + x_top_left = vertical_dict[cell.bbox.x_top_left] + x_bottom_right = vertical_dict[cell.bbox.x_bottom_right] + y_top_left = horizontal_dict[cell.bbox.y_top_left] + y_bottom_right = horizontal_dict[cell.bbox.y_bottom_right] if y_top_left < y_bottom_right and x_top_left < x_bottom_right: - new_cell = Cell.copy_from(cell, x_top_left=x_top_left, x_bottom_right=x_bottom_right, y_top_left=y_top_left, y_bottom_right=y_bottom_right) + bbox = BBox(x_top_left=x_top_left, y_top_left=y_top_left, width=x_bottom_right - x_top_left, height=y_bottom_right - y_top_left) + new_cell = Cell.copy_from(cell, bbox) new_row.append(new_cell) result.append(new_row) return result @@ -130,8 +133,8 @@ def __get_borders(cells: List[List[Cell]]) -> Tuple[List[int], List[int]]: vertical_borders = [] for row in cells: for cell in row: - horizontal_borders.append(cell.y_top_left) - horizontal_borders.append(cell.y_bottom_right) - vertical_borders.append(cell.x_top_left) - vertical_borders.append(cell.x_bottom_right) + horizontal_borders.append(cell.bbox.y_top_left) + horizontal_borders.append(cell.bbox.y_bottom_right) + vertical_borders.append(cell.bbox.x_top_left) + vertical_borders.append(cell.bbox.x_bottom_right) return horizontal_borders, vertical_borders diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py index 0b14f034..e80769e0 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/split_last_hor_union_cells.py @@ -127,8 +127,8 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image # Get width of all union cell eps = len(union_cell) - x_left = union_cell[0].x_top_left + eps - x_right = union_cell[-1].x_bottom_right + x_left = union_cell[0].bbox.x_top_left + eps + x_right = union_cell[-1].bbox.x_bottom_right # get y coordinate from cell before union cell y_top_split = cell_splitter.con_coord.y_top_left y_bottom_split = cell_splitter.con_coord.y_top_left + cell_splitter.con_coord.height @@ -141,8 +141,8 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image col_id = len(union_cell) - 1 result_row = copy.deepcopy(union_cell) while col_id >= 0: - union_cell[col_id].y_top_left = y_top_split - union_cell[col_id].y_bottom_right = y_bottom_split + union_cell[col_id].bbox.y_top_left = y_top_split + union_cell[col_id].bbox.height = y_bottom_split - union_cell[col_id].bbox.y_top_left cell_image, padding_value = OCRCellExtractor.upscale(image[y_top_split:y_bottom_split, x_left:x_right]) result_row[col_id].lines = __get_ocr_lines(cell_image, language, page_image=image, @@ -163,10 +163,8 @@ def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarra text_line = OCRCellExtractor.get_line_with_meta("") for word in line.words: # do absolute coordinate on src_image (inside src_image) - word.bbox.y_top_left -= padding_cell_value - word.bbox.x_top_left -= padding_cell_value - word.bbox.y_top_left += cell_bbox.y_top_left - word.bbox.x_top_left += cell_bbox.x_top_left + word.bbox.shift(shift_x=-padding_cell_value, shift_y=-padding_cell_value) + word.bbox.shift(shift_x=cell_bbox.x_top_left, shift_y=cell_bbox.y_top_left) # add space between words if len(text_line) != 0: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py index 5cff352d..8d74829d 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py @@ -7,7 +7,7 @@ from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.base_table_extractor import BaseTableExtractor -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableHeaderExtractor from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import equal_with_eps @@ -117,12 +117,12 @@ def __get_width_cell_wo_separating(row: List[Cell]) -> List[int]: end = None for cell_id, cell in enumerate(row): if prev_uid is None: - start = cell.x_top_left + start = cell.bbox.x_top_left prev_uid = cell.uuid elif prev_uid != cell.uuid: widths.append(end - start) - start = cell.x_top_left - end = cell.x_bottom_right + start = cell.bbox.x_top_left + end = cell.bbox.x_bottom_right if cell_id == len(row) - 1: widths.append(end - start) return widths @@ -154,16 +154,16 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool: return False # condition 2. Exclusion of the duplicated header (if any) - attr1 = TableAttributeExtractor.get_header_table(t1.cells) - attr2 = TableAttributeExtractor.get_header_table(t2.cells) + attr1 = TableHeaderExtractor.get_header_table(t1.cells) + attr2 = TableHeaderExtractor.get_header_table(t2.cells) t2_update = copy.deepcopy(t2) - if TableAttributeExtractor.is_equal_attributes(attr1, attr2): + if TableHeaderExtractor.is_equal_header(attr1, attr2): t2_update.cells = t2_update.cells[len(attr2):] if len(t2_update.cells) == 0 or len(t1.cells) == 0: return False - TableAttributeExtractor.clear_attributes(t2_update.cells) + TableHeaderExtractor.clear_attributes(t2_update.cells) # condition 3. Number of columns should be equal if len(t1.cells[-1]) != len(t2_update.cells[0]): diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py index 2a05a03c..f345f1e3 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py @@ -12,7 +12,7 @@ from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.cell_splitter import CellSplitter from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.split_last_hor_union_cells import split_last_column from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.base_table_extractor import BaseTableExtractor -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableHeaderExtractor from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_tables_by_contours @@ -23,7 +23,7 @@ def __init__(self, *, config: dict, logger: logging.Logger) -> None: self.image = None self.page_number = 0 - self.attribute_selector = TableAttributeExtractor(logger=self.logger) + self.table_header_selector = TableHeaderExtractor(logger=self.logger) self.count_vertical_extended = 0 self.splitter = CellSplitter() self.table_options = TableTypeAdditionalOptions() @@ -50,17 +50,6 @@ def extract_onepage_tables_from_image(self, image: np.ndarray, page_number: int, location.bbox.rotate_coordinates(angle_rotate=-angle_rotate, image_shape=image.shape) location.rotated_angle = angle_rotate - tables = self.__select_attributes_tables(tables=tables) - - return tables - - def __select_attributes_tables(self, tables: List[ScanTable]) -> List[ScanTable]: - for table in tables: - table = self.attribute_selector.set_attributes(table) - - if self.config.get("debug_mode", False): - self._print_table_attr(table.cells) - return tables def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable: @@ -71,15 +60,12 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable: matrix = [] line = [] for cell in table_tree.children: - if len(line) != 0 and abs(cell.cell_box.y_top_left - line[-1].y_top_left) > 15: # add eps + if len(line) != 0 and abs(cell.cell_box.y_top_left - line[-1].bbox.y_top_left) > 15: # add eps cpy_line = copy.deepcopy(line) matrix.append(cpy_line) line.clear() - cell_ = Cell(x_top_left=cell.cell_box.x_top_left, - x_bottom_right=cell.cell_box.x_bottom_right, - y_top_left=cell.cell_box.y_top_left, - y_bottom_right=cell.cell_box.y_bottom_right, + cell_ = Cell(bbox=cell.cell_box, id_con=cell.id_contours, lines=cell.lines, contour_coord=cell.cell_box) @@ -88,7 +74,7 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable: # sorting column in each row for i in range(0, len(matrix)): - matrix[i] = sorted(matrix[i], key=lambda cell: cell.x_top_left, reverse=False) + matrix[i] = sorted(matrix[i], key=lambda cell: cell.bbox.x_top_left, reverse=False) matrix_table = ScanTable(cells=matrix, bbox=table_tree.cell_box, page_number=self.page_number) @@ -125,4 +111,9 @@ def handle_cells(self, cells: List[List[Cell]], table_type: str = "") -> List[Li if self.table_options.split_last_column in table_type: cells = split_last_column(cells, language=self.language, image=self.image) + self.table_header_selector.set_header_cells(cells) + + if self.config.get("debug_mode", False): + self._print_table_attr(cells) + return cells diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py index fbca8cd0..e25dbd2d 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py @@ -2,31 +2,31 @@ from typing import List from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell -from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import similarity -class TableAttributeExtractor: +class TableHeaderExtractor: """ - Class finds and labels "is_attributes=True" attribute cells into ScanTable + Class finds and labels "is_attributes=True" attribute (header) cells into ScanTable + """ def __init__(self, logger: logging.Logger) -> None: self.logger = logger - def set_attributes(self, scan_table: ScanTable) -> ScanTable: - return self.__set_attributes_for_type_top(scan_table) + def set_header_cells(self, cells: List[List[Cell]]) -> None: + self.__set_attributes_for_type_top(cells) @staticmethod - def is_equal_attributes(attr1: List[List[Cell]], attr2: List[List[Cell]], thr_similarity: int = 0.8) -> bool: - if len(attr1) != len(attr2): + def is_equal_header(header_1: List[List[Cell]], header_2: List[List[Cell]], thr_similarity: int = 0.8) -> bool: + if len(header_1) != len(header_2): return False - for i in range(len(attr1)): - if len(attr1[i]) != len(attr2[i]): + for i in range(len(header_1)): + if len(header_1[i]) != len(header_2[i]): return False - for j in range(len(attr1[i])): - if similarity(attr1[i][j].get_text(), attr2[i][j].get_text()) < thr_similarity: + for j in range(len(header_1[i])): + if similarity(header_1[i][j].get_text(), header_2[i][j].get_text()) < thr_similarity: return False return True @@ -44,7 +44,7 @@ def check_have_attributes(matrix_table: List[List[Cell]]) -> bool: @staticmethod def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]: - if not TableAttributeExtractor.check_have_attributes(matrix_table): + if not TableHeaderExtractor.check_have_attributes(matrix_table): return matrix_table[:1] header_rows = len(matrix_table) @@ -58,7 +58,7 @@ def get_header_table(matrix_table: List[List[Cell]]) -> List[List[Cell]]: @staticmethod def clear_attributes(matrix_table: List[List[Cell]]) -> None: - if not TableAttributeExtractor.check_have_attributes(matrix_table): + if not TableHeaderExtractor.check_have_attributes(matrix_table): return for row in matrix_table: @@ -74,15 +74,15 @@ def __is_indexable_column(self, matrix_table: List[List[Cell]], column_id: int, return True return False - def __set_attributes_for_type_top(self, scan_table: ScanTable) -> ScanTable: - vertical_union_columns = self.__analyze_attr_for_vertical_union_columns(scan_table) - horizontal_union_rows = self.__analyze_attr_for_horizontal_union_raws(scan_table) + def __set_attributes_for_type_top(self, cells: List[List[Cell]]) -> List[List[Cell]]: + vertical_union_columns = self.__analyze_attr_for_vertical_union_columns(cells) + horizontal_union_rows = self.__analyze_attr_for_horizontal_union_raws(cells) # simple table if (0 not in horizontal_union_rows) and len(vertical_union_columns) == 0: - self.__analyze_attr_for_simple_table(scan_table) + self.__analyze_attr_for_simple_table(cells) - return scan_table + return cells def __is_empty_column(self, matrix_table: List[List[Cell]], column_id: int) -> bool: all_empty = True @@ -102,72 +102,72 @@ def __is_empty_row(self, matrix_table: List[List[Cell]], row_index: int) -> bool break return all_empty - def __analyze_attr_for_vertical_union_columns(self, scan_table: ScanTable) -> List[int]: + def __analyze_attr_for_vertical_union_columns(self, cells: List[List[Cell]]) -> List[int]: vertical_union_columns = [] - if len(vertical_union_columns) != 0 and len(scan_table.cells) > 1: + if len(vertical_union_columns) != 0 and len(cells) > 1: self.logger.debug("ATTR_TYPE: vertical union table") row_max_attr = 1 - i = 1 # Установка атрибутов таблицы for i in range(0, row_max_attr): - for j in range(0, len(scan_table.cells[i])): - scan_table.cells[i][j].is_attribute = True + for j in range(0, len(cells[i])): + cells[i][j].is_attribute = True + # Установка обязательных атрибутов - scan_table.cells[0][0].is_attribute_required = True - for j in range(1, len(scan_table.cells[0])): + cells[0][0].is_attribute_required = True + for j in range(1, len(cells[0])): is_attribute_required = True if is_attribute_required: - scan_table.cells[0][j].is_attribute_required = True + cells[0][j].is_attribute_required = True return vertical_union_columns - def __analyze_attr_for_horizontal_union_raws(self, scan_table: ScanTable) -> List[int]: + def __analyze_attr_for_horizontal_union_raws(self, cells: List[List[Cell]]) -> List[int]: horizontal_union_rows = [] union_first = False - for i in range(0, len(scan_table.cells)): + for i in range(0, len(cells)): if len(horizontal_union_rows) > 0 and i not in horizontal_union_rows: horizontal_union_rows.append(i) - if not self.__is_empty_row(scan_table.cells, i): + if not self.__is_empty_row(cells, i): break if union_first and len(horizontal_union_rows) != 0: self.logger.debug("ATTR_TYPE: horizontal_union_rows") for i in range(0, len(horizontal_union_rows)): - for j in range(0, len(scan_table.cells[i])): - scan_table.cells[i][j].is_attribute = True - scan_table.cells[0][0].is_attribute_required = True + for j in range(0, len(cells[i])): + cells[i][j].is_attribute = True + cells[0][0].is_attribute_required = True first_required_column = 0 # search indexable_column # один один столбец должен быть (0) - нумерованным, # один (1) - с обязательными поляями, один (2) - с необязательными # поэтому len(matrix_table) > first_required_column + 2 if len(horizontal_union_rows) > 0 and \ - self.__is_indexable_column(scan_table.cells, first_required_column, max_raw_of_search=horizontal_union_rows[-1]) \ - and len(scan_table.cells) > first_required_column + 2: - scan_table.cells[0][first_required_column + 1].is_attribute_required = True + self.__is_indexable_column(cells, first_required_column, max_raw_of_search=horizontal_union_rows[-1]) \ + and len(cells) > first_required_column + 2: + cells[0][first_required_column + 1].is_attribute_required = True # Полностью пустые строки не могут быть атрибутами (не информативны) # Перенос атрибутов на след строку таблицы index_empty_rows = horizontal_union_rows[-1] - if self.__is_empty_row(scan_table.cells, index_empty_rows) and len(scan_table.cells) != index_empty_rows + 1: + if self.__is_empty_row(cells, index_empty_rows) and len(cells) != index_empty_rows + 1: horizontal_union_rows.append(index_empty_rows + 1) - for j in range(0, len(scan_table.cells[index_empty_rows + 1])): - scan_table.cells[index_empty_rows + 1][j].is_attribute = True + for j in range(0, len(cells[index_empty_rows + 1])): + cells[index_empty_rows + 1][j].is_attribute = True self.logger.debug("detect empty attributes row") return horizontal_union_rows - def __analyze_attr_for_simple_table(self, scan_table: ScanTable) -> None: + def __analyze_attr_for_simple_table(self, cells: List[List[Cell]]) -> None: self.logger.debug("ATTR_TYPE: simple table") - for j in range(0, len(scan_table.cells[0])): - scan_table.cells[0][j].is_attribute = True + for j in range(0, len(cells[0])): + cells[0][j].is_attribute = True # set first required column j = 0 first_required_column = j - while j < len(scan_table.cells[0]): - if not self.__is_empty_column(scan_table.cells, j): - scan_table.cells[0][j].is_attribute_required = True + while j < len(cells[0]): + if not self.__is_empty_column(cells, j): + cells[0][j].is_attribute_required = True first_required_column = j break j += 1 @@ -175,5 +175,5 @@ def __analyze_attr_for_simple_table(self, scan_table: ScanTable) -> None: # один один столбец должен быть (0) - нумерованным, # один (1) - с обязательными поляями, один (2) - с необязательными # поэтому len(matrix_table) > first_required_column + 2 - if self.__is_indexable_column(scan_table.cells, first_required_column, 0) and len(scan_table.cells) > first_required_column + 2: - scan_table.cells[0][first_required_column + 1].is_attribute_required = True + if self.__is_indexable_column(cells, first_required_column, 0) and len(cells) > first_required_column + 2: + cells[0][first_required_column + 1].is_attribute_required = True diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py index 3d2f89ea..eb07732d 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py @@ -36,8 +36,8 @@ def convert_to_multipages_tables(self, all_single_tables: List[ScanTable], lines def recognize_tables_from_image(self, image: np.ndarray, page_number: int, language: str, table_type: str = "") -> Tuple[np.ndarray, List[ScanTable]]: self.logger.debug(f"Page {page_number}") try: - cleaned_image, matrix_tables = self.__rec_tables_from_img(image, page_num=page_number, language=language, table_type=table_type) - return cleaned_image, matrix_tables + cleaned_image, scan_tables = self.__rec_tables_from_img(image, page_num=page_number, language=language, table_type=table_type) + return cleaned_image, scan_tables except Exception as ex: logging.warning(ex) if self.config.get("debug_mode", False): @@ -113,7 +113,7 @@ def __if_not_table(self, table: ScanTable, image: np.ndarray) -> bool: cells_area = 0 for row in table.cells: for cell in row: - cells_area += cell.width * cell.height + cells_area += cell.bbox.width * cell.bbox.height ratio = cells_area / table_area res = (white_mean < 0.5) or (black_mean > 0.3) or (std < 30) or (mean < 150) or (mean < 200 and std < 80) or ratio < 0.65 diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py index 4b7211b6..c98d71a5 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py @@ -4,6 +4,7 @@ from typing import List, Tuple import cv2 +from dedocutils.data_structures import BBox from dedoc.config import get_config from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell @@ -12,7 +13,7 @@ def _create_cell(c: str, text_cells: list) -> Cell: - cell = Cell(x_bottom_right=-1, x_top_left=-1, y_top_left=-1, y_bottom_right=-1) + cell = Cell(BBox(x_top_left=-1, y_top_left=-1, width=0, height=0)) if "a" in c: cell.is_attribute = True # loading cell text @@ -81,8 +82,15 @@ def draw_recognized_cell(tables: List[ScanTable], path_image: str, path_save: st cv2.rectangle(img, (bbox.x_top_left, bbox.y_top_left), (bbox.width, bbox.height), blue_color, 6) for i in range(0, len(table)): for j in range(0, len(table[i])): - cv2.rectangle(img, (table[i][j].x_top_left, table[i][j].y_top_left), (table[i][j].x_bottom_right, table[i][j].y_bottom_right), red_color, 4) - cv2.putText(img, str(table[i][j].id_con), (table[i][j].x_top_left, table[i][j].y_bottom_right), cv2.FONT_HERSHEY_PLAIN, 4, green_color) + cv2.rectangle(img, + (table[i][j].bbox.x_top_left, table[i][j].bbox.y_top_left), + (table[i][j].bbox.x_bottom_right, table[i][j].bbox.y_bottom_right), + red_color, 4 + ) + cv2.putText(img, str(table[i][j].id_con), + (table[i][j].bbox.x_top_left, table[i][j].bbox.y_bottom_right), + cv2.FONT_HERSHEY_PLAIN, 4, green_color + ) cv2.imwrite(path_save, img) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py index 19674772..80ac01e7 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/utils.py @@ -1,5 +1,3 @@ -import difflib - import numpy as np @@ -20,24 +18,9 @@ def get_highest_pixel_frequency(image: np.ndarray) -> int: def similarity(s1: str, s2: str) -> float: """string similarity""" + import difflib + normalized1 = s1.lower() normalized2 = s2.lower() matcher = difflib.SequenceMatcher(None, normalized1, normalized2) return matcher.ratio() - - -MINIMAL_CELL_CNT_LINE = 7 -MINIMAL_CELL_AVG_LENGTH_LINE = 10 - - -def detect_diff_orient(cell_text: str) -> bool: - # 1 - разбиваем на строки длины которых состоят хотя бы из одного символа - parts = cell_text.split("\n") - parts = [p for p in parts if len(p) > 0] - - # 2 - подсчитываем среднюю длину строк ячейки - len_parts = [len(p) for p in parts] - avg_len_part = np.average(len_parts) - - # Эвристика: считаем сто ячейка повернута если у нас большое количество строк и строки короткие - return len(parts) > MINIMAL_CELL_CNT_LINE and avg_len_part < MINIMAL_CELL_AVG_LENGTH_LINE diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index ef47db28..cce14d01 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -15,7 +15,7 @@ from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableHeaderExtractor class PdfTabbyReader(PdfBaseReader): @@ -38,7 +38,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None: self.jar_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "tabbypdf", "jars")) self.java_not_found_error = "`java` command is not found from this Python process. Please ensure Java is installed and PATH is set for `java`" self.default_config = {"JAR_PATH": os.path.join(self.jar_dir, self.jar_name)} - self.attribute_selector = TableAttributeExtractor(logger=self.logger) + self.table_header_selector = TableHeaderExtractor(logger=self.logger) self.table_extractor = OnePageTableExtractor(config=config, logger=self.logger) def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: @@ -171,7 +171,7 @@ def __get_tables(self, page: dict) -> List[ScanTable]: for table in page["tables"]: table_bbox = BBox(x_top_left=table["x_top_left"], y_top_left=table["y_top_left"], width=table["width"], height=table["height"]) - order = table["order"] # TODO add table order into TableMetadata + order = table["order"] rows = table["rows"] cell_properties = table["cell_properties"] assert len(rows) == len(cell_properties) @@ -188,30 +188,27 @@ def __get_tables(self, page: dict) -> List[ScanTable]: for c in cell_blocks: cell_bbox = BBox(x_top_left=int(c["x_top_left"]), y_top_left=int(c["y_top_left"]), width=int(c["width"]), height=int(c["height"])) annotations.append(BBoxAnnotation(c["start"], c["end"], cell_bbox, page_width=page_width, page_height=page_height)) - """ - TODO: change to Cell class after tabby can return cell coordinates. Then set type Cell in class "ScanTable" - https://jira.intra.ispras.ru/browse/TLDR-851 - """ + current_cell_properties = cell_properties[num_row][num_col] + bbox = BBox(x_top_left=int(current_cell_properties["x_top_left"]), + y_top_left=int(current_cell_properties["y_top_left"]), + width=int(current_cell_properties["width"]), + height=int(current_cell_properties["height"])) + result_row.append(Cell( + bbox=bbox, lines=[LineWithMeta(line=cell["text"], metadata=LineMetadata(page_id=page_number, line_id=0), annotations=annotations)], colspan=current_cell_properties["col_span"], rowspan=current_cell_properties["row_span"], - invisible=bool(current_cell_properties["invisible"]), - x_top_left=int(current_cell_properties["x_top_left"]), - x_bottom_right=int(current_cell_properties["x_top_left"]) + int(current_cell_properties["width"]), - y_top_left=int(current_cell_properties["y_top_left"]), - y_bottom_right=int(current_cell_properties["y_top_left"]) + int(current_cell_properties["height"]) + invisible=bool(current_cell_properties["invisible"]) )) cells.append(result_row) try: cells = self.table_extractor.handle_cells(cells) - table = ScanTable(page_number=page_number, cells=cells, bbox=table_bbox, order=order) - table = self.attribute_selector.set_attributes(table) - scan_tables.append(table) + scan_tables.append(ScanTable(page_number=page_number, cells=cells, bbox=table_bbox, order=order)) except Exception as ex: - self.logger.warning(f"Warning: unrecognized table into page {self.page_number}. {ex}") + self.logger.warning(f"Warning: unrecognized table on page {self.page_number}. {ex}") if self.config.get("debug_mode", False): raise ex diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py index b2ff91a6..959e15ca 100644 --- a/tests/api_tests/test_api_format_pdf_tabby_reader.py +++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py @@ -182,7 +182,7 @@ def test_pdf_with_tables(self) -> None: table = tables[3]["cells"] self.assertListEqual(["", "2016", "2017", "2018", "2019"], self._get_text_of_row(table[0])) - self.assertListEqual(["", "Прогноз", "Прогноз бюджета"], self._get_text_of_row(table[1])) + self.assertListEqual(["", "Прогноз", "Прогноз бюджета", "Прогноз бюджета", "Прогноз бюджета"], self._get_text_of_row(table[1])) self.assertListEqual(["Ненефтегазов\nые доходы", "10,4", "9,6", "9,6", "9,6"], self._get_text_of_row(table[21])) self.assertListEqual(["Сальдо\nбюджета", "-3,7", "-3,2", "-2,2", "-1,2"], self._get_text_of_row(table[22])) @@ -227,7 +227,7 @@ def test_tables_with_merged_cells(self) -> None: result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby")) table = result["content"]["tables"][0]["cells"] - hidden_cells_big_table_with_colspan = [[(1, 0), 10], [(5, 1), 5]] + hidden_cells_big_table_with_colspan = [[(1, 0), 10], [(5, 5), 5]] for (i, j), k in hidden_cells_big_table_with_colspan: self.assertFalse(table[i][j]["invisible"]) diff --git a/tests/unit_tests/test_module_cell_splitter.py b/tests/unit_tests/test_module_cell_splitter.py index ad48952a..36113dbc 100644 --- a/tests/unit_tests/test_module_cell_splitter.py +++ b/tests/unit_tests/test_module_cell_splitter.py @@ -1,5 +1,7 @@ import unittest +from dedocutils.data_structures import BBox + from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.cell_splitter import CellSplitter @@ -10,42 +12,42 @@ class TestCellSplitter(unittest.TestCase): def test_merge_close_borders(self) -> None: cells = [ [ - Cell(x_top_left=0, y_top_left=0, x_bottom_right=50, y_bottom_right=30), - Cell(x_top_left=51, y_top_left=2, x_bottom_right=90, y_bottom_right=29) + Cell(BBox(x_top_left=0, y_top_left=0, width=50, height=30)), + Cell(BBox(x_top_left=51, y_top_left=2, width=39, height=27)) ], [ - Cell(x_top_left=0, y_top_left=31, x_bottom_right=50, y_bottom_right=50), - Cell(x_top_left=51, y_top_left=31, x_bottom_right=91, y_bottom_right=50) + Cell(BBox(x_top_left=0, y_top_left=31, width=50, height=19)), + Cell(BBox(x_top_left=51, y_top_left=31, width=40, height=19)) ] ] cells_merged = self.splitter._merge_close_borders(cells) - self.assertEqual(0, cells_merged[0][0].x_top_left) - self.assertEqual(0, cells_merged[0][0].y_top_left) - self.assertEqual(50, cells_merged[0][0].x_bottom_right) - self.assertEqual(29, cells_merged[0][0].y_bottom_right) - - self.assertEqual(50, cells_merged[0][1].x_top_left) - self.assertEqual(0, cells_merged[0][1].y_top_left) - self.assertEqual(90, cells_merged[0][1].x_bottom_right) - self.assertEqual(29, cells_merged[0][1].y_bottom_right) - - self.assertEqual(0, cells_merged[1][0].x_top_left) - self.assertEqual(29, cells_merged[1][0].y_top_left) - self.assertEqual(50, cells_merged[1][0].x_bottom_right) - self.assertEqual(50, cells_merged[1][0].y_bottom_right) - - self.assertEqual(50, cells_merged[1][1].x_top_left) - self.assertEqual(29, cells_merged[1][1].y_top_left) - self.assertEqual(90, cells_merged[1][1].x_bottom_right) - self.assertEqual(50, cells_merged[1][1].y_bottom_right) + self.assertEqual(0, cells_merged[0][0].bbox.x_top_left) + self.assertEqual(0, cells_merged[0][0].bbox.y_top_left) + self.assertEqual(50, cells_merged[0][0].bbox.x_bottom_right) + self.assertEqual(29, cells_merged[0][0].bbox.y_bottom_right) + + self.assertEqual(50, cells_merged[0][1].bbox.x_top_left) + self.assertEqual(0, cells_merged[0][1].bbox.y_top_left) + self.assertEqual(90, cells_merged[0][1].bbox.x_bottom_right) + self.assertEqual(29, cells_merged[0][1].bbox.y_bottom_right) + + self.assertEqual(0, cells_merged[1][0].bbox.x_top_left) + self.assertEqual(29, cells_merged[1][0].bbox.y_top_left) + self.assertEqual(50, cells_merged[1][0].bbox.x_bottom_right) + self.assertEqual(50, cells_merged[1][0].bbox.y_bottom_right) + + self.assertEqual(50, cells_merged[1][1].bbox.x_top_left) + self.assertEqual(29, cells_merged[1][1].bbox.y_top_left) + self.assertEqual(90, cells_merged[1][1].bbox.x_bottom_right) + self.assertEqual(50, cells_merged[1][1].bbox.y_bottom_right) def test_merge_close_borders_one_cell(self) -> None: - cells = [[Cell(x_top_left=0, y_top_left=0, x_bottom_right=50, y_bottom_right=30)]] + cells = [[Cell(BBox(x_top_left=0, y_top_left=0, width=50, height=30))]] cells_merged = self.splitter._merge_close_borders(cells) - self.assertEqual(0, cells_merged[0][0].x_top_left) - self.assertEqual(0, cells_merged[0][0].y_top_left) - self.assertEqual(50, cells_merged[0][0].x_bottom_right) - self.assertEqual(30, cells_merged[0][0].y_bottom_right) + self.assertEqual(0, cells_merged[0][0].bbox.x_top_left) + self.assertEqual(0, cells_merged[0][0].bbox.y_top_left) + self.assertEqual(50, cells_merged[0][0].bbox.x_bottom_right) + self.assertEqual(30, cells_merged[0][0].bbox.y_bottom_right) def test_merge_zero_cells(self) -> None: cells = [[]] @@ -58,24 +60,24 @@ def test_split_zero_cells(self) -> None: self.assertListEqual([[]], matrix) def test_split_one_cell(self) -> None: - cells = [[Cell(x_top_left=0, y_top_left=0, x_bottom_right=10, y_bottom_right=15)]] + cells = [[Cell(BBox(x_top_left=0, y_top_left=0, width=10, height=15))]] matrix = self.splitter.split(cells=cells) self.assertEqual(1, len(matrix)) self.assertEqual(1, len(matrix[0])) new_cell = matrix[0][0] - self.assertEqual(0, new_cell.x_top_left) - self.assertEqual(0, new_cell.y_top_left) - self.assertEqual(10, new_cell.x_bottom_right) - self.assertEqual(15, new_cell.y_bottom_right) + self.assertEqual(0, new_cell.bbox.x_top_left) + self.assertEqual(0, new_cell.bbox.y_top_left) + self.assertEqual(10, new_cell.bbox.x_bottom_right) + self.assertEqual(15, new_cell.bbox.y_bottom_right) def test_horizontal_split(self) -> None: cells = [ [ - Cell(x_top_left=0, y_top_left=0, x_bottom_right=3, y_bottom_right=5), - Cell(x_top_left=3, y_top_left=0, x_bottom_right=7, y_bottom_right=3), + Cell(BBox(x_top_left=0, y_top_left=0, width=3, height=5)), + Cell(BBox(x_top_left=3, y_top_left=0, width=4, height=3)), ], [ - Cell(x_top_left=3, y_top_left=3, x_bottom_right=7, y_bottom_right=5), + Cell(BBox(x_top_left=3, y_top_left=3, width=4, height=2)), ] ] matrix = self.splitter.split(cells) @@ -83,34 +85,34 @@ def test_horizontal_split(self) -> None: self.assertEqual(2, len(matrix[0])) self.assertEqual(2, len(matrix[1])) [cell_a, cell_b], [cell_c, cell_d] = matrix - self.assertEqual(0, cell_a.x_top_left) - self.assertEqual(0, cell_a.y_top_left) - self.assertEqual(3, cell_a.x_bottom_right) - self.assertEqual(3, cell_a.y_bottom_right) - - self.assertEqual(3, cell_b.x_top_left) - self.assertEqual(0, cell_b.y_top_left) - self.assertEqual(7, cell_b.x_bottom_right) - self.assertEqual(3, cell_b.y_bottom_right) - - self.assertEqual(0, cell_c.x_top_left) - self.assertEqual(3, cell_c.y_top_left) - self.assertEqual(3, cell_c.x_bottom_right) - self.assertEqual(5, cell_c.y_bottom_right) - - self.assertEqual(3, cell_d.x_top_left) - self.assertEqual(3, cell_d.y_top_left) - self.assertEqual(7, cell_d.x_bottom_right) - self.assertEqual(5, cell_d.y_bottom_right) + self.assertEqual(0, cell_a.bbox.x_top_left) + self.assertEqual(0, cell_a.bbox.y_top_left) + self.assertEqual(3, cell_a.bbox.x_bottom_right) + self.assertEqual(3, cell_a.bbox.y_bottom_right) + + self.assertEqual(3, cell_b.bbox.x_top_left) + self.assertEqual(0, cell_b.bbox.y_top_left) + self.assertEqual(7, cell_b.bbox.x_bottom_right) + self.assertEqual(3, cell_b.bbox.y_bottom_right) + + self.assertEqual(0, cell_c.bbox.x_top_left) + self.assertEqual(3, cell_c.bbox.y_top_left) + self.assertEqual(3, cell_c.bbox.x_bottom_right) + self.assertEqual(5, cell_c.bbox.y_bottom_right) + + self.assertEqual(3, cell_d.bbox.x_top_left) + self.assertEqual(3, cell_d.bbox.y_top_left) + self.assertEqual(7, cell_d.bbox.x_bottom_right) + self.assertEqual(5, cell_d.bbox.y_bottom_right) def test_vertical_split(self) -> None: cells = [ [ - Cell(x_top_left=0, y_top_left=0, x_bottom_right=8, y_bottom_right=2), + Cell(BBox(x_top_left=0, y_top_left=0, width=8, height=2)), ], [ - Cell(x_top_left=0, y_top_left=2, x_bottom_right=5, y_bottom_right=5), - Cell(x_top_left=5, y_top_left=2, x_bottom_right=8, y_bottom_right=5), + Cell(BBox(x_top_left=0, y_top_left=2, width=5, height=3)), + Cell(BBox(x_top_left=5, y_top_left=2, width=3, height=3)), ] ] matrix = self.splitter.split(cells) @@ -118,35 +120,35 @@ def test_vertical_split(self) -> None: self.assertEqual(2, len(matrix[0])) self.assertEqual(2, len(matrix[1])) [cell_a, cell_b], [cell_c, cell_d] = matrix - self.assertEqual(0, cell_a.x_top_left) - self.assertEqual(0, cell_a.y_top_left) - self.assertEqual(5, cell_a.x_bottom_right) - self.assertEqual(2, cell_a.y_bottom_right) - - self.assertEqual(5, cell_b.x_top_left) - self.assertEqual(0, cell_b.y_top_left) - self.assertEqual(8, cell_b.x_bottom_right) - self.assertEqual(2, cell_b.y_bottom_right) - - self.assertEqual(0, cell_c.x_top_left) - self.assertEqual(2, cell_c.y_top_left) - self.assertEqual(5, cell_c.x_bottom_right) - self.assertEqual(5, cell_c.y_bottom_right) - - self.assertEqual(5, cell_d.x_top_left) - self.assertEqual(2, cell_d.y_top_left) - self.assertEqual(8, cell_d.x_bottom_right) - self.assertEqual(5, cell_d.y_bottom_right) + self.assertEqual(0, cell_a.bbox.x_top_left) + self.assertEqual(0, cell_a.bbox.y_top_left) + self.assertEqual(5, cell_a.bbox.x_bottom_right) + self.assertEqual(2, cell_a.bbox.y_bottom_right) + + self.assertEqual(5, cell_b.bbox.x_top_left) + self.assertEqual(0, cell_b.bbox.y_top_left) + self.assertEqual(8, cell_b.bbox.x_bottom_right) + self.assertEqual(2, cell_b.bbox.y_bottom_right) + + self.assertEqual(0, cell_c.bbox.x_top_left) + self.assertEqual(2, cell_c.bbox.y_top_left) + self.assertEqual(5, cell_c.bbox.x_bottom_right) + self.assertEqual(5, cell_c.bbox.y_bottom_right) + + self.assertEqual(5, cell_d.bbox.x_top_left) + self.assertEqual(2, cell_d.bbox.y_top_left) + self.assertEqual(8, cell_d.bbox.x_bottom_right) + self.assertEqual(5, cell_d.bbox.y_bottom_right) def test_no_split(self) -> None: cells = [ [ - Cell(x_top_left=160, y_top_left=321, x_bottom_right=825, y_bottom_right=369), - Cell(x_top_left=825, y_top_left=321, x_bottom_right=1494, y_bottom_right=369) + Cell(BBox(x_top_left=160, y_top_left=321, width=665, height=48)), + Cell(BBox(x_top_left=825, y_top_left=321, width=669, height=48)) ], [ - Cell(x_top_left=160, y_top_left=374, x_bottom_right=825, y_bottom_right=423), - Cell(x_top_left=825, y_top_left=374, x_bottom_right=1494, y_bottom_right=423) + Cell(BBox(x_top_left=160, y_top_left=374, width=665, height=49)), + Cell(BBox(x_top_left=825, y_top_left=374, width=669, height=49)) ] ]