Skip to content

Commit

Permalink
TLDR-861 added BBox to Cell; rename TableHeaderExtractor; refactor table recognizer; added tests
Browse files Browse the repository at this point in the history
  • Loading branch information
oksidgy committed Dec 13, 2024
1 parent 2a3d0e6 commit d5b1cc0
Show file tree
Hide file tree
Showing 12 changed files with 220 additions and 276 deletions.
60 changes: 11 additions & 49 deletions dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
from typing import List, Optional

from dedocutils.data_structures import BBox
Expand All @@ -9,64 +10,33 @@
class Cell(CellWithMeta):

@staticmethod
def copy_from(cell: "Cell",
x_top_left: Optional[int] = None,
x_bottom_right: Optional[int] = None,
y_top_left: Optional[int] = None,
y_bottom_right: Optional[int] = None) -> "Cell":
x_top_left = cell.x_top_left if x_top_left is None else x_top_left
x_bottom_right = cell.x_bottom_right if x_bottom_right is None else x_bottom_right
y_top_left = cell.y_top_left if y_top_left is None else y_top_left
y_bottom_right = cell.y_bottom_right if y_bottom_right is None else y_bottom_right

# TODO change x_top_left ... y_bottom_right to BBox

return Cell(x_top_left=x_top_left,
x_bottom_right=x_bottom_right,
y_top_left=y_top_left,
y_bottom_right=y_bottom_right,
id_con=cell.id_con,
lines=cell.lines,
colspan=cell.colspan,
rowspan=cell.rowspan,
invisible=cell.invisible,
is_attribute=cell.is_attribute,
is_attribute_required=cell.is_attribute_required,
rotated_angle=cell.rotated_angle,
uid=cell.uuid,
contour_coord=cell.con_coord)
def copy_from(cell: "Cell", bbox: Optional[BBox] = None) -> "Cell":
copy_cell = copy.deepcopy(cell)
if bbox:
copy_cell.bbox = bbox

return copy_cell

def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
if self.lines:
for line in self.lines:
line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
self.x_top_left += shift_x
self.x_bottom_right += shift_x
self.y_top_left += shift_y
self.y_bottom_right += shift_y

self.bbox.shift(shift_x=shift_x, shift_y=shift_y)
if self.con_coord:
self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)

def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
def __init__(self, bbox: BBox, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = Optional[None],
contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:

import uuid

assert x_top_left <= x_bottom_right
assert y_top_left <= y_bottom_right

self.lines = [] if lines is None else lines
super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible)

# TODO change to BBox
self.x_top_left = x_top_left
self.x_bottom_right = x_bottom_right
self.y_top_left = y_top_left
self.y_bottom_right = y_bottom_right

self.bbox = bbox
self.id_con = id_con

self.is_attribute = is_attribute
self.is_attribute_required = is_attribute_required
self.rotated_angle = rotated_angle
Expand Down Expand Up @@ -96,11 +66,3 @@ def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_hei

def __repr__(self) -> str:
return self.__str__()

@property
def width(self) -> int:
return self.x_bottom_right - self.x_top_left

@property
def height(self) -> int:
return self.y_bottom_right - self.y_top_left
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Dict, List, Optional, Tuple

import numpy as np
from dedocutils.data_structures import BBox

from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
from dedoc.utils.utils import flatten
Expand Down Expand Up @@ -55,25 +56,26 @@ def split(self, cells: List[List[Cell]]) -> List[List[Cell]]:
for row_id, row in enumerate(result_matrix):
for col_id, cell in enumerate(row):
if cell is None:
result_matrix[row_id][col_id] = Cell(x_top_left=horizontal_borders[row_id],
x_bottom_right=horizontal_borders[row_id + 1],
y_top_left=vertical_borders[col_id],
y_bottom_right=vertical_borders[col_id + 1])
bbox = BBox(x_top_left=int(horizontal_borders[row_id]),
y_top_left=int(vertical_borders[col_id]),
width=int(horizontal_borders[row_id + 1] - horizontal_borders[row_id]),
height=int(vertical_borders[col_id + 1] - vertical_borders[col_id]))
result_matrix[row_id][col_id] = Cell(bbox=bbox)
return result_matrix

@staticmethod
def __split_one_cell(cell: Cell, horizontal_borders: np.ndarray, vertical_borders: np.ndarray, result_matrix: List[List[Cell]]) -> None:
left_id, right_id = np.searchsorted(vertical_borders, [cell.x_top_left, cell.x_bottom_right])
top_id, bottom_id = np.searchsorted(horizontal_borders, [cell.y_top_left, cell.y_bottom_right])
left_id, right_id = np.searchsorted(vertical_borders, [cell.bbox.x_top_left, cell.bbox.x_bottom_right])
top_id, bottom_id = np.searchsorted(horizontal_borders, [cell.bbox.y_top_left, cell.bbox.y_bottom_right])
colspan = right_id - left_id
rowspan = bottom_id - top_id
for row_id in range(top_id, bottom_id):
for column_id in range(left_id, right_id):
new_cell = Cell.copy_from(cell,
x_top_left=vertical_borders[column_id],
x_bottom_right=vertical_borders[column_id + 1],
y_top_left=horizontal_borders[row_id],
y_bottom_right=horizontal_borders[row_id + 1])
bbox = BBox(x_top_left=int(vertical_borders[column_id]),
y_top_left=int(horizontal_borders[row_id]),
width=int(vertical_borders[column_id + 1] - vertical_borders[column_id]),
height=int(horizontal_borders[row_id + 1] - horizontal_borders[row_id]))
new_cell = Cell.copy_from(cell, bbox)
new_cell.invisible = True
result_matrix[row_id][column_id] = new_cell

Expand Down Expand Up @@ -106,20 +108,21 @@ def _merge_close_borders(self, cells: List[List[Cell]]) -> List[List[Cell]]:
@return: cells with merged borders
"""
horizontal_borders, vertical_borders = self.__get_borders(cells)
eps_vertical = self.eps * min((cell.width for cell in flatten(cells)), default=0)
eps_horizontal = self.eps * min((cell.height for cell in flatten(cells)), default=0)
eps_vertical = self.eps * min((cell.bbox.width for cell in flatten(cells)), default=0)
eps_horizontal = self.eps * min((cell.bbox.height for cell in flatten(cells)), default=0)
horizontal_dict = self.__get_border_dict(borders=horizontal_borders, threshold=eps_horizontal)
vertical_dict = self.__get_border_dict(borders=vertical_borders, threshold=eps_vertical)
result = []
for row in cells:
new_row = []
for cell in row:
x_top_left = vertical_dict[cell.x_top_left]
x_bottom_right = vertical_dict[cell.x_bottom_right]
y_top_left = horizontal_dict[cell.y_top_left]
y_bottom_right = horizontal_dict[cell.y_bottom_right]
x_top_left = vertical_dict[cell.bbox.x_top_left]
x_bottom_right = vertical_dict[cell.bbox.x_bottom_right]
y_top_left = horizontal_dict[cell.bbox.y_top_left]
y_bottom_right = horizontal_dict[cell.bbox.y_bottom_right]
if y_top_left < y_bottom_right and x_top_left < x_bottom_right:
new_cell = Cell.copy_from(cell, x_top_left=x_top_left, x_bottom_right=x_bottom_right, y_top_left=y_top_left, y_bottom_right=y_bottom_right)
bbox = BBox(x_top_left=x_top_left, y_top_left=y_top_left, width=x_bottom_right - x_top_left, height=y_bottom_right - y_top_left)
new_cell = Cell.copy_from(cell, bbox)
new_row.append(new_cell)
result.append(new_row)
return result
Expand All @@ -130,8 +133,8 @@ def __get_borders(cells: List[List[Cell]]) -> Tuple[List[int], List[int]]:
vertical_borders = []
for row in cells:
for cell in row:
horizontal_borders.append(cell.y_top_left)
horizontal_borders.append(cell.y_bottom_right)
vertical_borders.append(cell.x_top_left)
vertical_borders.append(cell.x_bottom_right)
horizontal_borders.append(cell.bbox.y_top_left)
horizontal_borders.append(cell.bbox.y_bottom_right)
vertical_borders.append(cell.bbox.x_top_left)
vertical_borders.append(cell.bbox.x_bottom_right)
return horizontal_borders, vertical_borders
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image

# Get width of all union cell
eps = len(union_cell)
x_left = union_cell[0].x_top_left + eps
x_right = union_cell[-1].x_bottom_right
x_left = union_cell[0].bbox.x_top_left + eps
x_right = union_cell[-1].bbox.x_bottom_right
# get y coordinate from cell before union cell
y_top_split = cell_splitter.con_coord.y_top_left
y_bottom_split = cell_splitter.con_coord.y_top_left + cell_splitter.con_coord.height
Expand All @@ -141,8 +141,8 @@ def _split_row(cell_splitter: Cell, union_cell: List[Cell], language: str, image
col_id = len(union_cell) - 1
result_row = copy.deepcopy(union_cell)
while col_id >= 0:
union_cell[col_id].y_top_left = y_top_split
union_cell[col_id].y_bottom_right = y_bottom_split
union_cell[col_id].bbox.y_top_left = y_top_split
union_cell[col_id].bbox.height = y_bottom_split - union_cell[col_id].bbox.y_top_left

cell_image, padding_value = OCRCellExtractor.upscale(image[y_top_split:y_bottom_split, x_left:x_right])
result_row[col_id].lines = __get_ocr_lines(cell_image, language, page_image=image,
Expand All @@ -163,10 +163,8 @@ def __get_ocr_lines(cell_image: np.ndarray, language: str, page_image: np.ndarra
text_line = OCRCellExtractor.get_line_with_meta("")
for word in line.words:
# do absolute coordinate on src_image (inside src_image)
word.bbox.y_top_left -= padding_cell_value
word.bbox.x_top_left -= padding_cell_value
word.bbox.y_top_left += cell_bbox.y_top_left
word.bbox.x_top_left += cell_bbox.x_top_left
word.bbox.shift(shift_x=-padding_cell_value, shift_y=-padding_cell_value)
word.bbox.shift(shift_x=cell_bbox.x_top_left, shift_y=cell_bbox.y_top_left)

# add space between words
if len(text_line) != 0:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.base_table_extractor import BaseTableExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableHeaderExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import equal_with_eps


Expand Down Expand Up @@ -117,12 +117,12 @@ def __get_width_cell_wo_separating(row: List[Cell]) -> List[int]:
end = None
for cell_id, cell in enumerate(row):
if prev_uid is None:
start = cell.x_top_left
start = cell.bbox.x_top_left
prev_uid = cell.uuid
elif prev_uid != cell.uuid:
widths.append(end - start)
start = cell.x_top_left
end = cell.x_bottom_right
start = cell.bbox.x_top_left
end = cell.bbox.x_bottom_right
if cell_id == len(row) - 1:
widths.append(end - start)
return widths
Expand Down Expand Up @@ -154,16 +154,16 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool:
return False

# condition 2. Exclusion of the duplicated header (if any)
attr1 = TableAttributeExtractor.get_header_table(t1.cells)
attr2 = TableAttributeExtractor.get_header_table(t2.cells)
attr1 = TableHeaderExtractor.get_header_table(t1.cells)
attr2 = TableHeaderExtractor.get_header_table(t2.cells)
t2_update = copy.deepcopy(t2)
if TableAttributeExtractor.is_equal_attributes(attr1, attr2):
if TableHeaderExtractor.is_equal_header(attr1, attr2):
t2_update.cells = t2_update.cells[len(attr2):]

if len(t2_update.cells) == 0 or len(t1.cells) == 0:
return False

TableAttributeExtractor.clear_attributes(t2_update.cells)
TableHeaderExtractor.clear_attributes(t2_update.cells)

# condition 3. Number of columns should be equal
if len(t1.cells[-1]) != len(t2_update.cells[0]):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.cell_splitter import CellSplitter
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.split_last_hor_union_cells import split_last_column
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.base_table_extractor import BaseTableExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableHeaderExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_tables_by_contours


Expand All @@ -23,7 +23,7 @@ def __init__(self, *, config: dict, logger: logging.Logger) -> None:

self.image = None
self.page_number = 0
self.attribute_selector = TableAttributeExtractor(logger=self.logger)
self.table_header_selector = TableHeaderExtractor(logger=self.logger)
self.count_vertical_extended = 0
self.splitter = CellSplitter()
self.table_options = TableTypeAdditionalOptions()
Expand All @@ -50,17 +50,6 @@ def extract_onepage_tables_from_image(self, image: np.ndarray, page_number: int,
location.bbox.rotate_coordinates(angle_rotate=-angle_rotate, image_shape=image.shape)
location.rotated_angle = angle_rotate

tables = self.__select_attributes_tables(tables=tables)

return tables

def __select_attributes_tables(self, tables: List[ScanTable]) -> List[ScanTable]:
for table in tables:
table = self.attribute_selector.set_attributes(table)

if self.config.get("debug_mode", False):
self._print_table_attr(table.cells)

return tables

def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable:
Expand All @@ -71,15 +60,12 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable:
matrix = []
line = []
for cell in table_tree.children:
if len(line) != 0 and abs(cell.cell_box.y_top_left - line[-1].y_top_left) > 15: # add eps
if len(line) != 0 and abs(cell.cell_box.y_top_left - line[-1].bbox.y_top_left) > 15: # add eps
cpy_line = copy.deepcopy(line)
matrix.append(cpy_line)
line.clear()

cell_ = Cell(x_top_left=cell.cell_box.x_top_left,
x_bottom_right=cell.cell_box.x_bottom_right,
y_top_left=cell.cell_box.y_top_left,
y_bottom_right=cell.cell_box.y_bottom_right,
cell_ = Cell(bbox=cell.cell_box,
id_con=cell.id_contours,
lines=cell.lines,
contour_coord=cell.cell_box)
Expand All @@ -88,7 +74,7 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable:

# sorting column in each row
for i in range(0, len(matrix)):
matrix[i] = sorted(matrix[i], key=lambda cell: cell.x_top_left, reverse=False)
matrix[i] = sorted(matrix[i], key=lambda cell: cell.bbox.x_top_left, reverse=False)

matrix_table = ScanTable(cells=matrix, bbox=table_tree.cell_box, page_number=self.page_number)

Expand Down Expand Up @@ -125,4 +111,9 @@ def handle_cells(self, cells: List[List[Cell]], table_type: str = "") -> List[Li
if self.table_options.split_last_column in table_type:
cells = split_last_column(cells, language=self.language, image=self.image)

self.table_header_selector.set_header_cells(cells)

if self.config.get("debug_mode", False):
self._print_table_attr(cells)

return cells
Loading

0 comments on commit d5b1cc0

Please sign in to comment.