Skip to content

Commit

Permalink
ESL-159 fixed extract boxes from pdfminer reader (#350)
Browse files Browse the repository at this point in the history
  • Loading branch information
oksidgy authored Oct 10, 2023
1 parent f174ad6 commit 0fae5f3
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 7 deletions.
13 changes: 13 additions & 0 deletions dedoc/data_structures/concrete_annotations/bbox_annotation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
from typing import Tuple

from dedocutils.data_structures import BBox
from flask_restx import Api, Model, fields
Expand All @@ -25,6 +26,18 @@ def __init__(self, start: int, end: int, value: BBox, page_width: int, page_heig

super().__init__(start=start, end=end, name=BBoxAnnotation.name, value=json.dumps(value.to_relative_dict(page_width, page_height)))

@staticmethod
def get_bbox_from_value(value: str) -> Tuple[BBox, int, int]:
    """
    Restore an absolute bounding box from a serialized annotation value.

    :param value: JSON string with the keys ``x_top_left``, ``y_top_left``, ``width``, ``height``,
        ``page_width`` and ``page_height``. The four box fields are multiplied by the page size here,
        so they are presumably stored as fractions of the page (verify against ``BBox.to_relative_dict``).
    :return: BBox object, page_width, page_height
    """
    bbox_dict = json.loads(value)
    page_width, page_height = bbox_dict["page_width"], bbox_dict["page_height"]

    # round() instead of int(): a product such as 0.29 * 100 evaluates to 28.999999999999996,
    # so truncation would move the restored box one pixel away from the value that was serialized
    bbox = BBox(x_top_left=round(bbox_dict["x_top_left"] * page_width),
                y_top_left=round(bbox_dict["y_top_left"] * page_height),
                width=round(bbox_dict["width"] * page_width),
                height=round(bbox_dict["height"] * page_height))
    return bbox, page_width, page_height

@staticmethod
def get_api_dict(api: Api) -> Model:
return api.model("BBoxAnnotation", {
Expand Down
7 changes: 5 additions & 2 deletions dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List
from typing import List, Optional

import numpy as np

Expand All @@ -8,8 +8,11 @@

class PageWithBBox:
    """
    Plain container for the data of one page: image, text boxes, page number, attachments
    and (optionally) the page size reported by the PDF reader.
    """

    def __init__(self,
                 image: np.ndarray,
                 bboxes: List[TextWithBBox],
                 page_num: int,
                 attachments: Optional[List[PdfImageAttachment]] = None,
                 pdf_page_width: Optional[int] = None,
                 pdf_page_height: Optional[int] = None) -> None:
        """
        :param image: image of the page
        :param bboxes: text fragments of the page together with their bounding boxes
        :param page_num: number of the page
        :param attachments: images attached to the page, an empty list is stored when None is given
        :param pdf_page_width: page width supplied by the reader, None when it was not provided
        :param pdf_page_height: page height supplied by the reader, None when it was not provided
        """
        self.image = image
        self.bboxes = bboxes
        self.page_num = page_num
        # a fresh list per instance, so pages never share one attachments list
        self.attachments = [] if attachments is None else attachments
        self.pdf_page_width = pdf_page_width
        self.pdf_page_height = pdf_page_height
19 changes: 19 additions & 0 deletions dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from dedocutils.data_structures import BBox

from dedoc.data_structures import BBoxAnnotation
from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.line_with_meta import LineWithMeta

Expand Down Expand Up @@ -70,6 +71,24 @@ def get_text(self) -> str:
def get_annotations(self) -> List[Annotation]:
    """
    Return the annotations of the single line obtained by joining all cell lines with a line break.
    """
    merged_line = LineWithMeta.join(self.lines, delimiter="\n")
    return merged_line.annotations

def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None:
    """
    Re-express every bounding box annotation of the cell's lines relative to a new page size.

    Annotations whose name is not "bounding box" are left untouched. Each bounding box annotation
    is replaced in the line's annotation list by a new BBoxAnnotation with the same start and end,
    whose box is scaled by the ratio between the new page size and the page size stored in the annotation.

    :param new_page_width: page width the boxes are scaled to
    :param new_page_height: page height the boxes are scaled to
    """
    for line in self.lines:
        for i_ann, annotation in enumerate(line.annotations):
            if annotation.name != "bounding box":
                continue

            bbox, page_width, page_height = BBoxAnnotation.get_bbox_from_value(annotation.value)
            k_w = new_page_width / page_width
            k_h = new_page_height / page_height
            new_bbox = BBox(x_top_left=int(bbox.x_top_left * k_w), y_top_left=int(bbox.y_top_left * k_h),
                            width=int(bbox.width * k_w), height=int(bbox.height * k_h))

            # assignment by index swaps the item without changing the length of the list being enumerated
            line.annotations[i_ann] = BBoxAnnotation(start=annotation.start,
                                                     end=annotation.end,
                                                     value=new_bbox,
                                                     page_width=new_page_width,
                                                     page_height=new_page_height)

def __repr__(self) -> str:
    """
    Return exactly the text produced by ``__str__``, so both representations of the object coincide.
    """
    text = self.__str__()
    return text

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,24 @@ def _process_one_page(self,
unreadable_blocks = [location.bbox for table in tables for location in table.locations]
page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)

if self.config.get("labeling_mode"):
save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path))

return lines, tables, page.attachments, []

def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None:
    """
    Rescale the bounding boxes of all table cells to the PDF page size, as is done for textual lines.

    :param pdf_width: width of the PDF page; None means the size is unknown and the boxes are left as they are
    :param pdf_height: height of the PDF page; None means the size is unknown and the boxes are left as they are
    :param tables: tables whose cells (``table.matrix_cells`` rows) are updated in place
    """
    if pdf_width is None or pdf_height is None:
        # the caller passes page.pdf_page_width / page.pdf_page_height, which default to None;
        # without a target size there is nothing to rescale to, and dividing None would raise TypeError
        return

    for table in tables:
        for row in table.matrix_cells:
            for cell in row:
                cell.change_lines_boxes_page_width_height(new_page_width=pdf_width, new_page_height=pdf_height)

def _inside_any_unreadable_block(self, obj_bbox: BBox, unreadable_blocks: List[BBox]) -> bool:
"""
Check obj_bbox inside some unreadable blocks or not
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithB

attachments = images if len(images) < 10 else []

return PageWithBBox(bboxes=bboxes, image=image_page, page_num=page_number, attachments=attachments)
return PageWithBBox(bboxes=bboxes, image=image_page, page_num=page_number, attachments=attachments, pdf_page_height=height, pdf_page_width=width)

def __extract_image(self,
directory: str,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import re
from typing import IO, List, Match, Optional, Tuple

Expand Down Expand Up @@ -33,9 +32,13 @@ def draw_layout_element(image_src: np.ndarray,

def draw_annotation(image: np.ndarray, annotations: List[BBoxAnnotation]) -> None:
    """
    Draw a green rectangle on the image for every bounding box annotation.

    The image is modified in place and nothing is returned. When the page size stored in an annotation
    differs from the image size, the box corners are scaled to the image's pixel grid.

    :param image: image to draw on, its first two dimensions are taken as height and width
    :param annotations: bounding box annotations whose values are parsed by BBoxAnnotation.get_bbox_from_value
    """
    for ann in annotations:
        bbox, page_width, page_height = BBoxAnnotation.get_bbox_from_value(ann.value)

        # Scale the coordinates instead of resizing the image: cv2.resize returns a new array,
        # so rectangles drawn on it would never reach the caller's image (this function returns None).
        # When the sizes already match both factors are 1.0 and the coordinates are unchanged.
        image_height, image_width = image.shape[:2]
        scale_x = image_width / page_width
        scale_y = image_height / page_height

        p1 = (int(bbox.x_top_left * scale_x), int(bbox.y_top_left * scale_y))
        p2 = (int(bbox.x_bottom_right * scale_x), int(bbox.y_bottom_right * scale_y))
        cv2.rectangle(image, p1, p2, (0, 255, 0))


Expand Down

0 comments on commit 0fae5f3

Please sign in to comment.