ESL-159 fixed extract boxes from pdfminer reader (#350)

ispras · Oct 10, 2023 · 0fae5f3 · 0fae5f3
1 parent f174ad6
commit 0fae5f3
Show file tree

Hide file tree

Showing 6 changed files with 57 additions and 7 deletions.
diff --git a/dedoc/data_structures/concrete_annotations/bbox_annotation.py b/dedoc/data_structures/concrete_annotations/bbox_annotation.py
@@ -1,4 +1,5 @@
 import json
+from typing import Tuple
 
 from dedocutils.data_structures import BBox
 from flask_restx import Api, Model, fields
@@ -25,6 +26,18 @@ def __init__(self, start: int, end: int, value: BBox, page_width: int, page_heig
 
         super().__init__(start=start, end=end, name=BBoxAnnotation.name, value=json.dumps(value.to_relative_dict(page_width, page_height)))
 
+    @staticmethod
+    def get_bbox_from_value(value: str) -> Tuple[BBox, int, int]:
+        """
+        Returns: BBox object (relative values scaled to integer page coordinates), page_width, page_height
+        """
+        bbox_dict = json.loads(value)
+        bbox = BBox(x_top_left=int(bbox_dict["x_top_left"] * bbox_dict["page_width"]),
+                    y_top_left=int(bbox_dict["y_top_left"] * bbox_dict["page_height"]),
+                    width=int(bbox_dict["width"] * bbox_dict["page_width"]),
+                    height=int(bbox_dict["height"] * bbox_dict["page_height"]))
+        return bbox, bbox_dict["page_width"], bbox_dict["page_height"]
+
     @staticmethod
     def get_api_dict(api: Api) -> Model:
         return api.model("BBoxAnnotation", {

diff --git a/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py b/dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 
 import numpy as np
 
@@ -8,8 +8,11 @@
 
 class PageWithBBox:
 
-    def __init__(self, image: np.ndarray, bboxes: List[TextWithBBox], page_num: int, attachments: List[PdfImageAttachment] = None) -> None:
+    def __init__(self, image: np.ndarray, bboxes: List[TextWithBBox], page_num: int, attachments: List[PdfImageAttachment] = None,
+                 pdf_page_width: Optional[int] = None, pdf_page_height: Optional[int] = None) -> None:
         self.image = image
         self.bboxes = bboxes
         self.page_num = page_num
         self.attachments = [] if attachments is None else attachments
+        self.pdf_page_width = pdf_page_width
+        self.pdf_page_height = pdf_page_height
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py
@@ -3,6 +3,7 @@
 
 from dedocutils.data_structures import BBox
 
+from dedoc.data_structures import BBoxAnnotation
 from dedoc.data_structures.annotation import Annotation
 from dedoc.data_structures.line_with_meta import LineWithMeta
 
@@ -70,6 +71,24 @@ def get_text(self) -> str:
     def get_annotations(self) -> List[Annotation]:
         return LineWithMeta.join(self.lines, delimiter="\n").annotations
 
+    def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None:
+        for i_line, _ in enumerate(self.lines):
+            for i_ann, annotation in enumerate(self.lines[i_line].annotations):
+                if annotation.name != "bounding box":
+                    continue
+
+                bbox, page_width, page_height = BBoxAnnotation.get_bbox_from_value(annotation.value)
+                k_w = new_page_width / page_width
+                k_h = new_page_height / page_height
+                new_bbox = BBox(x_top_left=int(bbox.x_top_left * k_w), y_top_left=int(bbox.y_top_left * k_h),
+                                width=int(bbox.width * k_w), height=int(bbox.height * k_h))
+
+                self.lines[i_line].annotations[i_ann] = BBoxAnnotation(start=annotation.start,
+                                                                       end=annotation.end,
+                                                                       value=new_bbox,
+                                                                       page_width=new_page_width,
+                                                                       page_height=new_page_height)
+
     def __repr__(self) -> str:
         return self.__str__()
 

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
@@ -63,12 +63,24 @@ def _process_one_page(self,
         unreadable_blocks = [location.bbox for table in tables for location in table.locations]
         page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
         lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
+        self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)
 
         if self.config.get("labeling_mode"):
             save_page_with_bbox(page=page, config=self.config, document_name=os.path.basename(path))
 
         return lines, tables, page.attachments, []
 
+    def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None:
+        """
+        Convert the table boxes' page width and height into PDF space, like textual lines
+        """
+
+        for table in tables:
+            for row in table.matrix_cells:
+
+                for cell in row:
+                    cell.change_lines_boxes_page_width_height(new_page_width=pdf_width, new_page_height=pdf_height)
+
     def _inside_any_unreadable_block(self, obj_bbox: BBox, unreadable_blocks: List[BBox]) -> bool:
         """
         Check obj_bbox inside some unreadable blocks or not

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
@@ -107,7 +107,7 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithB
 
         attachments = images if len(images) < 10 else []
 
-        return PageWithBBox(bboxes=bboxes, image=image_page, page_num=page_number, attachments=attachments)
+        return PageWithBBox(bboxes=bboxes, image=image_page, page_num=page_number, attachments=attachments, pdf_page_height=height, pdf_page_width=width)
 
     def __extract_image(self,
                         directory: str,

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_utils.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_utils.py
@@ -1,4 +1,3 @@
-import json
 import re
 from typing import IO, List, Match, Optional, Tuple
 
@@ -33,9 +32,13 @@ def draw_layout_element(image_src: np.ndarray,
 
 def draw_annotation(image: np.ndarray, annotations: List[BBoxAnnotation]) -> None:
     for ann in annotations:
-        bbox = json.loads(ann.value)
-        p1 = (int(bbox["x_top_left"] * bbox["page_width"]), int(bbox["y_top_left"] * bbox["page_height"]))
-        p2 = (int((bbox["x_top_left"] + bbox["width"]) * bbox["page_width"]), int((bbox["y_top_left"] + bbox["height"]) * bbox["page_height"]))
+        bbox, page_width, page_height = BBoxAnnotation.get_bbox_from_value(ann.value)
+
+        if page_height != image.shape[0] or page_width != image.shape[1]:
+            image = cv2.resize(image, dsize=(page_width, page_height), interpolation=cv2.INTER_CUBIC)
+
+        p1 = (bbox.x_top_left, bbox.y_top_left)
+        p2 = (bbox.x_bottom_right, bbox.y_bottom_right)
         cv2.rectangle(image, p1, p2, (0, 255, 0))