Fix LAYOUT_TABLE prefix and suffix not being properly added

Belval · web-flow · commit b7510e68f946 · 2024-01-29T16:56:52.000-08:00
diff --git a/tests/test_get_text_and_words.py b/tests/test_get_text_and_words.py
@@ -9,6 +9,7 @@
 
 from textractor import Textractor
 from textractor.data.constants import TextractFeatures
+from textractor.data.text_linearization_config import TextLinearizationConfig
 from textractor.entities.document import Document
 from textractor.exceptions import InvalidProfileNameError, S3FilePathMissing
 from textractor.utils.s3_utils import upload_to_s3, delete_from_s3
@@ -85,3 +86,165 @@ def test_detect_no_missing_words(self):
             word_ids = set([w.id for w in words])
 
             self.assertTrue(original_word_set.issubset(word_ids))
+
+    def test_table_prefixes_and_suffixes_in_text(self):  # Each expected prefix/suffix token must appear in the linearized text
+        if os.environ.get("CALL_TEXTRACT"):  # When set, analyze the sample image and rewrite the fixture; otherwise load the fixture
+            document = self.extractor.analyze_document(
+                os.path.join(self.current_directory, "fixtures/single-page-1.png"),
+                features=[
+                    TextractFeatures.LAYOUT,
+                    TextractFeatures.TABLES,
+                    TextractFeatures.FORMS,
+                    TextractFeatures.SIGNATURES
+                ]
+            )
+            with open(get_fixture_path(), "w") as f:
+                json.dump(document.response, f)
+        else:
+            document = Document.open(get_fixture_path())
+
+        config = TextLinearizationConfig(
+            title_prefix = "<title>",  #: Prefix for title layout elements
+            title_suffix = "</title>",  #: Suffix for title layout elements
+            table_layout_prefix = "<table_layout>",  #: Prefix for table layout elements
+            table_layout_suffix = "</table_layout>",  #: Suffix for table layout elements
+            table_prefix = "<table>",  #: Prefix for table elements
+            table_suffix = "</table>",  #: Suffix for table elements
+            table_row_prefix = "<tr>",  #: Prefix for table row
+            table_row_suffix = "</tr>",  #: Suffix for table row
+            table_cell_prefix = "<td>",  #: Prefix for table cell
+            table_cell_suffix = "</td>",  #: Suffix for table cell
+            table_cell_header_prefix = "<th>",  #: Prefix for header cell
+            table_cell_header_suffix = "</th>",  #: Suffix for header cell
+            header_prefix = "<header>",  #: Prefix for header layout elements
+            header_suffix = "</header>",  #: Suffix for header layout elements
+            section_header_prefix = "<section_header>",  #: Prefix for section header layout elements
+            section_header_suffix = "</section_header>",  #: Suffix for section header layout elements
+            text_prefix = "<text>",  #: Prefix for text layout elements
+            text_suffix = "</text>",  #: Suffix for text layout elements
+            key_value_layout_prefix = "<kv_layout>",  #: Prefix for key_value layout elements (not for individual key-value elements)
+            key_value_layout_suffix = "</kv_layout>",  #: Suffix for key_value layout elements (not for individual key-value elements)
+            key_value_prefix = "<kv>",  #: Prefix for key-value elements
+            key_value_suffix = "</kv>",  #: Suffix for key-value elements
+            key_prefix = "<key>",  #: Prefix for key elements
+            key_suffix = "</key>",  #: Suffix for key elements
+            value_prefix = "<value>",  #: Prefix for value elements
+            value_suffix = "</value>",  #: Suffix for value elements
+            add_prefixes_and_suffixes_in_text=True,
+            add_prefixes_and_suffixes_as_words=True,
+        )
+
+        text, _ = document.get_text_and_words(config)  # Only the linearized text is checked here
+
+        for token in [
+            "<title>",
+            "</title>",
+            "<table>",
+            "</table>",
+            "<table_layout>",
+            "</table_layout>",
+            "<tr>",
+            "</tr>",
+            "<td>",
+            "</td>",
+            # Sample does not have header cells
+            #"<th>",
+            #"</th>",
+            # Sample does not have header
+            #"<header>",
+            #"</header>",
+            "<section_header>",
+            "</section_header>",
+            # Sample presumably does not have a key-value layout element (not verified here)
+            #"<kv_layout>",
+            #"</kv_layout>",
+            "<kv>",
+            "</kv>",
+            "<key>",
+            "</key>",
+            "<value>",
+            "</value>",
+        ]:
+            self.assertTrue(token in text, f"{token} is not in text")
+
+    def test_table_prefixes_and_suffixes_in_words(self):  # Each expected prefix/suffix token must appear among the returned words
+        if os.environ.get("CALL_TEXTRACT"):  # When set, analyze the sample image and rewrite the fixture; otherwise load the fixture
+            document = self.extractor.analyze_document(
+                os.path.join(self.current_directory, "fixtures/single-page-1.png"),
+                features=[
+                    TextractFeatures.LAYOUT,
+                    TextractFeatures.TABLES,
+                    TextractFeatures.FORMS,
+                    TextractFeatures.SIGNATURES
+                ]
+            )
+            with open(get_fixture_path(), "w") as f:
+                json.dump(document.response, f)
+        else:
+            document = Document.open(get_fixture_path())
+
+        config = TextLinearizationConfig(
+            title_prefix = "<title>",  #: Prefix for title layout elements
+            title_suffix = "</title>",  #: Suffix for title layout elements
+            table_layout_prefix = "<table_layout>",  #: Prefix for table layout elements
+            table_layout_suffix = "</table_layout>",  #: Suffix for table layout elements
+            table_prefix = "<table>",  #: Prefix for table elements
+            table_suffix = "</table>",  #: Suffix for table elements
+            table_row_prefix = "<tr>",  #: Prefix for table row
+            table_row_suffix = "</tr>",  #: Suffix for table row
+            table_cell_prefix = "<td>",  #: Prefix for table cell
+            table_cell_suffix = "</td>",  #: Suffix for table cell
+            table_cell_header_prefix = "<th>",  #: Prefix for header cell
+            table_cell_header_suffix = "</th>",  #: Suffix for header cell
+            header_prefix = "<header>",  #: Prefix for header layout elements
+            header_suffix = "</header>",  #: Suffix for header layout elements
+            section_header_prefix = "<section_header>",  #: Prefix for section header layout elements
+            section_header_suffix = "</section_header>",  #: Suffix for section header layout elements
+            text_prefix = "<text>",  #: Prefix for text layout elements
+            text_suffix = "</text>",  #: Suffix for text layout elements
+            key_value_layout_prefix = "<kv_layout>",  #: Prefix for key_value layout elements (not for individual key-value elements)
+            key_value_layout_suffix = "</kv_layout>",  #: Suffix for key_value layout elements (not for individual key-value elements)
+            key_value_prefix = "<kv>",  #: Prefix for key-value elements
+            key_value_suffix = "</kv>",  #: Suffix for key-value elements
+            key_prefix = "<key>",  #: Prefix for key elements
+            key_suffix = "</key>",  #: Suffix for key elements
+            value_prefix = "<value>",  #: Prefix for value elements
+            value_suffix = "</value>",  #: Suffix for value elements
+            add_prefixes_and_suffixes_in_text=True,
+            add_prefixes_and_suffixes_as_words=True,
+        )
+
+        _, words = document.get_text_and_words(config)  # Only the word list is checked here
+
+        words = [w.text for w in words]  # Compare on the words' text, not on the Word objects
+
+        for token in [
+            "<title>",
+            "</title>",
+            "<table>",
+            "</table>",
+            "<table_layout>",
+            "</table_layout>",
+            "<tr>",
+            "</tr>",
+            "<td>",
+            "</td>",
+            # Sample does not have header cells
+            #"<th>",
+            #"</th>",
+            # Sample does not have header
+            #"<header>",
+            #"</header>",
+            "<section_header>",
+            "</section_header>",
+            # Sample presumably does not have a key-value layout element (not verified here)
+            #"<kv_layout>",
+            #"</kv_layout>",
+            "<kv>",
+            "</kv>",
+            "<key>",
+            "</key>",
+            "<value>",
+            "</value>",
+        ]:
+            self.assertTrue(token in words, f"{token} is not in words")
diff --git a/textractor/entities/layout.py b/textractor/entities/layout.py
@@ -203,9 +203,9 @@ def get_text_and_words(
                 )
             if config.add_prefixes_and_suffixes_as_words:
                 final_words = (
-                    ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_prefix, is_structure=True)] if config.title_prefix else []) + 
+                    ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_prefix, is_structure=True)] if config.header_prefix else []) + 
                     final_words + 
-                    ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_suffix, is_structure=True)] if config.title_suffix else []) 
+                    ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_suffix, is_structure=True)] if config.header_suffix else []) 
                 )
         elif self.layout_type == LAYOUT_SECTION_HEADER:
             final_text, final_words = linearize_children(
@@ -217,9 +217,9 @@ def get_text_and_words(
                 )
             if config.add_prefixes_and_suffixes_as_words:
                 final_words = (
-                    ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_prefix, is_structure=True)] if config.title_prefix else []) + 
+                    ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_prefix, is_structure=True)] if config.section_header_prefix else []) + 
                     final_words + 
-                    ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_suffix, is_structure=True)] if config.title_suffix else []) 
+                    ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_suffix, is_structure=True)] if config.section_header_suffix else []) 
                 )
         elif self.layout_type == LAYOUT_TEXT:
             final_text, final_words = linearize_children(
@@ -235,6 +235,29 @@ def get_text_and_words(
                 is_layout_table=self.layout_type == LAYOUT_TABLE,
             )
 
+            if config.add_prefixes_and_suffixes_in_text:  # Wrap the linearized text in the markers of this layout type
+                if self.layout_type == LAYOUT_TABLE:
+                    final_text = (
+                        config.table_layout_prefix + final_text + config.table_layout_suffix
+                    )
+                elif self.layout_type == LAYOUT_KEY_VALUE:  # Key-value layouts use their own markers, matching the words branch below
+                    final_text = (
+                        config.key_value_layout_prefix + final_text + config.key_value_layout_suffix
+                    )
+            if config.add_prefixes_and_suffixes_as_words:
+                if self.layout_type == LAYOUT_TABLE:
+                    final_words = (
+                        ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.table_layout_prefix, is_structure=True)] if config.table_layout_prefix else []) + 
+                        final_words + 
+                        ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.table_layout_suffix, is_structure=True)] if config.table_layout_suffix else []) 
+                    )
+                elif self.layout_type == LAYOUT_KEY_VALUE:
+                    final_words = (
+                        ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.key_value_layout_prefix, is_structure=True)] if config.key_value_layout_prefix else []) + 
+                        final_words + 
+                        ([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.key_value_layout_suffix, is_structure=True)] if config.key_value_layout_suffix else []) 
+                    )
+
         while (
             config.layout_element_separator * (config.max_number_of_consecutive_new_lines + 1) in final_text
         ):
diff --git a/textractor/entities/table.py b/textractor/entities/table.py
@@ -716,6 +716,8 @@ def get_text_and_words(
             w.table_id = str(self.id)
             w.table_bbox = self.bbox
 
+
+        text = (config.table_prefix if config.add_prefixes_and_suffixes_in_text else "")
         # Markdown
         if config.table_linearization_format == "markdown":
             df = self.to_pandas(
@@ -727,12 +729,11 @@ def get_text_and_words(
                 headers = df.columns if has_column else []
             else:
                 headers = df.columns
-            text = df.to_markdown(
+            text += df.to_markdown(
                 index=False, tablefmt=config.table_tabulate_format, headers=headers
             )
         # Plaintext
         else:
-            text = ""
             rows = itertools.groupby(self.table_cells, key=lambda cell: cell.row_index)
             processed_cells = set()
             for _, row in rows:
@@ -764,7 +765,7 @@ def get_text_and_words(
                     text = text[:-1]
                 text += (config.table_row_suffix if config.add_prefixes_and_suffixes_in_text else "")
                 text += "\n"
-
+        text += (config.table_suffix if config.add_prefixes_and_suffixes_in_text else "")
         return text, words
 
     def to_txt(self):
diff --git a/textractor/utils/text_utils.py b/textractor/utils/text_utils.py
@@ -149,9 +149,7 @@ def part_of_same_paragraph(element1, element2, config=config):
         for idx, element in enumerate(sorted_group):
             text_element, words_element = element.get_text_and_words(config)
             if "Table" in element.__class__.__name__ and len(words_element):
-                result += (
-                    config.table_layout_prefix if config.add_prefixes_and_suffixes_in_text else ""
-                ) + text_element
+                result += text_element
                 for w in words_element:
                     added_words.add(w.id)
                 words_output += words_element
@@ -161,11 +159,7 @@ def part_of_same_paragraph(element1, element2, config=config):
                     if prev_element and part_of_same_paragraph(prev_element, element, config) else
                     config.same_layout_element_separator
                 )
-                result += separator + (
-                    (config.key_value_layout_prefix if config.add_prefixes_and_suffixes_in_text else "") +
-                    text_element + 
-                    (config.key_value_layout_suffix if config.add_prefixes_and_suffixes_in_text else "")
-                )
+                result += separator + text_element
                 for w in words_element:
                     added_words.add(w.id)
                 words_output += words_element