Skip to content

Commit b7510e6

Browse files
authored
Fix LAYOUT_TABLE prefix and suffix not being properly added
2 parents 336f655 + 76ce648 commit b7510e6

File tree

4 files changed

+196
-15
lines changed

4 files changed

+196
-15
lines changed

tests/test_get_text_and_words.py

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from textractor import Textractor
1111
from textractor.data.constants import TextractFeatures
12+
from textractor.data.text_linearization_config import TextLinearizationConfig
1213
from textractor.entities.document import Document
1314
from textractor.exceptions import InvalidProfileNameError, S3FilePathMissing
1415
from textractor.utils.s3_utils import upload_to_s3, delete_from_s3
@@ -85,3 +86,165 @@ def test_detect_no_missing_words(self):
8586
word_ids = set([w.id for w in words])
8687

8788
self.assertTrue(original_word_set.issubset(word_ids))
89+
90+
def test_table_prefixes_and_suffixes_in_text(self):
    """Check that the configured prefix/suffix markers appear in the linearized text.

    When the ``CALL_TEXTRACT`` environment variable is set, the sample image
    is analyzed through ``self.extractor`` and the raw response is written to
    the fixture path; otherwise the document is loaded from that fixture.
    """
    if os.environ.get("CALL_TEXTRACT"):
        document = self.extractor.analyze_document(
            os.path.join(self.current_directory, "fixtures/single-page-1.png"),
            features=[
                TextractFeatures.LAYOUT,
                TextractFeatures.TABLES,
                TextractFeatures.FORMS,
                TextractFeatures.SIGNATURES,
            ],
        )
        # Refresh the stored fixture so offline runs replay this response.
        with open(get_fixture_path(), "w") as f:
            json.dump(document.response, f)
    else:
        document = Document.open(get_fixture_path())

    # Every structural element gets a distinctive XML-like marker so that its
    # presence in the output can be checked with a plain containment test.
    config = TextLinearizationConfig(
        title_prefix="<title>",  #: Prefix for title layout elements
        title_suffix="</title>",  #: Suffix for title layout elements
        table_layout_prefix="<table_layout>",  #: Prefix for table layout elements
        table_layout_suffix="</table_layout>",  #: Suffix for table layout elements
        table_prefix="<table>",  #: Prefix for table elements
        table_suffix="</table>",  #: Suffix for table elements
        table_row_prefix="<tr>",  #: Prefix for table row
        table_row_suffix="</tr>",  #: Suffix for table row
        table_cell_prefix="<td>",  #: Prefix for table cell
        table_cell_suffix="</td>",  #: Suffix for table cell
        table_cell_header_prefix="<th>",  #: Prefix for header cell
        table_cell_header_suffix="</th>",  #: Suffix for header cell
        header_prefix="<header>",  #: Prefix for header layout elements
        header_suffix="</header>",  #: Suffix for header layout elements
        section_header_prefix="<section_header>",  #: Prefix for section header layout elements
        section_header_suffix="</section_header>",  #: Suffix for section header layout elements
        text_prefix="<text>",  #: Prefix for text layout elements
        text_suffix="</text>",  #: Suffix for text layout elements
        key_value_layout_prefix="<kv_layout>",  #: Prefix for key_value layout elements (not for individual key-value elements)
        key_value_layout_suffix="</kv_layout>",  #: Suffix for key_value layout elements (not for individual key-value elements)
        key_value_prefix="<kv>",  #: Prefix for key-value elements
        key_value_suffix="</kv>",  #: Suffix for key-value elements
        key_prefix="<key>",  #: Prefix for key elements
        key_suffix="</key>",  #: Suffix for key elements
        value_prefix="<value>",  #: Prefix for value elements
        value_suffix="</value>",  #: Suffix for value elements
        add_prefixes_and_suffixes_in_text=True,
        add_prefixes_and_suffixes_as_words=True,
    )

    text, _ = document.get_text_and_words(config)

    # Markers deliberately NOT asserted:
    #   <th>, </th>                - sample does not have header cells
    #   <header>, </header>        - sample does not have a header
    #   <kv_layout>, </kv_layout>  - presumably the sample has no key-value
    #                                layout element; the previous comment was
    #                                copied from the header case (TODO confirm)
    expected_tokens = [
        "<title>",
        "</title>",
        "<table>",
        "</table>",
        "<table_layout>",
        "</table_layout>",
        "<tr>",
        "</tr>",
        "<td>",
        "</td>",
        "<section_header>",
        "</section_header>",
        "<kv>",
        "</kv>",
        "<key>",
        "</key>",
        "<value>",
        "</value>",
    ]
    for token in expected_tokens:
        # subTest keeps going after a failure so one run lists every missing marker.
        with self.subTest(token=token):
            self.assertIn(token, text, f"{token} is not in text")
169+
170+
def test_table_prefixes_and_suffixes_in_words(self):
    """Check that the configured prefix/suffix markers are emitted as words.

    When the ``CALL_TEXTRACT`` environment variable is set, the sample image
    is analyzed through ``self.extractor`` and the raw response is written to
    the fixture path; otherwise the document is loaded from that fixture.
    """
    if os.environ.get("CALL_TEXTRACT"):
        document = self.extractor.analyze_document(
            os.path.join(self.current_directory, "fixtures/single-page-1.png"),
            features=[
                TextractFeatures.LAYOUT,
                TextractFeatures.TABLES,
                TextractFeatures.FORMS,
                TextractFeatures.SIGNATURES,
            ],
        )
        # Refresh the stored fixture so offline runs replay this response.
        with open(get_fixture_path(), "w") as f:
            json.dump(document.response, f)
    else:
        document = Document.open(get_fixture_path())

    # Every structural element gets a distinctive XML-like marker so that its
    # presence in the output can be checked with a plain containment test.
    config = TextLinearizationConfig(
        title_prefix="<title>",  #: Prefix for title layout elements
        title_suffix="</title>",  #: Suffix for title layout elements
        table_layout_prefix="<table_layout>",  #: Prefix for table layout elements
        table_layout_suffix="</table_layout>",  #: Suffix for table layout elements
        table_prefix="<table>",  #: Prefix for table elements
        table_suffix="</table>",  #: Suffix for table elements
        table_row_prefix="<tr>",  #: Prefix for table row
        table_row_suffix="</tr>",  #: Suffix for table row
        table_cell_prefix="<td>",  #: Prefix for table cell
        table_cell_suffix="</td>",  #: Suffix for table cell
        table_cell_header_prefix="<th>",  #: Prefix for header cell
        table_cell_header_suffix="</th>",  #: Suffix for header cell
        header_prefix="<header>",  #: Prefix for header layout elements
        header_suffix="</header>",  #: Suffix for header layout elements
        section_header_prefix="<section_header>",  #: Prefix for section header layout elements
        section_header_suffix="</section_header>",  #: Suffix for section header layout elements
        text_prefix="<text>",  #: Prefix for text layout elements
        text_suffix="</text>",  #: Suffix for text layout elements
        key_value_layout_prefix="<kv_layout>",  #: Prefix for key_value layout elements (not for individual key-value elements)
        key_value_layout_suffix="</kv_layout>",  #: Suffix for key_value layout elements (not for individual key-value elements)
        key_value_prefix="<kv>",  #: Prefix for key-value elements
        key_value_suffix="</kv>",  #: Suffix for key-value elements
        key_prefix="<key>",  #: Prefix for key elements
        key_suffix="</key>",  #: Suffix for key elements
        value_prefix="<value>",  #: Prefix for value elements
        value_suffix="</value>",  #: Suffix for value elements
        add_prefixes_and_suffixes_in_text=True,
        add_prefixes_and_suffixes_as_words=True,
    )

    _, words = document.get_text_and_words(config)

    # Compare on the textual content only; a separate name avoids rebinding
    # ``words`` from Word objects to plain strings.
    word_texts = [w.text for w in words]

    # Markers deliberately NOT asserted:
    #   <th>, </th>                - sample does not have header cells
    #   <header>, </header>        - sample does not have a header
    #   <kv_layout>, </kv_layout>  - presumably the sample has no key-value
    #                                layout element; the previous comment was
    #                                copied from the header case (TODO confirm)
    expected_tokens = [
        "<title>",
        "</title>",
        "<table>",
        "</table>",
        "<table_layout>",
        "</table_layout>",
        "<tr>",
        "</tr>",
        "<td>",
        "</td>",
        "<section_header>",
        "</section_header>",
        "<kv>",
        "</kv>",
        "<key>",
        "</key>",
        "<value>",
        "</value>",
    ]
    for token in expected_tokens:
        # subTest keeps going after a failure so one run lists every missing marker.
        with self.subTest(token=token):
            self.assertIn(token, word_texts, f"{token} is not in words")

textractor/entities/layout.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -203,9 +203,9 @@ def get_text_and_words(
203203
)
204204
if config.add_prefixes_and_suffixes_as_words:
205205
final_words = (
206-
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_prefix, is_structure=True)] if config.title_prefix else []) +
206+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_prefix, is_structure=True)] if config.header_prefix else []) +
207207
final_words +
208-
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_suffix, is_structure=True)] if config.title_suffix else [])
208+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_suffix, is_structure=True)] if config.header_suffix else [])
209209
)
210210
elif self.layout_type == LAYOUT_SECTION_HEADER:
211211
final_text, final_words = linearize_children(
@@ -217,9 +217,9 @@ def get_text_and_words(
217217
)
218218
if config.add_prefixes_and_suffixes_as_words:
219219
final_words = (
220-
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_prefix, is_structure=True)] if config.title_prefix else []) +
220+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_prefix, is_structure=True)] if config.section_header_prefix else []) +
221221
final_words +
222-
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_suffix, is_structure=True)] if config.title_suffix else [])
222+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_suffix, is_structure=True)] if config.section_header_suffix else [])
223223
)
224224
elif self.layout_type == LAYOUT_TEXT:
225225
final_text, final_words = linearize_children(
@@ -235,6 +235,29 @@ def get_text_and_words(
235235
is_layout_table=self.layout_type == LAYOUT_TABLE,
236236
)
237237

238+
if config.add_prefixes_and_suffixes_in_text:
239+
if self.layout_type == LAYOUT_TABLE:
240+
final_text = (
241+
config.table_layout_prefix + final_text + config.table_layout_suffix
242+
)
243+
elif self.layout_type == LAYOUT_KEY_VALUE:
244+
final_text = (
245+
# LAYOUT_KEY_VALUE text is wrapped with the key-value layout markers, not the table-layout ones.
config.key_value_layout_prefix + final_text + config.key_value_layout_suffix
246+
)
247+
if config.add_prefixes_and_suffixes_as_words:
248+
if self.layout_type == LAYOUT_TABLE:
249+
final_words = (
250+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.table_layout_prefix, is_structure=True)] if config.table_layout_prefix else []) +
251+
final_words +
252+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.table_layout_suffix, is_structure=True)] if config.table_layout_suffix else [])
253+
)
254+
elif self.layout_type == LAYOUT_KEY_VALUE:
255+
final_words = (
256+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.key_value_layout_prefix, is_structure=True)] if config.key_value_layout_prefix else []) +
257+
final_words +
258+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.key_value_layout_suffix, is_structure=True)] if config.key_value_layout_suffix else [])
259+
)
260+
238261
while (
239262
config.layout_element_separator * (config.max_number_of_consecutive_new_lines + 1) in final_text
240263
):

textractor/entities/table.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -716,6 +716,8 @@ def get_text_and_words(
716716
w.table_id = str(self.id)
717717
w.table_bbox = self.bbox
718718

719+
720+
text = (config.table_prefix if config.add_prefixes_and_suffixes_in_text else "")
719721
# Markdown
720722
if config.table_linearization_format == "markdown":
721723
df = self.to_pandas(
@@ -727,12 +729,11 @@ def get_text_and_words(
727729
headers = df.columns if has_column else []
728730
else:
729731
headers = df.columns
730-
text = df.to_markdown(
732+
text += df.to_markdown(
731733
index=False, tablefmt=config.table_tabulate_format, headers=headers
732734
)
733735
# Plaintext
734736
else:
735-
text = ""
736737
rows = itertools.groupby(self.table_cells, key=lambda cell: cell.row_index)
737738
processed_cells = set()
738739
for _, row in rows:
@@ -764,7 +765,7 @@ def get_text_and_words(
764765
text = text[:-1]
765766
text += (config.table_row_suffix if config.add_prefixes_and_suffixes_in_text else "")
766767
text += "\n"
767-
768+
text += (config.table_suffix if config.add_prefixes_and_suffixes_in_text else "")
768769
return text, words
769770

770771
def to_txt(self):

textractor/utils/text_utils.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,7 @@ def part_of_same_paragraph(element1, element2, config=config):
149149
for idx, element in enumerate(sorted_group):
150150
text_element, words_element = element.get_text_and_words(config)
151151
if "Table" in element.__class__.__name__ and len(words_element):
152-
result += (
153-
config.table_layout_prefix if config.add_prefixes_and_suffixes_in_text else ""
154-
) + text_element
152+
result += text_element
155153
for w in words_element:
156154
added_words.add(w.id)
157155
words_output += words_element
@@ -161,11 +159,7 @@ def part_of_same_paragraph(element1, element2, config=config):
161159
if prev_element and part_of_same_paragraph(prev_element, element, config) else
162160
config.same_layout_element_separator
163161
)
164-
result += separator + (
165-
(config.key_value_layout_prefix if config.add_prefixes_and_suffixes_in_text else "") +
166-
text_element +
167-
(config.key_value_layout_suffix if config.add_prefixes_and_suffixes_in_text else "")
168-
)
162+
result += separator + text_element
169163
for w in words_element:
170164
added_words.add(w.id)
171165
words_output += words_element

0 commit comments

Comments
 (0)