Skip to content

Commit 7c1a0ac

Browse files
committed
Fix LAYOUT_TABLE prefix and suffix not being properly added
1 parent 336f655 commit 7c1a0ac

File tree

3 files changed

+33
-15
lines changed

3 files changed

+33
-15
lines changed

textractor/entities/layout.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -203,9 +203,9 @@ def get_text_and_words(
203203
)
204204
if config.add_prefixes_and_suffixes_as_words:
205205
final_words = (
206-
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_prefix, is_structure=True)] if config.title_prefix else []) +
206+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_prefix, is_structure=True)] if config.header_prefix else []) +
207207
final_words +
208-
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_suffix, is_structure=True)] if config.title_suffix else [])
208+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.header_suffix, is_structure=True)] if config.header_suffix else [])
209209
)
210210
elif self.layout_type == LAYOUT_SECTION_HEADER:
211211
final_text, final_words = linearize_children(
@@ -217,9 +217,9 @@ def get_text_and_words(
217217
)
218218
if config.add_prefixes_and_suffixes_as_words:
219219
final_words = (
220-
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_prefix, is_structure=True)] if config.title_prefix else []) +
220+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_prefix, is_structure=True)] if config.section_header_prefix else []) +
221221
final_words +
222-
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_suffix, is_structure=True)] if config.title_suffix else [])
222+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.section_header_suffix, is_structure=True)] if config.section_header_suffix else [])
223223
)
224224
elif self.layout_type == LAYOUT_TEXT:
225225
final_text, final_words = linearize_children(
@@ -235,6 +235,29 @@ def get_text_and_words(
235235
is_layout_table=self.layout_type == LAYOUT_TABLE,
236236
)
237237

238+
if config.add_prefixes_and_suffixes_in_text:
239+
if self.layout_type == LAYOUT_TABLE:
240+
final_text = (
241+
config.table_layout_prefix + final_text + config.table_layout_suffix
242+
)
243+
elif self.layout_type == LAYOUT_KEY_VALUE:
244+
final_text = (
245+
config.key_value_layout_prefix + final_text + config.key_value_layout_suffix
246+
)
247+
if config.add_prefixes_and_suffixes_as_words:
248+
if self.layout_type == LAYOUT_TABLE:
249+
final_words = (
250+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.table_layout_prefix, is_structure=True)] if config.table_layout_prefix else []) +
251+
final_words +
252+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.table_layout_suffix, is_structure=True)] if config.table_layout_suffix else [])
253+
)
254+
elif self.layout_type == LAYOUT_KEY_VALUE:
255+
final_words = (
256+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.key_value_layout_prefix, is_structure=True)] if config.key_value_layout_prefix else []) +
257+
final_words +
258+
([Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(final_words), config.key_value_layout_suffix, is_structure=True)] if config.key_value_layout_suffix else [])
259+
)
260+
238261
while (
239262
config.layout_element_separator * (config.max_number_of_consecutive_new_lines + 1) in final_text
240263
):

textractor/entities/table.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -716,6 +716,8 @@ def get_text_and_words(
716716
w.table_id = str(self.id)
717717
w.table_bbox = self.bbox
718718

719+
720+
text = (config.table_prefix if config.add_prefixes_and_suffixes_in_text else "")
719721
# Markdown
720722
if config.table_linearization_format == "markdown":
721723
df = self.to_pandas(
@@ -727,12 +729,11 @@ def get_text_and_words(
727729
headers = df.columns if has_column else []
728730
else:
729731
headers = df.columns
730-
text = df.to_markdown(
732+
text += df.to_markdown(
731733
index=False, tablefmt=config.table_tabulate_format, headers=headers
732734
)
733735
# Plaintext
734736
else:
735-
text = ""
736737
rows = itertools.groupby(self.table_cells, key=lambda cell: cell.row_index)
737738
processed_cells = set()
738739
for _, row in rows:
@@ -764,7 +765,7 @@ def get_text_and_words(
764765
text = text[:-1]
765766
text += (config.table_row_suffix if config.add_prefixes_and_suffixes_in_text else "")
766767
text += "\n"
767-
768+
text += (config.table_suffix if config.add_prefixes_and_suffixes_in_text else "")
768769
return text, words
769770

770771
def to_txt(self):

textractor/utils/text_utils.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,7 @@ def part_of_same_paragraph(element1, element2, config=config):
149149
for idx, element in enumerate(sorted_group):
150150
text_element, words_element = element.get_text_and_words(config)
151151
if "Table" in element.__class__.__name__ and len(words_element):
152-
result += (
153-
config.table_layout_prefix if config.add_prefixes_and_suffixes_in_text else ""
154-
) + text_element
152+
result += text_element
155153
for w in words_element:
156154
added_words.add(w.id)
157155
words_output += words_element
@@ -161,11 +159,7 @@ def part_of_same_paragraph(element1, element2, config=config):
161159
if prev_element and part_of_same_paragraph(prev_element, element, config) else
162160
config.same_layout_element_separator
163161
)
164-
result += separator + (
165-
(config.key_value_layout_prefix if config.add_prefixes_and_suffixes_in_text else "") +
166-
text_element +
167-
(config.key_value_layout_suffix if config.add_prefixes_and_suffixes_in_text else "")
168-
)
162+
result += separator + text_element
169163
for w in words_element:
170164
added_words.add(w.id)
171165
words_output += words_element

0 commit comments

Comments
 (0)