Skip to content

Commit 76ce648

Browse files
committed
Add tests
1 parent 7c1a0ac commit 76ce648

File tree

1 file changed

+163
-0
lines changed

1 file changed

+163
-0
lines changed

tests/test_get_text_and_words.py

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from textractor import Textractor
1111
from textractor.data.constants import TextractFeatures
12+
from textractor.data.text_linearization_config import TextLinearizationConfig
1213
from textractor.entities.document import Document
1314
from textractor.exceptions import InvalidProfileNameError, S3FilePathMissing
1415
from textractor.utils.s3_utils import upload_to_s3, delete_from_s3
@@ -85,3 +86,165 @@ def test_detect_no_missing_words(self):
8586
word_ids = set([w.id for w in words])
8687

8788
self.assertTrue(original_word_set.issubset(word_ids))
89+
90+
def test_table_prefixes_and_suffixes_in_text(self):
    """Check that the configured prefix/suffix tags appear in the linearized text.

    When the ``CALL_TEXTRACT`` environment variable is set, the sample image
    is analyzed through ``self.extractor`` and the raw response is written to
    the fixture file; otherwise the document is loaded from that fixture.
    The document is then linearized with a ``TextLinearizationConfig`` that
    assigns an XML-like tag to every prefix/suffix option, and each tag
    listed below must occur as a substring of the returned text.
    """
    # NOTE(review): get_fixture_path() takes no arguments, so it presumably
    # derives the path from the calling test -- keep the call directly in
    # this method rather than in a helper. TODO confirm.
    if os.environ.get("CALL_TEXTRACT"):
        document = self.extractor.analyze_document(
            os.path.join(self.current_directory, "fixtures/single-page-1.png"),
            features=[
                TextractFeatures.LAYOUT,
                TextractFeatures.TABLES,
                TextractFeatures.FORMS,
                TextractFeatures.SIGNATURES,
            ],
        )
        # Refresh the stored fixture with the live response.
        with open(get_fixture_path(), "w") as f:
            json.dump(document.response, f)
    else:
        document = Document.open(get_fixture_path())

    config = TextLinearizationConfig(
        # Title layout elements
        title_prefix="<title>",
        title_suffix="</title>",
        # Table layout elements
        table_layout_prefix="<table_layout>",
        table_layout_suffix="</table_layout>",
        # Tables, rows, cells and header cells
        table_prefix="<table>",
        table_suffix="</table>",
        table_row_prefix="<tr>",
        table_row_suffix="</tr>",
        table_cell_prefix="<td>",
        table_cell_suffix="</td>",
        table_cell_header_prefix="<th>",
        table_cell_header_suffix="</th>",
        # Header and section header layout elements
        header_prefix="<header>",
        header_suffix="</header>",
        section_header_prefix="<section_header>",
        section_header_suffix="</section_header>",
        # Text layout elements
        text_prefix="<text>",
        text_suffix="</text>",
        # Key-value layout elements (not the individual key-value elements)
        key_value_layout_prefix="<kv_layout>",
        key_value_layout_suffix="</kv_layout>",
        # Individual key-value elements and their keys / values
        key_value_prefix="<kv>",
        key_value_suffix="</kv>",
        key_prefix="<key>",
        key_suffix="</key>",
        value_prefix="<value>",
        value_suffix="</value>",
        add_prefixes_and_suffixes_in_text=True,
        add_prefixes_and_suffixes_as_words=True,
    )

    text, _ = document.get_text_and_words(config)

    # Tags deliberately not checked:
    #   <th>/</th>               - the sample does not have header cells
    #   <header>/</header>       - the sample does not have a header
    #   <kv_layout>/</kv_layout> - presumably the sample has no key-value
    #                              layout element (the original note here was
    #                              copied from the header case) -- TODO confirm
    expected_tokens = (
        "<title>",
        "</title>",
        "<table>",
        "</table>",
        "<table_layout>",
        "</table_layout>",
        "<tr>",
        "</tr>",
        "<td>",
        "</td>",
        "<section_header>",
        "</section_header>",
        "<kv>",
        "</kv>",
        "<key>",
        "</key>",
        "<value>",
        "</value>",
    )
    for token in expected_tokens:
        # assertIn reports both operands on failure, unlike assertTrue.
        self.assertIn(token, text, f"{token} is not in text")
169+
170+
def test_table_prefixes_and_suffixes_in_words(self):
    """Check that the configured prefix/suffix tags are emitted as words.

    When the ``CALL_TEXTRACT`` environment variable is set, the sample image
    is analyzed through ``self.extractor`` and the raw response is written to
    the fixture file; otherwise the document is loaded from that fixture.
    The document is then linearized with a ``TextLinearizationConfig`` that
    assigns an XML-like tag to every prefix/suffix option and sets
    ``add_prefixes_and_suffixes_as_words=True``. Each tag listed below must
    equal the ``text`` of at least one returned word.
    """
    # NOTE(review): get_fixture_path() takes no arguments, so it presumably
    # derives the path from the calling test -- keep the call directly in
    # this method rather than in a helper. TODO confirm.
    if os.environ.get("CALL_TEXTRACT"):
        document = self.extractor.analyze_document(
            os.path.join(self.current_directory, "fixtures/single-page-1.png"),
            features=[
                TextractFeatures.LAYOUT,
                TextractFeatures.TABLES,
                TextractFeatures.FORMS,
                TextractFeatures.SIGNATURES,
            ],
        )
        # Refresh the stored fixture with the live response.
        with open(get_fixture_path(), "w") as f:
            json.dump(document.response, f)
    else:
        document = Document.open(get_fixture_path())

    config = TextLinearizationConfig(
        # Title layout elements
        title_prefix="<title>",
        title_suffix="</title>",
        # Table layout elements
        table_layout_prefix="<table_layout>",
        table_layout_suffix="</table_layout>",
        # Tables, rows, cells and header cells
        table_prefix="<table>",
        table_suffix="</table>",
        table_row_prefix="<tr>",
        table_row_suffix="</tr>",
        table_cell_prefix="<td>",
        table_cell_suffix="</td>",
        table_cell_header_prefix="<th>",
        table_cell_header_suffix="</th>",
        # Header and section header layout elements
        header_prefix="<header>",
        header_suffix="</header>",
        section_header_prefix="<section_header>",
        section_header_suffix="</section_header>",
        # Text layout elements
        text_prefix="<text>",
        text_suffix="</text>",
        # Key-value layout elements (not the individual key-value elements)
        key_value_layout_prefix="<kv_layout>",
        key_value_layout_suffix="</kv_layout>",
        # Individual key-value elements and their keys / values
        key_value_prefix="<kv>",
        key_value_suffix="</kv>",
        key_prefix="<key>",
        key_suffix="</key>",
        value_prefix="<value>",
        value_suffix="</value>",
        add_prefixes_and_suffixes_in_text=True,
        add_prefixes_and_suffixes_as_words=True,
    )

    _, words = document.get_text_and_words(config)

    # Compare against the words' text; a separate name avoids rebinding
    # `words` from word objects to strings.
    word_texts = {w.text for w in words}

    # Tags deliberately not checked:
    #   <th>/</th>               - the sample does not have header cells
    #   <header>/</header>       - the sample does not have a header
    #   <kv_layout>/</kv_layout> - presumably the sample has no key-value
    #                              layout element (the original note here was
    #                              copied from the header case) -- TODO confirm
    expected_tokens = (
        "<title>",
        "</title>",
        "<table>",
        "</table>",
        "<table_layout>",
        "</table_layout>",
        "<tr>",
        "</tr>",
        "<td>",
        "</td>",
        "<section_header>",
        "</section_header>",
        "<kv>",
        "</kv>",
        "<key>",
        "</key>",
        "<value>",
        "</value>",
    )
    for token in expected_tokens:
        # The message names the words list, which is what this test checks
        # (it previously said "text").
        self.assertIn(token, word_texts, f"{token} is not in words")

0 commit comments

Comments
 (0)