Skip to content

Commit b54ff08

Browse files
committed
add an additional test, make some tests work :)
1 parent 0f956ff commit b54ff08

File tree

3 files changed

+26
-6
lines changed

3 files changed

+26
-6
lines changed

analysis/pydoxtools_extraction_demo.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@
9999
# ## Do some layout document analysis
100100

101101
# %%
102-
page = 18
102+
page = 17
103103
pdf = pdx.Document(pdf_file, page_numbers=[page])
104104
vda.plot_box_layers(
105105
box_layers=[
@@ -197,7 +197,7 @@
197197
svg
198198

199199
# %%
200-
pdf = pydoxtools.Document("https://en.wikipedia.org/wiki/Rocket",
200+
pdf = pdx.Document("https://en.wikipedia.org/wiki/Rocket",
201201
spacy_model_size="lg", coreference_method="fast")
202202

203203
# %% [markdown]
@@ -207,7 +207,7 @@
207207
#pdf.x("graph_nodes",disk_cache=True)
208208

209209
# %%
210-
pdf = pydoxtools.Document("https://en.wikipedia.org/wiki/Rocket",
210+
pdf = pdx.Document("https://en.wikipedia.org/wiki/Rocket",
211211
spacy_model_size="lg", coreference_method="fast")
212212
KG = pdf.x("document_graph")
213213

pydoxtools/pdf_utils.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,8 @@ class PDFFileLoader(pydoxtools.operators_base.Operator):
197197
def __init__(
198198
self,
199199
# laparams=LAParams(detect_vertical=True, boxes_flow=-1.0, all_texts=False),
200-
laparams=LAParams(detect_vertical=False),
200+
# we set boxes_flow=None in order to prevent pdfminer from detecting layout...
201+
laparams=LAParams(detect_vertical=False, boxes_flow=None),
201202
**kwargs
202203
):
203204
"""

tests/test_extractor.py

+21-2
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ def test_qam_machine():
203203
))
204204
assert answers[0][0][0] == 'bst bat - 110'
205205
assert answers[1][0][0] == 'bst bat - 110'
206-
assert answers[2][0][0] == 'the bst'
206+
assert answers[2] == []
207207

208208

209209
def test_address_extraction():
@@ -679,6 +679,26 @@ def test_zero_shot_classifier():
679679
assert res[0]['encyclopdia article'] > res[0]['license']
680680

681681

682+
def test_pdf_speed_w_caching():
683+
doc = Document(make_path_absolute("./data/Doxcavator.pdf"))
684+
685+
start_time = time.time()
686+
# Assuming this is how you load and extract elements using your library
687+
x = doc.x("elements", disk_cache=False)
688+
end_time = time.time()
689+
690+
timer1 = end_time - start_time
691+
692+
x = doc.x("elements", disk_cache=True)
693+
start_time = time.time()
694+
# Assuming this is how you load and extract elements using your library
695+
x = doc.x("elements", disk_cache=True)
696+
end_time = time.time()
697+
698+
timer2 = end_time - start_time
699+
700+
assert timer1 > timer2
701+
682702
# TODO: write a test which checks if the output of all operators confirm to their type
683703

684704
if __name__ == "__main__":
@@ -690,7 +710,6 @@ def test_zero_shot_classifier():
690710
op_types = Document.operator_types()
691711
# doc.text_box_elements
692712
# test_zero_shot_classifier()
693-
# test_document_graph()
694713
test_typing()
695714

696715
# doc.text_box_elements

0 commit comments

Comments
 (0)