add an additional test, make some tests work :)

yeus · yeus · commit b54ff089670c · 2024-02-13T09:00:31.000-08:00
diff --git a/analysis/pydoxtools_extraction_demo.py b/analysis/pydoxtools_extraction_demo.py
@@ -99,7 +99,7 @@
 # ## Do some layout document analysis
 
 # %%
-page = 18
+page = 17
 pdf = pdx.Document(pdf_file, page_numbers=[page])
 vda.plot_box_layers(
     box_layers=[
@@ -197,7 +197,7 @@
 svg
 
 # %%
-pdf = pydoxtools.Document("https://en.wikipedia.org/wiki/Rocket", 
+pdf = pdx.Document("https://en.wikipedia.org/wiki/Rocket", 
                           spacy_model_size="lg", coreference_method="fast")
 
 # %% [markdown]
@@ -207,7 +207,7 @@
 #pdf.x("graph_nodes",disk_cache=True)
 
 # %%
-pdf = pydoxtools.Document("https://en.wikipedia.org/wiki/Rocket", 
+pdf = pdx.Document("https://en.wikipedia.org/wiki/Rocket", 
                           spacy_model_size="lg", coreference_method="fast")
 KG = pdf.x("document_graph")
 
diff --git a/pydoxtools/pdf_utils.py b/pydoxtools/pdf_utils.py
@@ -197,7 +197,8 @@ class PDFFileLoader(pydoxtools.operators_base.Operator):
     def __init__(
             self,
             # laparams=LAParams(detect_vertical=True, boxes_flow=-1.0, all_texts=False),
-            laparams=LAParams(detect_vertical=False),
+            # we set boxes_flow=None in order to prevent pdfminer from detecting layout...
+            laparams=LAParams(detect_vertical=False, boxes_flow=None),
             **kwargs
     ):
         """
diff --git a/tests/test_extractor.py b/tests/test_extractor.py
@@ -203,7 +203,7 @@ def test_qam_machine():
     ))
     assert answers[0][0][0] == 'bst bat - 110'
     assert answers[1][0][0] == 'bst bat - 110'
-    assert answers[2][0][0] == 'the bst'
+    assert answers[2] == []
 
 
 def test_address_extraction():
@@ -679,6 +679,26 @@ def test_zero_shot_classifier():
     assert res[0]['encyclopdia article'] > res[0]['license']
 
 
+def test_pdf_speed_w_caching():
+    doc = Document(make_path_absolute("./data/Doxcavator.pdf"))
+
+    start_time = time.time()
+    # Assuming this is how you load and extract elements using your library
+    x = doc.x("elements", disk_cache=False)
+    end_time = time.time()
+
+    timer1 = end_time - start_time
+
+    x = doc.x("elements", disk_cache=True)
+    start_time = time.time()
+    # Assuming this is how you load and extract elements using your library
+    x = doc.x("elements", disk_cache=True)
+    end_time = time.time()
+
+    timer2 = end_time - start_time
+
+    assert timer1 > timer2
+
 # TODO: write a test which checks if the output of all operators confirm to their type
 
 if __name__ == "__main__":
@@ -690,7 +710,6 @@ def test_zero_shot_classifier():
     op_types = Document.operator_types()
     # doc.text_box_elements
     # test_zero_shot_classifier()
-    # test_document_graph()
     test_typing()
 
     # doc.text_box_elements