Fix aggregate paragraphs (#6)

* Fix the `aggregate_paragraphs` method to handle previously missing edge cases
* Rename the `type` attribute of the base model `Document` to `category`

---------

Co-authored-by: AnnaMarika01 <[email protected]>
OneOffTech · Sep 10, 2024 · d84570f · d84570f
1 parent e189d84
commit d84570f
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 1 deletion.
diff --git a/text_extractor/models/document.py b/text_extractor/models/document.py
@@ -36,5 +36,5 @@ class Node(BaseModel):
 
 
 class Document(BaseModel):
-    type: str = Field("doc")
+    category: str = Field("doc")
     content: List[Node]
diff --git a/text_extractor/parser/pdfact_parser.py b/text_extractor/parser/pdfact_parser.py
@@ -132,6 +132,11 @@ def aggregate_paragraphs(json_file):
     fonts = json_file["fonts"]
     colors = json_file["colors"]
     i = 0
+
+    # Base case: if the document consists of only one paragraph, the method terminates and returns the unmodified JSON
+    if len(json_file["paragraphs"]) == 1:
+        return json_file
+
     while i < len(json_file["paragraphs"][:-1]):
         paragraph1 = json_file["paragraphs"][i]
         paragraph2 = json_file["paragraphs"][i + 1]
@@ -148,6 +153,9 @@ def aggregate_paragraphs(json_file):
             elif i + 2 == len(json_file["paragraphs"][:-1]):
                 output.append(json_file["paragraphs"][i + 2])
                 break
+            # If there is no paragraph following the (i+1)-th one, terminate
+            elif i + 2 > len(json_file["paragraphs"][:-1]):
+                break
         else:
             output.append(json_file["paragraphs"][i])