Skip to content

Commit

Permalink
Fix aggregate paragraphs (#6)
Browse files Browse the repository at this point in the history
* Fix the `aggregate_paragraphs` method to handle previously unhandled edge cases

* Rename the `type` attribute of the `Document` base model to `category`

---------

Co-authored-by: AnnaMarika01 <[email protected]>
  • Loading branch information
avvertix and AnnaMarika01 authored Sep 10, 2024
1 parent e189d84 commit d84570f
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
2 changes: 1 addition & 1 deletion text_extractor/models/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,5 @@ class Node(BaseModel):


class Document(BaseModel):
type: str = Field("doc")
category: str = Field("doc")
content: List[Node]
8 changes: 8 additions & 0 deletions text_extractor/parser/pdfact_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,11 @@ def aggregate_paragraphs(json_file):
fonts = json_file["fonts"]
colors = json_file["colors"]
i = 0

# Base case: if the document consists of only one paragraph, the method terminates and returns the unmodified JSON
if len(json_file["paragraphs"]) == 1:
return json_file

while i < len(json_file["paragraphs"][:-1]):
paragraph1 = json_file["paragraphs"][i]
paragraph2 = json_file["paragraphs"][i + 1]
Expand All @@ -148,6 +153,9 @@ def aggregate_paragraphs(json_file):
elif i + 2 == len(json_file["paragraphs"][:-1]):
output.append(json_file["paragraphs"][i + 2])
break
# If there is no paragraph following the (i+1)-th one, terminate
elif i + 2 > len(json_file["paragraphs"][:-1]):
break
else:
output.append(json_file["paragraphs"][i])

Expand Down

0 comments on commit d84570f

Please sign in to comment.