Skip to content

Commit

Permalink
Fix processing issues and improve error reporting (#8)
Browse files Browse the repository at this point in the history
* Refactor parsers, enhance error handling, and update dependencies

* Add URL authentication check before file download

* Refactor status check after handling auth requirements.

* Fix potential crash in PDF parser and update .gitignore

* Fix error when no headings found in `assign_heading_levels`

* Fix empty document error

* Remove unnecessary return statement from pdfact_parser

* Include additional logging details

---------

Co-authored-by: AnnaMarika01 <[email protected]>
Co-authored-by: Andrea Ponti <[email protected]>
  • Loading branch information
3 people authored Nov 20, 2024
1 parent 58e0c35 commit 915f0ba
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 55 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.idea
logs
.env
__pycache__
__pycache__
docker-compose.override.yml
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
pandas==2.0.2
pymupdf==1.22.5
pymupdf~=1.24.3
numpy~=1.24.3
requests==2.32.3
fastapi~=0.111.0
Expand Down
58 changes: 31 additions & 27 deletions text_extractor/parser/pdfact_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,21 @@ def __init__(self, url: str) -> None:

def parse(self, filename: str, **kwargs) -> Document:
body = {"url": filename}
unit = kwargs.get("unit", None)
roles = kwargs.get("roles", None)
if unit is not None:
body["unit"] = unit
body["unit"] = 'paragraph'
if roles is not None:
body["roles"] = roles
try:
response = requests.post(self.url, json=body)
response.raise_for_status()
res = response.json()
if unit == 'paragraph' or unit is None:
res = pdfact_formatter(res)
res = heading_filter(res)
res = pdfact_formatter(res)
res = heading_filter(res)
document = pdfact_to_document(res)
document = determine_heading_level(document)
return document
except RequestException as e:
logger.exception(f"An error occurred while trying to reach the API: {e}", exc_info=True)
logger.exception(f"PDFAct processing error: {e}", exc_info=True)
raise e


Expand Down Expand Up @@ -133,39 +130,40 @@ def pdfact_formatter(json_file):

def aggregate_paragraphs(json_file):
output = []
fonts = json_file["fonts"]
colors = json_file["colors"]
fonts = json_file.get("fonts", None) or []
colors = json_file.get("colors", None) or []
paragraphs = json_file.get("paragraphs", None) or []
i = 0

# Base case: if the document consists of only one paragraph, the method terminates and returns the unmodified JSON
if len(json_file["paragraphs"]) == 1:
if len(paragraphs) <= 1:
return json_file

while i < len(json_file["paragraphs"][:-1]):
paragraph1 = json_file["paragraphs"][i]
paragraph2 = json_file["paragraphs"][i + 1]
while i < len(paragraphs[:-1]):
paragraph1 = paragraphs[i]
paragraph2 = paragraphs[i + 1]

if compare_paragraphs(paragraph1, paragraph2):
paragraph = merge_pargraphs(paragraph1, paragraph2)
output.append(paragraph)

# After merging the two paragraphs, proceed to the paragraph following the (i+1)-th one
if i + 2 < len(json_file["paragraphs"][:-1]):
if i + 2 < len(paragraphs[:-1]):
i += 2
continue
# if the paragraph following the (i+1)-th one is the last one, then concatenate it
elif i + 2 == len(json_file["paragraphs"][:-1]):
output.append(json_file["paragraphs"][i + 2])
elif i + 2 == len(paragraphs[:-1]):
output.append(paragraphs[i + 2])
break
# If there is no paragraph following the (i+1)-th one, terminate
elif i + 2 > len(json_file["paragraphs"][:-1]):
elif i + 2 > len(paragraphs[:-1]):
break
else:
output.append(json_file["paragraphs"][i])
output.append(paragraphs[i])

# If the next paragraph is the last one, then concatenate it to the list of paragraphs
if i + 1 == len(json_file["paragraphs"][:-1]):
output.append(json_file["paragraphs"][i + 1])
if i + 1 == len(paragraphs[:-1]):
output.append(paragraphs[i + 1])
i += 1

paragraphs = {'fonts': fonts, 'paragraphs': output, 'colors': colors}
Expand Down Expand Up @@ -251,6 +249,7 @@ def determine_heading_level(document: Document) -> Document:
:return: The document with updated heading levels assigned to each heading node.
"""
heading_styles = []
largest_font_styles = []

for page in document.content:
for node in page.content:
Expand Down Expand Up @@ -283,10 +282,14 @@ def determine_heading_level(document: Document) -> Document:
# Sort the styles by font size in descending order
heading_styles = sorted(heading_styles, key=lambda x: x['font_size'], reverse=True)

largest_font_style = heading_styles[0] if heading_styles else None
if largest_font_style and largest_font_style['occurrences'] == 1:
heading_styles = heading_styles[1:]
# Assign levels to the sorted heading styles
for style in heading_styles:
if style['occurrences'] == 1:
largest_font_styles.append(style)
else:
break

heading_styles = [style for style in heading_styles if style not in largest_font_styles]

assigned_levels = assign_heading_levels(heading_styles)

for page in document.content:
Expand All @@ -302,9 +305,8 @@ def determine_heading_level(document: Document) -> Document:
font_size = mark.font.size

if font_name and font_size:
if (largest_font_style and
largest_font_style['font_name'] == font_name and
largest_font_style['font_size'] == font_size):
if any(style['font_name'] == font_name and style['font_size'] == font_size for style in
largest_font_styles):
node.category = "title"
else:
level = 4
Expand All @@ -329,6 +331,8 @@ def assign_heading_levels(heading_styles: List[Dict[str, Any]]) -> List[Dict[str
'font_size', and the assigned 'level' (from 1 to 4).
Level 1 is for the largest and level 4 is for the smallest.
"""
if len(heading_styles) == 0:
return []
# Count the number of occurrences for each font
font_count = Counter([font['font_name'] for font in heading_styles])

Expand Down
1 change: 0 additions & 1 deletion text_extractor_api/models/extract_text_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,4 @@ class ExtractTextRequest(BaseModel):
url: str
mime_type: str
driver: str
unit: Optional[str] = None
roles: Optional[List[str]] = None
77 changes: 52 additions & 25 deletions text_extractor_api/routers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,36 +44,63 @@ async def parse_pdf(request: ExtractTextRequest) -> Document:
extension = request.mime_type.split("/")[-1]
filename = f"{filename}.{extension}"
logger.info(f"Parsing {filename}")

file_path = os.path.join(resource_path, filename)

try:
resp = requests.get(request.url, allow_redirects=True, timeout=120)
resp.raise_for_status()
with open(file_path, 'wb') as f:
f.write(resp.content)
except HTTPError as http_err:
logger.exception("Error while downloading file.", exc_info=True)
raise HTTPException(status_code=400, detail=f"Error while downloading file [{http_err}]")
except Timeout as http_timeout:
logger.exception("Timeout while downloading file.", exc_info=True)
raise HTTPException(status_code=408, detail=f"File download not completed [{http_timeout}]")
head_resp = requests.head(request.url, allow_redirects=True, timeout=30)

try:
document = None
if request.driver.lower() == "pdfact":
if head_resp.status_code in [401, 403]:
logger.warning(f"Authentication required for URL: {request.url}")
raise HTTPException(
status_code=422,
detail=f"The provided file URL [{request.url}] requires authentication. "
"Authentication protected URLs are currently not supported."
)

head_resp.raise_for_status()
except (HTTPError, RequestException) as http_err:
logger.exception("Error while checking URL status.", exc_info=True)
raise HTTPException(status_code=400, detail=f"Error while checking URL status [{http_err}]")

document = None
if request.driver.lower() == "pdfact":
try:
logger.info(f"Forwarding request to PDFAct for {request.url}")
parser = PdfactParser(settings.pdfact_url)
document = parser.parse(filename=request.url, unit=request.unit, roles=request.roles)
elif request.driver.lower() == "pymupdf":
document = parser.parse(filename=request.url, roles=request.roles)
except RequestException as e:
if isinstance(e, HTTPError):
logger.exception(f"PDFAct returned an error for {request.url}", exc_info=True)
raise HTTPException(
status_code=e.response.status_code,
detail=f"Unexpected error while processing [{request.url}]: {e}"
)
logger.exception(f"Error while connecting to PDFAct or parsing file. {str(e)}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Unexpected error while processing [{request.url}]")
except Exception as err:
logger.exception(f"Error while parsing file. {str(err)}", exc_info=True)
raise HTTPException(status_code=400, detail="Error while parsing file")

elif request.driver.lower() == "pymupdf":
try:
resp = requests.get(request.url, allow_redirects=True, timeout=120)
resp.raise_for_status()
with open(file_path, 'wb') as f:
f.write(resp.content)
logger.info(f"Parsing {filename} with PyMuPDF")
parser = PymupdfParser()
document = parser.parse(filename=file_path)
except RequestException as e:
logger.exception(f"Error while connecting to pdfact. {str(e)}", exc_info=True)
raise HTTPException(status_code=503, detail="The pdfact service is not reachable")
except Exception as err:
logger.exception(f"Error while parsing file. {str(err)}", exc_info=True)
raise HTTPException(status_code=400, detail="Error while parsing file")
finally:
if os.path.exists(file_path):
os.remove(file_path)
except HTTPError as http_err:
logger.exception("Error while downloading file.", exc_info=True)
raise HTTPException(status_code=400, detail=f"Error while downloading file [{http_err}]")
except Timeout as http_timeout:
logger.exception("Timeout while downloading file.", exc_info=True)
raise HTTPException(status_code=408, detail=f"File download not completed [{http_timeout}]")
except Exception as err:
logger.exception(f"Error while parsing file with pymupdf. {str(err)}", exc_info=True)
raise HTTPException(status_code=400, detail="Error while parsing file")
finally:
if os.path.exists(file_path):
os.remove(file_path)
return document

0 comments on commit 915f0ba

Please sign in to comment.