Fix processing issues and improve error reporting (#8)

* Refactor parsers, enhance error handling, and update dependencies * Add URL authentication check before file download * Refactor status check after handling auth requirements. * Fix potential crash in PDF parser and update .gitignore * Fix error when no headings found in `assign_heading_levels` * Fix empty document error * Remove unnecessary return statement from pdfact_parser * Include additional logging details --------- Co-authored-by: AnnaMarika01 <[email protected]> Co-authored-by: Andrea Ponti <[email protected]>
OneOffTech · Nov 20, 2024 · 915f0ba · 915f0ba
1 parent 58e0c35
commit 915f0ba
Show file tree

Hide file tree

Showing 5 changed files with 86 additions and 55 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 .idea
 logs
 .env
-__pycache__
+__pycache__
+docker-compose.override.yml
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 pandas==2.0.2
-pymupdf==1.22.5
+pymupdf~=1.24.3
 numpy~=1.24.3
 requests==2.32.3
 fastapi~=0.111.0

diff --git a/text_extractor/parser/pdfact_parser.py b/text_extractor/parser/pdfact_parser.py
@@ -20,24 +20,21 @@ def __init__(self, url: str) -> None:
 
     def parse(self, filename: str, **kwargs) -> Document:
         body = {"url": filename}
-        unit = kwargs.get("unit", None)
         roles = kwargs.get("roles", None)
-        if unit is not None:
-            body["unit"] = unit
+        body["unit"] = 'paragraph'
         if roles is not None:
             body["roles"] = roles
         try:
             response = requests.post(self.url, json=body)
             response.raise_for_status()
             res = response.json()
-            if unit == 'paragraph' or unit is None:
-                res = pdfact_formatter(res)
-                res = heading_filter(res)
+            res = pdfact_formatter(res)
+            res = heading_filter(res)
             document = pdfact_to_document(res)
             document = determine_heading_level(document)
             return document
         except RequestException as e:
-            logger.exception(f"An error occurred while trying to reach the API: {e}", exc_info=True)
+            logger.exception(f"PDFAct processing error: {e}", exc_info=True)
             raise e
 
 
@@ -133,39 +130,40 @@ def pdfact_formatter(json_file):
 
 def aggregate_paragraphs(json_file):
     output = []
-    fonts = json_file["fonts"]
-    colors = json_file["colors"]
+    fonts = json_file.get("fonts", None) or []
+    colors = json_file.get("colors", None) or []
+    paragraphs = json_file.get("paragraphs", None) or []
     i = 0
 
     # Base case: if the document consists of only one paragraph, the method terminates and returns the unmodified JSON
-    if len(json_file["paragraphs"]) == 1:
+    if len(paragraphs) <= 1:
         return json_file
 
-    while i < len(json_file["paragraphs"][:-1]):
-        paragraph1 = json_file["paragraphs"][i]
-        paragraph2 = json_file["paragraphs"][i + 1]
+    while i < len(paragraphs[:-1]):
+        paragraph1 = paragraphs[i]
+        paragraph2 = paragraphs[i + 1]
 
         if compare_paragraphs(paragraph1, paragraph2):
             paragraph = merge_pargraphs(paragraph1, paragraph2)
             output.append(paragraph)
 
             # After merging the two paragraphs, proceed to the paragraph following the (i+1)-th one
-            if i + 2 < len(json_file["paragraphs"][:-1]):
+            if i + 2 < len(paragraphs[:-1]):
                 i += 2
                 continue
             # if the paragraph following the (i+1)-th one is the last one, then concatenate it
-            elif i + 2 == len(json_file["paragraphs"][:-1]):
-                output.append(json_file["paragraphs"][i + 2])
+            elif i + 2 == len(paragraphs[:-1]):
+                output.append(paragraphs[i + 2])
                 break
             # If there is no paragraph following the (i+1)-th one, terminate
-            elif i + 2 > len(json_file["paragraphs"][:-1]):
+            elif i + 2 > len(paragraphs[:-1]):
                 break
         else:
-            output.append(json_file["paragraphs"][i])
+            output.append(paragraphs[i])
 
             # If the next paragraph is the last one, then concatenate it to the list of paragraphs
-            if i + 1 == len(json_file["paragraphs"][:-1]):
-                output.append(json_file["paragraphs"][i + 1])
+            if i + 1 == len(paragraphs[:-1]):
+                output.append(paragraphs[i + 1])
         i += 1
 
     paragraphs = {'fonts': fonts, 'paragraphs': output, 'colors': colors}
@@ -251,6 +249,7 @@ def determine_heading_level(document: Document) -> Document:
     :return: The document with updated heading levels assigned to each heading node.
     """
     heading_styles = []
+    largest_font_styles = []
 
     for page in document.content:
         for node in page.content:
@@ -283,10 +282,14 @@ def determine_heading_level(document: Document) -> Document:
     # Sort the styles by font size in descending order
     heading_styles = sorted(heading_styles, key=lambda x: x['font_size'], reverse=True)
 
-    largest_font_style = heading_styles[0] if heading_styles else None
-    if largest_font_style and largest_font_style['occurrences'] == 1:
-        heading_styles = heading_styles[1:]
-    # Assign levels to the sorted heading styles
+    for style in heading_styles:
+        if style['occurrences'] == 1:
+            largest_font_styles.append(style)
+        else:
+            break
+
+    heading_styles = [style for style in heading_styles if style not in largest_font_styles]
+
     assigned_levels = assign_heading_levels(heading_styles)
 
     for page in document.content:
@@ -302,9 +305,8 @@ def determine_heading_level(document: Document) -> Document:
                         font_size = mark.font.size
 
                 if font_name and font_size:
-                    if (largest_font_style and
-                            largest_font_style['font_name'] == font_name and
-                            largest_font_style['font_size'] == font_size):
+                    if any(style['font_name'] == font_name and style['font_size'] == font_size for style in
+                           largest_font_styles):
                         node.category = "title"
                     else:
                         level = 4
@@ -329,6 +331,8 @@ def assign_heading_levels(heading_styles: List[Dict[str, Any]]) -> List[Dict[str
             'font_size', and the assigned 'level' (from 1 to 4).
             Level 1 is for the largest and level 4 is for the smallest.
     """
+    if len(heading_styles) == 0:
+        return []
     # Count the number of occurrences for each font
     font_count = Counter([font['font_name'] for font in heading_styles])
 

diff --git a/text_extractor_api/models/extract_text_request.py b/text_extractor_api/models/extract_text_request.py
@@ -7,5 +7,4 @@ class ExtractTextRequest(BaseModel):
     url: str
     mime_type: str
     driver: str
-    unit: Optional[str] = None
     roles: Optional[List[str]] = None
diff --git a/text_extractor_api/routers/parser.py b/text_extractor_api/routers/parser.py
@@ -44,36 +44,63 @@ async def parse_pdf(request: ExtractTextRequest) -> Document:
     extension = request.mime_type.split("/")[-1]
     filename = f"{filename}.{extension}"
     logger.info(f"Parsing {filename}")
-
     file_path = os.path.join(resource_path, filename)
 
     try:
-        resp = requests.get(request.url, allow_redirects=True, timeout=120)
-        resp.raise_for_status()
-        with open(file_path, 'wb') as f:
-            f.write(resp.content)
-    except HTTPError as http_err:
-        logger.exception("Error while downloading file.", exc_info=True)
-        raise HTTPException(status_code=400, detail=f"Error while downloading file [{http_err}]")
-    except Timeout as http_timeout:
-        logger.exception("Timeout while downloading file.", exc_info=True)
-        raise HTTPException(status_code=408, detail=f"File download not completed [{http_timeout}]")
+        head_resp = requests.head(request.url, allow_redirects=True, timeout=30)
 
-    try:
-        document = None
-        if request.driver.lower() == "pdfact":
+        if head_resp.status_code in [401, 403]:
+            logger.warning(f"Authentication required for URL: {request.url}")
+            raise HTTPException(
+                status_code=422,
+                detail=f"The provided file URL [{request.url}] requires authentication. "
+                       "Authentication protected URLs are currently not supported."
+            )
+
+        head_resp.raise_for_status()
+    except (HTTPError, RequestException) as http_err:
+        logger.exception("Error while checking URL status.", exc_info=True)
+        raise HTTPException(status_code=400, detail=f"Error while checking URL status [{http_err}]")
+
+    document = None
+    if request.driver.lower() == "pdfact":
+        try:
+            logger.info(f"Forwarding request to PDFAct for {request.url}")
             parser = PdfactParser(settings.pdfact_url)
-            document = parser.parse(filename=request.url, unit=request.unit, roles=request.roles)
-        elif request.driver.lower() == "pymupdf":
+            document = parser.parse(filename=request.url, roles=request.roles)
+        except RequestException as e:
+            if isinstance(e, HTTPError):
+                logger.exception(f"PDFAct returned an error for {request.url}", exc_info=True)
+                raise HTTPException(
+                    status_code=e.response.status_code,
+                    detail=f"Unexpected error while processing [{request.url}]: {e}"
+                )
+            logger.exception(f"Error while connecting to PDFAct or parsing file. {str(e)}", exc_info=True)
+            raise HTTPException(status_code=500, detail=f"Unexpected error while processing [{request.url}]")
+        except Exception as err:
+            logger.exception(f"Error while parsing file. {str(err)}", exc_info=True)
+            raise HTTPException(status_code=400, detail="Error while parsing file")
+
+    elif request.driver.lower() == "pymupdf":
+        try:
+            resp = requests.get(request.url, allow_redirects=True, timeout=120)
+            resp.raise_for_status()
+            with open(file_path, 'wb') as f:
+                f.write(resp.content)
+            logger.info(f"Parsing {filename} with PyMuPDF")
             parser = PymupdfParser()
             document = parser.parse(filename=file_path)
-    except RequestException as e:
-        logger.exception(f"Error while connecting to pdfact. {str(e)}", exc_info=True)
-        raise HTTPException(status_code=503, detail="The pdfact service is not reachable")
-    except Exception as err:
-        logger.exception(f"Error while parsing file. {str(err)}", exc_info=True)
-        raise HTTPException(status_code=400, detail="Error while parsing file")
-    finally:
-        if os.path.exists(file_path):
-            os.remove(file_path)
+        except HTTPError as http_err:
+            logger.exception("Error while downloading file.", exc_info=True)
+            raise HTTPException(status_code=400, detail=f"Error while downloading file [{http_err}]")
+        except Timeout as http_timeout:
+            logger.exception("Timeout while downloading file.", exc_info=True)
+            raise HTTPException(status_code=408, detail=f"File download not completed [{http_timeout}]")
+        except Exception as err:
+            logger.exception(f"Error while parsing file with pymupdf. {str(err)}", exc_info=True)
+            raise HTTPException(status_code=400, detail="Error while parsing file")
+        finally:
+            if os.path.exists(file_path):
+                os.remove(file_path)
     return document
+