freelawproject · Luis-manzur · Oct 30, 2025 · Oct 30, 2025 · Oct 30, 2025 · Oct 31, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,8 @@
 ## Coming up
 The following changes are not yet released, but are code complete:
 
- - Fix extract_from_pdf function to return correct returncode
+Changes:
+ - Check whether text extracted from a PDF looks like valid, readable content #219
 
 ## Current
 

diff --git a/doctor/tasks.py b/doctor/tasks.py
@@ -86,7 +86,20 @@ def make_pdftotext_process(path):
         stderr=subprocess.DEVNULL,
     )
     content, err = process.communicate()
-    return content.decode(), err, process.returncode
+    text = content.decode()
+
+    # Calculate fraction of bytes in `content` that are non-text control characters
+    # (ASCII < 32) excluding common whitespace bytes (tab=9, newline=10, carriage return=13).
+    # If more than 30% are non-text, consider the output unreadable.
+    nontext_ratio = sum(
+        b < 32 and b not in (9, 10, 13) for b in content
+    ) / max(len(content), 1)
+    unreadable = nontext_ratio > 0.3
+
+    if unreadable or not text.strip():
+        return "", b"pdftotext output looks unreadable", 1
+
+    return text, err, process.returncode
 
 
 def rasterize_pdf(path, destination):
@@ -225,7 +238,6 @@ def extract_from_pdf(
                 if len(ocr_content) > len(content):
                     content = ocr_content
                     extracted_by_ocr = True
-                    returncode = 0
             elif content == "" or not success:
                 content = "Unable to extract document content."
 

diff --git a/doctor/test_assets/unreadable-pdf.pdf b/doctor/test_assets/unreadable-pdf.pdf
diff --git a/doctor/tests.py b/doctor/tests.py
@@ -189,6 +189,24 @@ def test_pdf_v2_ocr_extraction(self):
             msg="Failed to extract by OCR",
         )
 
+    def test_unreadable_pdf_extraction(self):  # expects an error and no text
+        files = make_file(filename="unreadable-pdf.pdf")
+        data = {}  # no extra form fields are sent with the upload
+        response = requests.post(
+            "http://doctor:5050/extract/doc/text/", files=files, data=data
+        )
+        text = response.json()["content"]
+        self.assertEqual(  # "err" must carry the unreadable-output message
+            "pdftotext output looks unreadable",
+            response.json()["err"],
+            msg="Wrong error message",
+        )
+        self.assertEqual(  # and the returned content must be the empty string
+            text,
+            "",
+            msg=text,
+        )
+
     def test_docx_format(self):
         files = make_file(filename="word-docx.docx")
         params = {"ocr_available": False}