diff --git a/CHANGELOG.md b/CHANGELOG.md index d40f1af..227619e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ ## Coming up The following changes are not yet released, but are code complete: - - Fix extract_from_pdf function to return correct returncode +Changes: + - Check whether text extracted from a PDF looks like valid, readable content #219 ## Current diff --git a/doctor/tasks.py b/doctor/tasks.py index cb55802..17da13c 100644 --- a/doctor/tasks.py +++ b/doctor/tasks.py @@ -86,7 +86,20 @@ def make_pdftotext_process(path): stderr=subprocess.DEVNULL, ) content, err = process.communicate() - return content.decode(), err, process.returncode + text = content.decode() + + # Calculate fraction of bytes in `content` that are non-text control characters + # (ASCII < 32) excluding common whitespace bytes (tab=9, newline=10, carriage return=13). + # If more than 30% are non-text, consider the output unreadable. + nontext_ratio = sum( + b < 32 and b not in (9, 10, 13) for b in content + ) / max(len(content), 1) + unreadable = nontext_ratio > 0.3 + + if unreadable or not text.strip(): + return "", b"pdftotext output looks unreadable", 1 + + return text, err, process.returncode def rasterize_pdf(path, destination): @@ -225,7 +238,6 @@ def extract_from_pdf( if len(ocr_content) > len(content): content = ocr_content extracted_by_ocr = True - returncode = 0 elif content == "" or not success: content = "Unable to extract document content." 
diff --git a/doctor/test_assets/unreadable-pdf.pdf b/doctor/test_assets/unreadable-pdf.pdf new file mode 100644 index 0000000..9b15d8b Binary files /dev/null and b/doctor/test_assets/unreadable-pdf.pdf differ diff --git a/doctor/tests.py b/doctor/tests.py index 8184c49..139df70 100644 --- a/doctor/tests.py +++ b/doctor/tests.py @@ -189,6 +189,24 @@ def test_pdf_v2_ocr_extraction(self): msg="Failed to extract by OCR", ) + def test_unreadable_pdf_extraction(self): + files = make_file(filename="unreadable-pdf.pdf") + data = {} + response = requests.post( + "http://doctor:5050/extract/doc/text/", files=files, data=data + ) + text = response.json()["content"] + self.assertEqual( + "pdftotext output looks unreadable", + response.json()["err"], + msg="Wrong error message", + ) + self.assertEqual( + text, + "", + msg=text, + ) + def test_docx_format(self): files = make_file(filename="word-docx.docx") params = {"ocr_available": False}