Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
## Coming up
The following changes are not yet released, but are code complete:

- Fix the `extract_from_pdf` function to return the correct return code
Changes:
- Check whether the text extracted from a PDF looks like valid, readable content #219

## Current

Expand Down
16 changes: 14 additions & 2 deletions doctor/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,20 @@ def make_pdftotext_process(path):
stderr=subprocess.DEVNULL,
)
content, err = process.communicate()
return content.decode(), err, process.returncode
text = content.decode()

# Calculate fraction of bytes in `content` that are non-text control characters
# (ASCII < 32) excluding common whitespace bytes (tab=9, newline=10, carriage return=13).
# If more than 30% are non-text, consider the output unreadable.
nontext_ratio = sum(
b < 32 and b not in (9, 10, 13) for b in content
) / max(len(content), 1)
unreadable = nontext_ratio > 0.3

if unreadable or not text.strip():
return "", b"pdftotext output looks unreadable", 1

return text, err, process.returncode


def rasterize_pdf(path, destination):
Expand Down Expand Up @@ -225,7 +238,6 @@ def extract_from_pdf(
if len(ocr_content) > len(content):
content = ocr_content
extracted_by_ocr = True
returncode = 0
elif content == "" or not success:
content = "Unable to extract document content."

Expand Down
Binary file added doctor/test_assets/unreadable-pdf.pdf
Binary file not shown.
18 changes: 18 additions & 0 deletions doctor/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,24 @@ def test_pdf_v2_ocr_extraction(self):
msg="Failed to extract by OCR",
)

def test_unreadable_pdf_extraction(self):
    """Post ``unreadable-pdf.pdf`` to the text extraction endpoint.

    The JSON response is expected to carry the
    "pdftotext output looks unreadable" error in ``err`` and an empty
    string in ``content``.
    """
    upload = make_file(filename="unreadable-pdf.pdf")
    response = requests.post(
        "http://doctor:5050/extract/doc/text/", files=upload, data={}
    )
    payload = response.json()
    text = payload["content"]

    # The error message is checked first, then the extracted content.
    self.assertEqual(
        "pdftotext output looks unreadable",
        payload["err"],
        msg="Wrong error message",
    )
    # On failure, the unexpected content itself is shown as the message.
    self.assertEqual(text, "", msg=text)

def test_docx_format(self):
files = make_file(filename="word-docx.docx")
params = {"ocr_available": False}
Expand Down