def ocr_needed(
    path: str, content: str, *, curve_threshold: int = 10
) -> bool:
    """Check if OCR is needed on a PDF

    OCR is requested when the extracted content is empty, when the PDF
    contains images, or when any single page has more than
    ``curve_threshold`` curves, which might indicate image-based content.

    :param path: The path to the PDF
    :param content: The content extracted from the PDF.
    :param curve_threshold: Number of curves a single page must exceed
        before the document is treated as image-based. Defaults to 10.
    :return: Whether OCR should be run on the document.
    """
    # Cheap checks first: when either one holds we return before the PDF
    # is ever opened with pdfplumber.
    if not content.strip() or pdf_has_images(path):
        return True

    # Imported at the point of use so this function carries its own
    # dependency and the early-return path above never needs it.
    import pdfplumber

    # Check for curves which can indicate image-based content. any() stops
    # at the first page over the threshold, so later pages are not parsed.
    with pdfplumber.open(path) as pdf:
        return any(len(page.curves) > curve_threshold for page in pdf.pages)