freelawproject · Luis-manzur · Jan 19, 2026 · Jan 19, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,9 @@
 ## Coming up
 The following changes are not yet released, but are code complete:
 
+Features:  
+ - Improved detection of PDFs that need OCR by also checking pages for curves, which may indicate image-based content
+
 ## Current
 
  - Migrated to async Django 6.0.1

diff --git a/doctor/lib/utils.py b/doctor/lib/utils.py
@@ -10,6 +10,7 @@
 from pathlib import Path
 from typing import Any
 
+import pdfplumber
 import six
 from PyPDF2 import PdfMerger
 from reportlab.pdfgen import canvas
@@ -324,13 +325,24 @@ def pdf_has_images(path: str) -> bool:
 def ocr_needed(path: str, content: str) -> bool:
     """Check if OCR is needed on a PDF
 
-    Check if images are in PDF or content is empty.
+    OCR is needed when the extracted content is empty, the PDF has images,
+    or any page has more than 10 curves (possible image-based content).
 
     :param path: The path to the PDF
     :param content: The content extracted from the PDF.
     :return: Whether OCR should be run on the document.
     """
-    return content.strip() == "" or pdf_has_images(path)
+
+    # Original checks first: empty extracted text or images in the PDF
+    if content.strip() == "" or pdf_has_images(path):
+        return True
+
+    # A page with over 10 curves counts as image-based; NOTE(review): confirm 10
+    with pdfplumber.open(path) as pdf:
+        for page in pdf.pages:
+            if len(page.curves) > 10:
+                return True
+    return False
 
 
 def make_page_with_text(page, data, h, w):