From 976309fdeb68f2a1d31171eb3dc4c1d9d53e841c Mon Sep 17 00:00:00 2001
From: Athanasios Oikonomou
Date: Mon, 20 Oct 2025 21:46:30 +0300
Subject: [PATCH] feat: migrate document_loaders from rapidocr_onnxruntime to rapidocr

The rapidocr_onnxruntime package is no longer actively maintained
(see https://github.com/RapidAI/RapidOCR/issues/579). This commit migrates
the document loader parsers to use the maintained rapidocr package.

Changes include:
- Replacing imports of rapidocr_onnxruntime with rapidocr
- Updating OCR result handling from tuple (result, _) to single
  RapidOCROutput object
- Using result.txts for text extraction
- Updating import error messages accordingly

This aligns the image and PDF parsers with the latest RapidOCR API.
---
 .../document_loaders/parsers/images.py | 12 ++++++------
 .../document_loaders/parsers/pdf.py | 15 +++++++--------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/parsers/images.py b/libs/community/langchain_community/document_loaders/parsers/images.py
index 1b4e1474a..b5ea5e2d4 100644
--- a/libs/community/langchain_community/document_loaders/parsers/images.py
+++ b/libs/community/langchain_community/document_loaders/parsers/images.py
@@ -99,18 +99,18 @@ def _analyze_image(self, img: "Image") -> str:
         """
         if not self.ocr:
             try:
-                from rapidocr_onnxruntime import RapidOCR
+                from rapidocr import RapidOCR
 
                 self.ocr = RapidOCR()
             except ImportError:
                 raise ImportError(
-                    "`rapidocr-onnxruntime` package not found, please install it with "
-                    "`pip install rapidocr-onnxruntime`"
+                    "`rapidocr` package not found, please install it with "
+                    "`pip install rapidocr`"
                 )
-        ocr_result, _ = self.ocr(np.array(img))  # type: ignore[misc]
+        ocr_result = self.ocr(np.array(img))  # type: ignore[misc]
         content = ""
-        if ocr_result:
-            content = ("\n".join([text[1] for text in ocr_result])).strip()
+        if ocr_result and ocr_result.txts:
+            content = ("\n".join(ocr_result.txts)).strip()
         return content
 
 
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index 4cdfa1b9f..bfea95b34 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -73,22 +73,21 @@ def extract_from_images_with_rapidocr(
         Text extracted from images.
 
     Raises:
-        ImportError: If `rapidocr-onnxruntime` package is not installed.
+        ImportError: If `rapidocr` package is not installed.
     """
     try:
-        from rapidocr_onnxruntime import RapidOCR
+        from rapidocr import RapidOCR
     except ImportError:
         raise ImportError(
-            "`rapidocr-onnxruntime` package not found, please install it with "
-            "`pip install rapidocr-onnxruntime`"
+            "`rapidocr` package not found, please install it with "
+            "`pip install rapidocr`"
         )
     ocr = RapidOCR()
     text = ""
     for img in images:
-        result, _ = ocr(img)
-        if result:
-            result = [text[1] for text in result]
-            text += "\n".join(result)
+        result = ocr(img)
+        if result and result.txts:
+            text += "\n".join(result.txts)
     return text
 
 