diff --git a/README.md b/README.md index 6535f108a..ad8368ab7 100644 --- a/README.md +++ b/README.md @@ -336,7 +336,7 @@ async def main(): # Create RAGAnything configuration config = RAGAnythingConfig( working_dir="./rag_storage", - parser="mineru", # Parser selection: mineru or docling + parser="mineru", # Parser selection: mineru, docling, or paddleocr parse_method="auto", # Parse method: auto, ocr, or txt enable_image_processing=True, enable_table_processing=True, @@ -1047,7 +1047,7 @@ Create a `.env` file (refer to `.env.example`): OPENAI_API_KEY=your_openai_api_key OPENAI_BASE_URL=your_base_url # Optional OUTPUT_DIR=./output # Default output directory for parsed documents -PARSER=mineru # Parser selection: mineru or docling +PARSER=mineru # Parser selection: mineru, docling, or paddleocr PARSE_METHOD=auto # Parse method: auto, ocr, or txt ``` @@ -1070,6 +1070,21 @@ RAGAnything now supports multiple parsers, each with specific advantages: - Better document structure preservation - Native support for multiple Office formats +#### PaddleOCR Parser +- OCR-focused parser for images and PDFs +- Produces text blocks compatible with existing `content_list` processing +- Supports optional Office/TXT/MD parsing by converting to PDF first + +Install PaddleOCR parser extras: + +```bash +pip install -e ".[paddleocr]" +# or +uv sync --extra paddleocr +``` + +> **Note**: PaddleOCR also requires `paddlepaddle` (CPU/GPU package varies by platform). Install it with the official guide: https://www.paddlepaddle.org.cn/install/quick + ### MinerU Configuration ```bash @@ -1091,7 +1106,7 @@ await rag.process_document_complete( file_path="document.pdf", output_dir="./output/", parse_method="auto", # or "ocr", "txt" - parser="mineru" # Optional: "mineru" or "docling" + parser="mineru" # Optional: "mineru", "docling", or "paddleocr" ) # Advanced parsing configuration with special parameters @@ -1099,7 +1114,7 @@ await rag.process_document_complete( file_path="document.pdf", output_dir="./output/", parse_method="auto", # Parsing method: "auto", "ocr", "txt" - parser="mineru", # Parser selection: "mineru" or "docling" + parser="mineru", # Parser selection: "mineru", "docling", or "paddleocr" # MinerU special parameters - all supported kwargs: lang="ch", # Document language for OCR optimization (e.g., "ch", "en", "ja") @@ -1119,7 +1134,7 @@ await rag.process_document_complete( ) ``` -> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments. RAG-Anything now supports multiple document parsers - you can choose between MinerU and Docling based on your needs. +> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments. RAG-Anything supports multiple document parsers, including MinerU, Docling, and PaddleOCR. ### Processing Requirements @@ -1128,6 +1143,7 @@ Different content types require specific optional dependencies: - **Office Documents** (.doc, .docx, .ppt, .pptx, .xls, .xlsx): Install [LibreOffice](https://www.libreoffice.org/download/download/) - **Extended Image Formats** (.bmp, .tiff, .gif, .webp): Install with `pip install raganything[image]` - **Text Files** (.txt, .md): Install with `pip install raganything[text]` +- **PaddleOCR Parser** (`parser="paddleocr"`): Install with `pip install raganything[paddleocr]`, then install `paddlepaddle` for your platform > **📋 Quick Install**: Use `pip install raganything[all]` to enable all format support (Python dependencies only - LibreOffice still needs separate installation) diff --git a/docs/batch_processing.md b/docs/batch_processing.md index fe51ab146..4005e689d 100644 --- a/docs/batch_processing.md +++ b/docs/batch_processing.md @@ -24,6 +24,9 @@ pip install raganything[all] # Required for batch processing pip install tqdm + +# Optional for parser='paddleocr' +pip install raganything[paddleocr] ``` ## Usage @@ -35,7 +38,7 @@ from raganything.batch_parser import BatchParser # Create batch parser batch_parser = BatchParser( - parser_type="mineru", # or "docling" + parser_type="mineru", # or "docling" or "paddleocr" max_workers=4, show_progress=True, timeout_per_file=300, @@ -123,6 +126,7 @@ python -m raganything.batch_parser examples/sample_docs/ --output ./output --wor # With specific parser python -m raganything.batch_parser examples/sample_docs/ --parser mineru --method auto +python -m raganything.batch_parser examples/sample_docs/ --parser paddleocr --method ocr # Without progress bar python -m raganything.batch_parser examples/sample_docs/ --output ./output --no-progress @@ -148,7 +152,7 @@ PARSER_OUTPUT_DIR=./parsed_output ### BatchParser Parameters -- **parser_type**: `"mineru"` or `"docling"` (default: `"mineru"`) +- **parser_type**: `"mineru"`, `"docling"`, or `"paddleocr"` (default: `"mineru"`) - **max_workers**: Number of parallel workers (default: `4`) - **show_progress**: Show progress bar (default: `True`) - **timeout_per_file**: Timeout per file in seconds (default: `300`) diff --git a/env.example b/env.example index 7e857166b..e6967e8c7 100644 --- a/env.example +++ b/env.example @@ -42,7 +42,7 @@ OLLAMA_EMULATING_MODEL_TAG=latest ### Parser Configuration # PARSE_METHOD=auto # OUTPUT_DIR=./output -# PARSER=mineru +# PARSER=mineru # Options: mineru, docling, paddleocr # DISPLAY_CONTENT_STATS=true ### Multimodal Processing Configuration diff --git a/examples/batch_dry_run_example.py b/examples/batch_dry_run_example.py index 707ab5668..35d229f6d 100644 --- a/examples/batch_dry_run_example.py +++ b/examples/batch_dry_run_example.py @@ -7,6 +7,7 @@ - pip install: python examples/batch_dry_run_example.py examples/sample_docs --parser mineru python examples/batch_dry_run_example.py examples/sample_docs/projects examples/sample_docs/web --parser docling + python examples/batch_dry_run_example.py examples/sample_docs --parser paddleocr - uv install: uv run python examples/batch_dry_run_example.py examples/sample_docs --parser mineru --recursive uv run python examples/batch_dry_run_example.py examples/sample_docs --parser mineru --no-recursive @@ -22,7 +23,7 @@ def main() -> int: parser.add_argument("paths", nargs="+", help="File paths or directories to scan") parser.add_argument( "--parser", - choices=["mineru", "docling"], + choices=["mineru", "docling", "paddleocr"], default="mineru", help="Parser to use for file-type support", ) diff --git a/examples/raganything_example.py b/examples/raganything_example.py index 5c22eeadc..c5e8e9e89 100644 --- a/examples/raganything_example.py +++ b/examples/raganything_example.py @@ -1,9 +1,9 @@ #!/usr/bin/env python """ -Example script demonstrating the integration of MinerU parser with RAGAnything +Example script demonstrating parser integration with RAGAnything This example shows how to: -1. Process documents with RAGAnything using MinerU parser +1. Process documents with RAGAnything using configurable parsers 2. Perform pure text queries using aquery() method 3. Perform multimodal queries with specific multimodal content using aquery_with_multimodal() method 4. Handle different types of multimodal content (tables, equations) in queries @@ -108,7 +108,7 @@ async def process_with_rag( # Create RAGAnything configuration config = RAGAnythingConfig( working_dir=working_dir or "./rag_storage", - parser=parser, # Parser selection: mineru or docling + parser=parser, # Parser selection: mineru, docling, or paddleocr parse_method="auto", # Parse method: auto, ocr, or txt enable_image_processing=True, enable_table_processing=True, @@ -289,7 +289,8 @@ def main(): parser.add_argument( "--parser", default=os.getenv("PARSER", "mineru"), - help="Optional base URL for API", + choices=["mineru", "docling", "paddleocr"], + help="Parser selection", ) args = parser.parse_args() diff --git a/pyproject.toml b/pyproject.toml index b8847d8d2..e612fa4e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,10 @@ dependencies = [ image = ["Pillow>=10.0.0"] text = ["reportlab>=4.0.0"] office = [] # Requires LibreOffice (external program) +paddleocr = [ + "paddleocr>=2.7.0", + "pypdfium2>=4.25.0", +] markdown = [ "markdown>=3.4.0", "weasyprint>=60.0", @@ -39,9 +43,11 @@ markdown = [ all = [ "Pillow>=10.0.0", "reportlab>=4.0.0", + "paddleocr>=2.7.0", + "pypdfium2>=4.25.0", "markdown>=3.4.0", "weasyprint>=60.0", - "pygments>=2.10.0" + "pygments>=2.10.0", ] [project.urls] @@ -73,3 +79,8 @@ version = {attr = "raganything.__version__"} [tool.ruff] target-version = "py310" + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test*.py"] +norecursedirs = ["examples"] diff --git a/raganything/batch_parser.py b/raganything/batch_parser.py index 7112a5831..2111ec501 100644 --- a/raganything/batch_parser.py +++ b/raganything/batch_parser.py @@ -15,7 +15,7 @@ from tqdm import tqdm -from .parser import MineruParser, DoclingParser +from .parser import SUPPORTED_PARSERS, get_parser @dataclass @@ -70,7 +70,7 @@ def __init__( Initialize batch parser Args: - parser_type: Type of parser to use ("mineru" or "docling") + parser_type: Type of parser to use ("mineru", "docling", or "paddleocr") max_workers: Maximum number of parallel workers show_progress: Whether to show progress bars timeout_per_file: Timeout in seconds for each file @@ -83,12 +83,10 @@ def __init__( self.logger = logging.getLogger(__name__) # Initialize parser - if parser_type == "mineru": - self.parser = MineruParser() - elif parser_type == "docling": - self.parser = DoclingParser() - else: - raise ValueError(f"Unsupported parser type: {parser_type}") + try: + self.parser = get_parser(parser_type) + except ValueError as exc: + raise ValueError(f"Unsupported parser type: {parser_type}") from exc # Check parser installation (optional) if not skip_installation_check: @@ -384,7 +382,7 @@ def main(): parser.add_argument("--output", "-o", required=True, help="Output directory") parser.add_argument( "--parser", - choices=["mineru", "docling"], + choices=list(SUPPORTED_PARSERS), default="mineru", help="Parser to use", ) diff --git a/raganything/config.py b/raganything/config.py index bfce0ac27..ec6f88c06 100644 --- a/raganything/config.py +++ b/raganything/config.py @@ -27,7 +27,7 @@ class RAGAnythingConfig: """Default output directory for parsed content.""" parser: str = field(default=get_env_value("PARSER", "mineru", str)) - """Parser selection: 'mineru' or 'docling'.""" + """Parser selection: 'mineru', 'docling', or 'paddleocr'.""" display_content_stats: bool = field( default=get_env_value("DISPLAY_CONTENT_STATS", True, bool) diff --git a/raganything/parser.py b/raganything/parser.py index a07443e24..7595e0bd3 100644 --- a/raganything/parser.py +++ b/raganything/parser.py @@ -26,6 +26,7 @@ Union, Tuple, Any, + Iterator, TypeVar, ) @@ -1740,12 +1741,354 @@ def check_installation(self) -> bool: return False +class PaddleOCRParser(Parser): + """PaddleOCR document parser with optional PDF page rendering support.""" + + def __init__(self, default_lang: str = "en") -> None: + super().__init__() + self.default_lang = default_lang + self._ocr_instances: Dict[str, Any] = {} + + def _require_paddleocr(self): + try: + from paddleocr import PaddleOCR + except ImportError as exc: + raise ImportError( + "PaddleOCR parser requires optional dependency `paddleocr`. " + "Install with `pip install -e '.[paddleocr]'` or " + "`uv sync --extra paddleocr`. " + "PaddleOCR also needs `paddlepaddle`; install it from " + "https://www.paddlepaddle.org.cn/install/quick." + ) from exc + return PaddleOCR + + def _get_ocr(self, lang: Optional[str] = None): + PaddleOCR = self._require_paddleocr() + language = (lang or self.default_lang).strip() or self.default_lang + cached = self._ocr_instances.get(language) + if cached is not None: + return cached + + init_candidates = [ + {"lang": language, "show_log": False}, + {"lang": language}, + {}, + ] + last_exception = None + for candidate_kwargs in init_candidates: + try: + ocr = PaddleOCR(**candidate_kwargs) + self._ocr_instances[language] = ocr + return ocr + except Exception as exc: # pragma: no cover - defensive fallback + last_exception = exc + continue + + raise RuntimeError( + f"Unable to initialize PaddleOCR for language '{language}': {last_exception}" + ) + + def _extract_text_lines(self, result: Any) -> List[str]: + lines: List[str] = [] + + def append_text(text: str) -> None: + clean_text = text.strip() + if clean_text: + lines.append(clean_text) + + if isinstance(result, str): + append_text(result) + return lines + + def visit(node: Any) -> None: + if node is None: + return + + if hasattr(node, "to_dict"): + try: + visit(node.to_dict()) + return + except Exception: + pass + + if isinstance(node, dict): + rec_texts = node.get("rec_texts") + if isinstance(rec_texts, list): + for item in rec_texts: + if isinstance(item, str): + append_text(item) + else: + visit(item) + + text_value = node.get("text") + if isinstance(text_value, str): + append_text(text_value) + + texts_value = node.get("texts") + if isinstance(texts_value, list): + for item in texts_value: + if isinstance(item, str): + append_text(item) + else: + visit(item) + + # Avoid double-visiting keys we already handled above; this prevents + # accidental duplication without content-level deduplication. + for key, value in node.items(): + if key in {"rec_texts", "text", "texts"}: + continue + visit(value) + return + + if isinstance(node, (list, tuple)): + if node and all(isinstance(item, str) for item in node): + for item in node: + append_text(item) + return + + if ( + len(node) >= 2 + and isinstance(node[1], (list, tuple)) + and len(node[1]) >= 1 + and isinstance(node[1][0], str) + ): + append_text(node[1][0]) + return + + if ( + len(node) >= 1 + and isinstance(node[0], str) + and (len(node) == 1 or isinstance(node[1], (int, float))) + ): + append_text(node[0]) + return + + for item in node: + visit(item) + return + + if isinstance(node, str): + append_text(node) + return + + visit(result) + return lines + + def _ocr_input( + self, input_data: Any, lang: Optional[str] = None, cls_enabled: bool = True + ) -> List[str]: + ocr = self._get_ocr(lang=lang) + + if hasattr(ocr, "ocr"): + try: + result = ocr.ocr(input_data, cls=cls_enabled) + except TypeError: + result = ocr.ocr(input_data) + return self._extract_text_lines(result) + + if hasattr(ocr, "predict"): + result = ocr.predict(input_data) + return self._extract_text_lines(result) + + raise RuntimeError( + "Unsupported PaddleOCR API: expected `ocr` or `predict` method." + ) + + def _extract_pdf_page_inputs(self, pdf_path: Path) -> Iterator[Tuple[int, Any]]: + try: + import pypdfium2 as pdfium + except ImportError as exc: + raise ImportError( + "PDF parsing with parser='paddleocr' requires `pypdfium2`. " + "Install with `pip install -e '.[paddleocr]'` or " + "`uv sync --extra paddleocr`." + ) from exc + + pdf = pdfium.PdfDocument(str(pdf_path)) + try: + total_pages = len(pdf) + for page_idx in range(total_pages): + page = pdf[page_idx] + try: + rendered = page.render(scale=2.0) + if hasattr(rendered, "to_pil"): + yield (page_idx, rendered.to_pil()) + elif hasattr(rendered, "to_numpy"): + yield (page_idx, rendered.to_numpy()) + else: + raise RuntimeError( + "Unsupported rendered page format from pypdfium2." + ) + finally: + if hasattr(page, "close"): + page.close() + finally: + if hasattr(pdf, "close"): + pdf.close() + + def _ocr_rendered_page( + self, rendered_page: Any, lang: Optional[str] = None, cls_enabled: bool = True + ) -> List[str]: + if hasattr(rendered_page, "save"): + temp_image_path: Optional[Path] = None + try: + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp: + temp_image_path = Path(temp.name) + rendered_page.save(temp_image_path) + return self._ocr_input( + str(temp_image_path), lang=lang, cls_enabled=cls_enabled + ) + finally: + if temp_image_path is not None and temp_image_path.exists(): + try: + temp_image_path.unlink() + except Exception: + pass + + return self._ocr_input(rendered_page, lang=lang, cls_enabled=cls_enabled) + + def parse_pdf( + self, + pdf_path: Union[str, Path], + output_dir: Optional[str] = None, + method: str = "auto", + lang: Optional[str] = None, + **kwargs, + ) -> List[Dict[str, Any]]: + del output_dir, method + pdf_path = Path(pdf_path) + if not pdf_path.exists(): + raise FileNotFoundError(f"PDF file does not exist: {pdf_path}") + + cls_enabled = kwargs.get("cls", True) + content_list: List[Dict[str, Any]] = [] + page_inputs = self._extract_pdf_page_inputs(pdf_path) + try: + for page_idx, rendered_page in page_inputs: + page_lines = self._ocr_rendered_page( + rendered_page, lang=lang, cls_enabled=cls_enabled + ) + for text in page_lines: + content_list.append( + {"type": "text", "text": text, "page_idx": int(page_idx)} + ) + finally: + # Ensure we promptly release PDF handles even if OCR fails mid-stream. + close = getattr(page_inputs, "close", None) + if callable(close): + close() + return content_list + + def parse_image( + self, + image_path: Union[str, Path], + output_dir: Optional[str] = None, + lang: Optional[str] = None, + **kwargs, + ) -> List[Dict[str, Any]]: + del output_dir + image_path = Path(image_path) + if not image_path.exists(): + raise FileNotFoundError(f"Image file does not exist: {image_path}") + + ext = image_path.suffix.lower() + if ext not in self.IMAGE_FORMATS: + raise ValueError( + f"Unsupported image format: {ext}. Supported formats: {', '.join(sorted(self.IMAGE_FORMATS))}" + ) + + cls_enabled = kwargs.get("cls", True) + page_idx = int(kwargs.get("page_idx", 0)) + text_lines = self._ocr_input( + str(image_path), lang=lang, cls_enabled=cls_enabled + ) + return [ + {"type": "text", "text": text, "page_idx": page_idx} for text in text_lines + ] + + def parse_office_doc( + self, + doc_path: Union[str, Path], + output_dir: Optional[str] = None, + lang: Optional[str] = None, + **kwargs, + ) -> List[Dict[str, Any]]: + pdf_path = self.convert_office_to_pdf(doc_path, output_dir) + return self.parse_pdf( + pdf_path=pdf_path, output_dir=output_dir, lang=lang, **kwargs + ) + + def parse_text_file( + self, + text_path: Union[str, Path], + output_dir: Optional[str] = None, + lang: Optional[str] = None, + **kwargs, + ) -> List[Dict[str, Any]]: + pdf_path = self.convert_text_to_pdf(text_path, output_dir) + return self.parse_pdf( + pdf_path=pdf_path, output_dir=output_dir, lang=lang, **kwargs + ) + + def parse_document( + self, + file_path: Union[str, Path], + method: str = "auto", + output_dir: Optional[str] = None, + lang: Optional[str] = None, + **kwargs, + ) -> List[Dict[str, Any]]: + del method + file_path = Path(file_path) + if not file_path.exists(): + raise FileNotFoundError(f"File does not exist: {file_path}") + + ext = file_path.suffix.lower() + if ext == ".pdf": + return self.parse_pdf(file_path, output_dir, lang=lang, **kwargs) + if ext in self.IMAGE_FORMATS: + return self.parse_image(file_path, output_dir, lang=lang, **kwargs) + if ext in self.OFFICE_FORMATS: + return self.parse_office_doc(file_path, output_dir, lang=lang, **kwargs) + if ext in self.TEXT_FORMATS: + return self.parse_text_file(file_path, output_dir, lang=lang, **kwargs) + + raise ValueError( + f"Unsupported file format: {ext}. " + "PaddleOCR parser supports PDF, image, office, and text formats." + ) + + def check_installation(self) -> bool: + try: + self._require_paddleocr() + return True + except ImportError: + return False + + +SUPPORTED_PARSERS = ("mineru", "docling", "paddleocr") + + +def get_parser(parser_type: str) -> Parser: + parser_name = (parser_type or "mineru").strip().lower() + if parser_name == "mineru": + return MineruParser() + if parser_name == "docling": + return DoclingParser() + if parser_name == "paddleocr": + return PaddleOCRParser() + raise ValueError( + f"Unsupported parser type: {parser_type}. " + f"Supported parsers: {', '.join(SUPPORTED_PARSERS)}" + ) + + def main(): """ Main function to run the document parser from command line """ parser = argparse.ArgumentParser( - description="Parse documents using MinerU 2.0 or Docling" + description="Parse documents using MinerU 2.0, Docling, or PaddleOCR" ) parser.add_argument("file_path", help="Path to the document to parse") parser.add_argument("--output", "-o", help="Output directory path") @@ -1805,7 +2148,7 @@ def main(): ) parser.add_argument( "--parser", - choices=["mineru", "docling"], + choices=list(SUPPORTED_PARSERS), default="mineru", help="Parser selection", ) @@ -1818,7 +2161,7 @@ def main(): # Check installation if requested if args.check: - doc_parser = DoclingParser() if args.parser == "docling" else MineruParser() + doc_parser = get_parser(args.parser) if doc_parser.check_installation(): print(f"✅ {args.parser.title()} is properly installed") return 0 @@ -1828,7 +2171,7 @@ def main(): try: # Parse the document - doc_parser = DoclingParser() if args.parser == "docling" else MineruParser() + doc_parser = get_parser(args.parser) content_list = doc_parser.parse_document( file_path=args.file_path, method=args.method, diff --git a/raganything/processor.py b/raganything/processor.py index 7c6026286..048af1c73 100644 --- a/raganything/processor.py +++ b/raganything/processor.py @@ -12,7 +12,7 @@ from pathlib import Path from raganything.base import DocStatus -from raganything.parser import MineruParser, DoclingParser, MineruExecutionError +from raganything.parser import MineruParser, MineruExecutionError, get_parser from raganything.utils import ( separate_content, insert_text_content, @@ -332,9 +332,10 @@ async def parse_document( ext = file_path.suffix.lower() try: - doc_parser = ( - DoclingParser() if self.config.parser == "docling" else MineruParser() - ) + doc_parser = getattr(self, "doc_parser", None) + if doc_parser is None: + doc_parser = get_parser(self.config.parser) + self.doc_parser = doc_parser # Log parser and method information self.logger.info( @@ -361,21 +362,23 @@ async def parse_document( ".webp", ]: self.logger.info("Detected image file, using parser for images...") - # Use the selected parser's image parsing capability - if hasattr(doc_parser, "parse_image"): + try: content_list = await asyncio.to_thread( doc_parser.parse_image, image_path=file_path, output_dir=output_dir, **kwargs, ) - else: + except NotImplementedError: # Fallback to MinerU for image parsing if current parser doesn't support it self.logger.warning( f"{self.config.parser} parser doesn't support image parsing, falling back to MinerU" ) - content_list = MineruParser().parse_image( - image_path=file_path, output_dir=output_dir, **kwargs + content_list = await asyncio.to_thread( + MineruParser().parse_image, + image_path=file_path, + output_dir=output_dir, + **kwargs, ) elif ext in [ ".doc", @@ -573,7 +576,7 @@ async def _process_multimodal_content_individual( try: content_type = item.get("type", "unknown") self.logger.info( - f"Processing item {i+1}/{len(multimodal_items)}: {content_type} content" + f"Processing item {i + 1}/{len(multimodal_items)}: {content_type} content" ) # Select appropriate processor diff --git a/raganything/raganything.py b/raganything/raganything.py index 321369d6b..3af8da6ce 100644 --- a/raganything/raganything.py +++ b/raganything/raganything.py @@ -33,7 +33,7 @@ from raganything.processor import ProcessorMixin from raganything.batch import BatchMixin from raganything.utils import get_processor_supports -from raganything.parser import MineruParser, DoclingParser +from raganything.parser import MineruParser, SUPPORTED_PARSERS, get_parser # Import specialized processors from raganything.modalprocessors import ( @@ -109,9 +109,7 @@ def __post_init__(self): self.logger = logger # Set up document parser - self.doc_parser = ( - DoclingParser() if self.config.parser == "docling" else MineruParser() - ) + self.doc_parser = get_parser(self.config.parser) # Register close method for cleanup atexit.register(self.close) @@ -554,6 +552,10 @@ def get_processor_info(self) -> Dict[str, Any]: """Get processor information""" base_info = { "mineru_installed": MineruParser.check_installation(MineruParser()), + "parser_installation": { + parser_name: get_parser(parser_name).check_installation() + for parser_name in SUPPORTED_PARSERS + }, "config": self.get_config_info(), "models": { "llm_model": "External function" diff --git a/requirements.txt b/requirements.txt index 9cd2d0e83..cbb59d951 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,7 @@ tqdm # Note: Optional dependencies are now defined in setup.py extras_require: # - [image]: Pillow>=10.0.0 (for BMP, TIFF, GIF, WebP format conversion) # - [text]: reportlab>=4.0.0 (for TXT, MD to PDF conversion) +# - [paddleocr]: paddleocr + pypdfium2 (for parser='paddleocr') # - [office]: requires LibreOffice (external program, not Python package) # - [all]: includes all optional dependencies # diff --git a/setup.py b/setup.py index 3c2277aa1..2675229c9 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,16 @@ def read_requirements(): "image": ["Pillow>=10.0.0"], # For image format conversion (BMP, TIFF, GIF, WebP) "text": ["reportlab>=4.0.0"], # For text file to PDF conversion (TXT, MD) "office": [], # Office document processing requires LibreOffice (external program) - "all": ["Pillow>=10.0.0", "reportlab>=4.0.0"], # All optional features + "paddleocr": ["paddleocr>=2.7.0", "pypdfium2>=4.25.0"], # PaddleOCR parser + "all": [ + "Pillow>=10.0.0", + "reportlab>=4.0.0", + "paddleocr>=2.7.0", + "pypdfium2>=4.25.0", + "markdown>=3.4.0", + "weasyprint>=60.0", + "pygments>=2.10.0", + ], # All optional features "markdown": [ "markdown>=3.4.0", "weasyprint>=60.0", diff --git a/tests/testpaddleocr_parser.py b/tests/testpaddleocr_parser.py new file mode 100644 index 000000000..cc9b04090 --- /dev/null +++ b/tests/testpaddleocr_parser.py @@ -0,0 +1,164 @@ +import importlib +import sys + +import pytest + +import raganything.parser as parser_module +from raganything.parser import PaddleOCRParser, SUPPORTED_PARSERS, get_parser + + +def test_supported_parsers_include_paddleocr(): + assert "paddleocr" in SUPPORTED_PARSERS + + +def test_get_parser_returns_paddleocr_parser(): + parser = get_parser("paddleocr") + assert isinstance(parser, PaddleOCRParser) + + +def test_get_parser_rejects_unknown_parser(): + with pytest.raises(ValueError, match="Unsupported parser type"): + get_parser("unknown-parser") + + +def test_parser_module_import_does_not_import_paddleocr(): + sys.modules.pop("paddleocr", None) + importlib.reload(parser_module) + assert "paddleocr" not in sys.modules + + +def test_check_installation_false_when_dependency_missing(monkeypatch): + parser = PaddleOCRParser() + + def missing_dependency(): + raise ImportError("missing paddleocr") + + monkeypatch.setattr(parser, "_require_paddleocr", missing_dependency) + assert parser.check_installation() is False + + +def test_check_installation_true_when_pdf_renderer_missing(monkeypatch): + parser = PaddleOCRParser() + + monkeypatch.setattr(parser, "_require_paddleocr", lambda: object()) + + import builtins + + real_import = builtins.__import__ + + def fake_import(name, globals=None, locals=None, fromlist=(), level=0): + if name == "pypdfium2": + raise ImportError("missing pypdfium2") + return real_import(name, globals, locals, fromlist, level) + + monkeypatch.setattr(builtins, "__import__", fake_import) + + assert parser.check_installation() is True + + +def test_parse_pdf_raises_import_error_when_pdf_renderer_missing(monkeypatch, tmp_path): + parser = PaddleOCRParser() + fake_pdf = tmp_path / "sample.pdf" + fake_pdf.write_bytes(b"%PDF-1.4\n") + + monkeypatch.setattr(parser, "_require_paddleocr", lambda: object()) + + import builtins + + real_import = builtins.__import__ + + def fake_import(name, globals=None, locals=None, fromlist=(), level=0): + if name == "pypdfium2": + raise ImportError("missing pypdfium2") + return real_import(name, globals, locals, fromlist, level) + + monkeypatch.setattr(builtins, "__import__", fake_import) + + with pytest.raises(ImportError, match="pypdfium2"): + parser.parse_pdf(fake_pdf) + + +def test_parse_image_raises_import_error_with_install_hint(monkeypatch, tmp_path): + parser = PaddleOCRParser() + fake_image = tmp_path / "sample.png" + fake_image.write_bytes(b"not-a-real-image") + + def missing_dependency(): + raise ImportError("missing paddleocr") + + monkeypatch.setattr(parser, "_require_paddleocr", missing_dependency) + + with pytest.raises(ImportError, match="paddleocr"): + parser.parse_image(fake_image) + + +def test_parse_image_returns_content_list_schema(monkeypatch, tmp_path): + parser = PaddleOCRParser() + fake_image = tmp_path / "sample.png" + fake_image.write_bytes(b"image-bytes") + + class FakeOCR: + def ocr(self, input_data, cls=True): + return [ + [ + [[[0, 0], [1, 0], [1, 1], [0, 1]], ("First line", 0.99)], + [[[0, 2], [1, 2], [1, 3], [0, 3]], ("Second line", 0.95)], + ] + ] + + monkeypatch.setattr(parser, "_get_ocr", lambda lang=None: FakeOCR()) + + content_list = parser.parse_image(fake_image, page_idx=7) + + assert content_list == [ + {"type": "text", "text": "First line", "page_idx": 7}, + {"type": "text", "text": "Second line", "page_idx": 7}, + ] + + +def test_parse_image_preserves_repeated_ocr_lines(monkeypatch, tmp_path): + parser = PaddleOCRParser() + fake_image = tmp_path / "sample.png" + fake_image.write_bytes(b"image-bytes") + + class FakeOCR: + def ocr(self, input_data, cls=True): + return [ + [ + [[[0, 0], [1, 0], [1, 1], [0, 1]], ("Same", 0.99)], + [[[0, 2], [1, 2], [1, 3], [0, 3]], ("Same", 0.95)], + ] + ] + + monkeypatch.setattr(parser, "_get_ocr", lambda lang=None: FakeOCR()) + + content_list = parser.parse_image(fake_image, page_idx=1) + + assert content_list == [ + {"type": "text", "text": "Same", "page_idx": 1}, + {"type": "text", "text": "Same", "page_idx": 1}, + ] + + +def test_parse_pdf_assigns_page_index(monkeypatch, tmp_path): + parser = PaddleOCRParser() + fake_pdf = tmp_path / "sample.pdf" + fake_pdf.write_bytes(b"%PDF-1.4\n") + + monkeypatch.setattr( + parser, + "_extract_pdf_page_inputs", + lambda pdf_path: [(0, "page0"), (1, "page1")], + ) + monkeypatch.setattr( + parser, + "_ocr_rendered_page", + lambda rendered_page, lang=None, cls_enabled=True: [f"{rendered_page}-text"], + ) + + content_list = parser.parse_pdf(fake_pdf) + + assert content_list == [ + {"type": "text", "text": "page0-text", "page_idx": 0}, + {"type": "text", "text": "page1-text", "page_idx": 1}, + ] diff --git a/tests/testparser_wiring.py b/tests/testparser_wiring.py new file mode 100644 index 000000000..13bb7ae9c --- /dev/null +++ b/tests/testparser_wiring.py @@ -0,0 +1,132 @@ +import pytest + +from raganything.batch_parser import BatchParser + + +def test_batch_parser_uses_paddleocr_parser(): + batch_parser = BatchParser( + parser_type="paddleocr", + show_progress=False, + skip_installation_check=True, + ) + assert batch_parser.parser.__class__.__name__ == "PaddleOCRParser" + + +def test_raganything_initializes_selected_parser(monkeypatch, tmp_path): + pytest.importorskip("lightrag") + + import raganything.raganything as rag_module + from raganything.config import RAGAnythingConfig + + class StubParser: + def check_installation(self): + return True + + captured = {} + + def fake_get_parser(parser_name): + captured["parser_name"] = parser_name + return StubParser() + + monkeypatch.setattr(rag_module, "get_parser", fake_get_parser) + monkeypatch.setattr(rag_module.atexit, "register", lambda *args, **kwargs: None) + + config = RAGAnythingConfig( + working_dir=str(tmp_path / "rag_workdir"), + parser="paddleocr", + ) + rag = rag_module.RAGAnything(config=config) + + assert captured["parser_name"] == "paddleocr" + assert isinstance(rag.doc_parser, StubParser) + + +@pytest.mark.asyncio +async def test_processor_parse_document_uses_selected_parser(monkeypatch, tmp_path): + import raganything.processor as processor_module + + class FakeLogger: + def info(self, *args, **kwargs): + pass + + def warning(self, *args, **kwargs): + pass + + def error(self, *args, **kwargs): + pass + + def debug(self, *args, **kwargs): + pass + + class FakeParser: + def parse_pdf(self, **kwargs): + return [{"type": "text", "text": "parsed by fake parser", "page_idx": 0}] + + def parse_image(self, **kwargs): + return [{"type": "text", "text": "image parsed", "page_idx": 0}] + + def parse_office_doc(self, **kwargs): + return [{"type": "text", "text": "office parsed", "page_idx": 0}] + + def parse_document(self, **kwargs): + return [{"type": "text", "text": "generic parsed", "page_idx": 0}] + + selected = {"calls": 0} + + def fake_get_parser(parser_name): + selected["parser_name"] = parser_name + selected["calls"] += 1 + return FakeParser() + + monkeypatch.setattr(processor_module, "get_parser", fake_get_parser) + + class DummyProcessor(processor_module.ProcessorMixin): + pass + + dummy = DummyProcessor() + dummy.config = type( + "Config", + (), + { + "parser": "paddleocr", + "parser_output_dir": str(tmp_path / "output"), + "parse_method": "auto", + "display_content_stats": False, + "use_full_path": False, + }, + )() + dummy.logger = FakeLogger() + dummy.parse_cache = None + + async def fake_store_cached_result(*args, **kwargs): + return None + + monkeypatch.setattr( + DummyProcessor, + "_store_cached_result", + fake_store_cached_result, + raising=False, + ) + monkeypatch.setattr( + DummyProcessor, + "_generate_content_based_doc_id", + lambda self, content_list: "doc-fixed", + raising=False, + ) + + fake_pdf = tmp_path / "sample.pdf" + fake_pdf.write_bytes(b"%PDF-1.4\n") + + content_list, doc_id = await dummy.parse_document(str(fake_pdf)) + content_list_2, doc_id_2 = await dummy.parse_document(str(fake_pdf)) + + assert selected["parser_name"] == "paddleocr" + assert selected["calls"] == 1 + assert doc_id == "doc-fixed" + assert doc_id_2 == "doc-fixed" + assert content_list == [ + {"type": "text", "text": "parsed by fake parser", "page_idx": 0} + ] + assert content_list_2 == [ + {"type": "text", "text": "parsed by fake parser", "page_idx": 0} + ]