Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ async def main():
# Create RAGAnything configuration
config = RAGAnythingConfig(
working_dir="./rag_storage",
parser="mineru", # Parser selection: mineru or docling
parser="mineru", # Parser selection: mineru, docling, or paddleocr
parse_method="auto", # Parse method: auto, ocr, or txt
enable_image_processing=True,
enable_table_processing=True,
Expand Down Expand Up @@ -1047,7 +1047,7 @@ Create a `.env` file (refer to `.env.example`):
OPENAI_API_KEY=your_openai_api_key
OPENAI_BASE_URL=your_base_url # Optional
OUTPUT_DIR=./output # Default output directory for parsed documents
PARSER=mineru # Parser selection: mineru or docling
PARSER=mineru # Parser selection: mineru, docling, or paddleocr
PARSE_METHOD=auto # Parse method: auto, ocr, or txt
```

Expand All @@ -1070,6 +1070,21 @@ RAGAnything now supports multiple parsers, each with specific advantages:
- Better document structure preservation
- Native support for multiple Office formats

#### PaddleOCR Parser
- OCR-focused parser for images and PDFs
- Produces text blocks compatible with existing `content_list` processing
- Optionally supports Office/TXT/MD files by converting them to PDF first

Install PaddleOCR parser extras:

```bash
pip install -e ".[paddleocr]"
# or
uv sync --extra paddleocr
```

> **Note**: PaddleOCR also requires `paddlepaddle`, which is not installed by the `paddleocr` extra because the correct CPU/GPU package varies by platform. Install it separately by following the official guide: https://www.paddlepaddle.org.cn/install/quick

### MinerU Configuration

```bash
Expand All @@ -1091,15 +1106,15 @@ await rag.process_document_complete(
file_path="document.pdf",
output_dir="./output/",
parse_method="auto", # or "ocr", "txt"
parser="mineru" # Optional: "mineru" or "docling"
parser="mineru" # Optional: "mineru", "docling", or "paddleocr"
)

# Advanced parsing configuration with special parameters
await rag.process_document_complete(
file_path="document.pdf",
output_dir="./output/",
parse_method="auto", # Parsing method: "auto", "ocr", "txt"
parser="mineru", # Parser selection: "mineru" or "docling"
parser="mineru", # Parser selection: "mineru", "docling", or "paddleocr"

# MinerU special parameters - all supported kwargs:
lang="ch", # Document language for OCR optimization (e.g., "ch", "en", "ja")
Expand All @@ -1119,7 +1134,7 @@ await rag.process_document_complete(
)
```

> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments. RAG-Anything now supports multiple document parsers - you can choose between MinerU and Docling based on your needs.
> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments. RAG-Anything supports multiple document parsers, including MinerU, Docling, and PaddleOCR.

### Processing Requirements

Expand All @@ -1128,6 +1143,7 @@ Different content types require specific optional dependencies:
- **Office Documents** (.doc, .docx, .ppt, .pptx, .xls, .xlsx): Install [LibreOffice](https://www.libreoffice.org/download/download/)
- **Extended Image Formats** (.bmp, .tiff, .gif, .webp): Install with `pip install raganything[image]`
- **Text Files** (.txt, .md): Install with `pip install raganything[text]`
- **PaddleOCR Parser** (`parser="paddleocr"`): Install with `pip install raganything[paddleocr]`, then install `paddlepaddle` for your platform

> **📋 Quick Install**: Use `pip install raganything[all]` to enable all format support (Python dependencies only - LibreOffice still needs separate installation)

Expand Down
8 changes: 6 additions & 2 deletions docs/batch_processing.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ pip install raganything[all]

# Required for batch processing
pip install tqdm

# Optional: only needed when using parser='paddleocr'
pip install raganything[paddleocr]
```

## Usage
Expand All @@ -35,7 +38,7 @@ from raganything.batch_parser import BatchParser

# Create batch parser
batch_parser = BatchParser(
parser_type="mineru", # or "docling"
parser_type="mineru", # one of "mineru", "docling", or "paddleocr"
max_workers=4,
show_progress=True,
timeout_per_file=300,
Expand Down Expand Up @@ -123,6 +126,7 @@ python -m raganything.batch_parser examples/sample_docs/ --output ./output --wor

# With specific parser
python -m raganything.batch_parser examples/sample_docs/ --parser mineru --method auto
python -m raganything.batch_parser examples/sample_docs/ --parser paddleocr --method ocr

# Without progress bar
python -m raganything.batch_parser examples/sample_docs/ --output ./output --no-progress
Expand All @@ -148,7 +152,7 @@ PARSER_OUTPUT_DIR=./parsed_output

### BatchParser Parameters

- **parser_type**: `"mineru"` or `"docling"` (default: `"mineru"`)
- **parser_type**: `"mineru"`, `"docling"`, or `"paddleocr"` (default: `"mineru"`)
- **max_workers**: Number of parallel workers (default: `4`)
- **show_progress**: Show progress bar (default: `True`)
- **timeout_per_file**: Timeout per file in seconds (default: `300`)
Expand Down
2 changes: 1 addition & 1 deletion env.example
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ OLLAMA_EMULATING_MODEL_TAG=latest
### Parser Configuration
# PARSE_METHOD=auto
# OUTPUT_DIR=./output
# PARSER=mineru
# PARSER=mineru # Options: mineru, docling, paddleocr
# DISPLAY_CONTENT_STATS=true

### Multimodal Processing Configuration
Expand Down
3 changes: 2 additions & 1 deletion examples/batch_dry_run_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
- pip install:
python examples/batch_dry_run_example.py examples/sample_docs --parser mineru
python examples/batch_dry_run_example.py examples/sample_docs/projects examples/sample_docs/web --parser docling
python examples/batch_dry_run_example.py examples/sample_docs --parser paddleocr
- uv install:
uv run python examples/batch_dry_run_example.py examples/sample_docs --parser mineru --recursive
uv run python examples/batch_dry_run_example.py examples/sample_docs --parser mineru --no-recursive
Expand All @@ -22,7 +23,7 @@ def main() -> int:
parser.add_argument("paths", nargs="+", help="File paths or directories to scan")
parser.add_argument(
"--parser",
choices=["mineru", "docling"],
choices=["mineru", "docling", "paddleocr"],
default="mineru",
help="Parser to use for file-type support",
)
Expand Down
9 changes: 5 additions & 4 deletions examples/raganything_example.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/usr/bin/env python
"""
Example script demonstrating the integration of MinerU parser with RAGAnything
Example script demonstrating parser integration with RAGAnything
This example shows how to:
1. Process documents with RAGAnything using MinerU parser
1. Process documents with RAGAnything using configurable parsers
2. Perform pure text queries using aquery() method
3. Perform multimodal queries with specific multimodal content using aquery_with_multimodal() method
4. Handle different types of multimodal content (tables, equations) in queries
Expand Down Expand Up @@ -108,7 +108,7 @@ async def process_with_rag(
# Create RAGAnything configuration
config = RAGAnythingConfig(
working_dir=working_dir or "./rag_storage",
parser=parser, # Parser selection: mineru or docling
parser=parser, # Parser selection: mineru, docling, or paddleocr
parse_method="auto", # Parse method: auto, ocr, or txt
enable_image_processing=True,
enable_table_processing=True,
Expand Down Expand Up @@ -289,7 +289,8 @@ def main():
parser.add_argument(
"--parser",
default=os.getenv("PARSER", "mineru"),
help="Optional base URL for API",
choices=["mineru", "docling", "paddleocr"],
help="Parser selection",
)

args = parser.parse_args()
Expand Down
8 changes: 7 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ dependencies = [
image = ["Pillow>=10.0.0"]
text = ["reportlab>=4.0.0"]
office = [] # Requires LibreOffice (external program)
paddleocr = [
"paddleocr>=2.7.0",
"pypdfium2>=4.25.0",
]
markdown = [
"markdown>=3.4.0",
"weasyprint>=60.0",
Expand All @@ -39,9 +43,11 @@ markdown = [
all = [
"Pillow>=10.0.0",
"reportlab>=4.0.0",
"paddleocr>=2.7.0",
"pypdfium2>=4.25.0",
"markdown>=3.4.0",
"weasyprint>=60.0",
"pygments>=2.10.0"
"pygments>=2.10.0",
]

[project.urls]
Expand Down
16 changes: 7 additions & 9 deletions raganything/batch_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from tqdm import tqdm

from .parser import MineruParser, DoclingParser
from .parser import SUPPORTED_PARSERS, get_parser


@dataclass
Expand Down Expand Up @@ -70,7 +70,7 @@ def __init__(
Initialize batch parser
Args:
parser_type: Type of parser to use ("mineru" or "docling")
parser_type: Type of parser to use ("mineru", "docling", or "paddleocr")
max_workers: Maximum number of parallel workers
show_progress: Whether to show progress bars
timeout_per_file: Timeout in seconds for each file
Expand All @@ -83,12 +83,10 @@ def __init__(
self.logger = logging.getLogger(__name__)

# Initialize parser
if parser_type == "mineru":
self.parser = MineruParser()
elif parser_type == "docling":
self.parser = DoclingParser()
else:
raise ValueError(f"Unsupported parser type: {parser_type}")
try:
self.parser = get_parser(parser_type)
except ValueError as exc:
raise ValueError(f"Unsupported parser type: {parser_type}") from exc

# Check parser installation (optional)
if not skip_installation_check:
Expand Down Expand Up @@ -384,7 +382,7 @@ def main():
parser.add_argument("--output", "-o", required=True, help="Output directory")
parser.add_argument(
"--parser",
choices=["mineru", "docling"],
choices=list(SUPPORTED_PARSERS),
default="mineru",
help="Parser to use",
)
Expand Down
2 changes: 1 addition & 1 deletion raganything/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class RAGAnythingConfig:
"""Default output directory for parsed content."""

parser: str = field(default=get_env_value("PARSER", "mineru", str))
"""Parser selection: 'mineru' or 'docling'."""
"""Parser selection: 'mineru', 'docling', or 'paddleocr'."""

display_content_stats: bool = field(
default=get_env_value("DISPLAY_CONTENT_STATS", True, bool)
Expand Down
Loading