Skip to content
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 36 additions & 10 deletions raganything/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from __future__ import annotations


import hashlib
import json
import argparse
import base64
Expand Down Expand Up @@ -62,6 +63,26 @@ def __init__(self) -> None:
"""Initialize the base parser."""
pass

@staticmethod
def _unique_output_dir(
    base_dir: Union[str, Path], file_path: Union[str, Path]
) -> Path:
    """Build a unique output subdirectory path for a file.

    When multiple files share the same name (e.g. dir1/paper.pdf and
    dir2/paper.pdf), their parser output would collide in the same output
    directory. This returns a subdirectory path made of the file's stem plus
    a short hash of the file's resolved absolute path. (Fixes #51)

    The path is only computed here; nothing is created on disk.

    Args:
        base_dir: The base output directory.
        file_path: Path to the input file. It is resolved to an absolute
            path before hashing, so the same file always maps to the same
            subdirectory.

    Returns:
        Path like base_dir/paper_0123456789abcdef, unique per absolute
        file path.
    """
    resolved = Path(file_path).resolve()

    # "surrogatepass" keeps the encode step from raising on file names that
    # carry undecodable bytes (surfaced by the OS layer as lone surrogates);
    # the bytes are only fed to the hash, so they never need to round-trip.
    path_bytes = str(resolved).encode("utf-8", "surrogatepass")

    # SHA-256 instead of MD5: MD5 can raise on FIPS-enabled Python/OpenSSL
    # builds. Keep 16 hex chars (64 bits) rather than 8, because a 32-bit
    # suffix makes collisions between different paths plausible in large
    # batch runs, which would reintroduce the overwrite this helper prevents.
    path_hash = hashlib.sha256(path_bytes).hexdigest()[:16]

    return Path(base_dir) / f"{resolved.stem}_{path_hash}"

@classmethod
def convert_office_to_pdf(
cls, doc_path: Union[str, Path], output_dir: Optional[str] = None
Expand Down Expand Up @@ -911,9 +932,10 @@ def parse_pdf(

name_without_suff = pdf_path.stem

# Prepare output directory
# Prepare output directory — use unique subdirectory to prevent
# same-name file collisions when output_dir is shared (#51)
if output_dir:
base_output_dir = Path(output_dir)
base_output_dir = self._unique_output_dir(output_dir, pdf_path)
else:
base_output_dir = pdf_path.parent / "mineru_output"

Expand Down Expand Up @@ -1064,9 +1086,10 @@ def parse_image(

name_without_suff = image_path.stem

# Prepare output directory
# Prepare output directory — use unique subdirectory to prevent
# same-name file collisions when output_dir is shared (#51)
if output_dir:
base_output_dir = Path(output_dir)
base_output_dir = self._unique_output_dir(output_dir, image_path)
else:
base_output_dir = image_path.parent / "mineru_output"

Expand Down Expand Up @@ -1303,9 +1326,10 @@ def parse_pdf(

name_without_suff = pdf_path.stem

# Prepare output directory
# Prepare output directory — use unique subdirectory to prevent
# same-name file collisions when output_dir is shared (#51)
if output_dir:
base_output_dir = Path(output_dir)
base_output_dir = self._unique_output_dir(output_dir, pdf_path)
else:
base_output_dir = pdf_path.parent / "docling_output"

Expand Down Expand Up @@ -1622,9 +1646,10 @@ def parse_office_doc(

name_without_suff = doc_path.stem

# Prepare output directory
# Prepare output directory — use unique subdirectory to prevent
# same-name file collisions when output_dir is shared (#51)
if output_dir:
base_output_dir = Path(output_dir)
base_output_dir = self._unique_output_dir(output_dir, doc_path)
else:
base_output_dir = doc_path.parent / "docling_output"

Expand Down Expand Up @@ -1680,9 +1705,10 @@ def parse_html(

name_without_suff = html_path.stem

# Prepare output directory
# Prepare output directory — use unique subdirectory to prevent
# same-name file collisions when output_dir is shared (#51)
if output_dir:
base_output_dir = Path(output_dir)
base_output_dir = self._unique_output_dir(output_dir, html_path)
else:
base_output_dir = html_path.parent / "docling_output"

Expand Down
Loading