From 267b6955ec83927fd2abc29c3445facd835ab70d Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 18 Feb 2026 10:08:47 +1100 Subject: [PATCH 1/2] fix: prevent same-name file collision in parser output directories (#51) When multiple files share the same name (e.g. dir1/paper.pdf and dir2/paper.pdf), their parser output was written to the same directory, causing data loss. Add _unique_output_dir(), which returns a per-file subdirectory path under the given output_dir, named from the file stem plus the first 8 hex digits of the MD5 of the file's resolved absolute path (e.g. paper_a1b2c3d4/). Same-name files from different directories therefore get separate output directories whenever an explicit output_dir is passed; the default mineru_output/ and docling_output/ directories next to the input file are unchanged. --- raganything/parser.py | 46 +++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/raganything/parser.py b/raganything/parser.py index a07443e2..bcaae2a9 100644 --- a/raganything/parser.py +++ b/raganything/parser.py @@ -12,6 +12,7 @@ from __future__ import annotations +import hashlib import json import argparse import base64 @@ -62,6 +63,26 @@ def __init__(self) -> None: """Initialize the base parser.""" pass + @staticmethod + def _unique_output_dir(base_dir: Union[str, Path], file_path: Union[str, Path]) -> Path: + """Create a unique output subdirectory for a file to prevent same-name collisions. + + When multiple files share the same name (e.g. dir1/paper.pdf and dir2/paper.pdf), + their parser output would collide in the same output directory. This creates a + unique subdirectory by appending a short hash of the file's absolute path. (Fixes #51) + + Args: + base_dir: The base output directory + file_path: Path to the input file + + Returns: + Path like base_dir/paper_a1b2c3d4/ unique per absolute file path. 
+ """ + file_path = Path(file_path).resolve() + stem = file_path.stem + path_hash = hashlib.md5(str(file_path).encode()).hexdigest()[:8] + return Path(base_dir) / f"{stem}_{path_hash}" + @classmethod def convert_office_to_pdf( cls, doc_path: Union[str, Path], output_dir: Optional[str] = None @@ -911,9 +932,10 @@ def parse_pdf( name_without_suff = pdf_path.stem - # Prepare output directory + # Prepare output directory — use unique subdirectory to prevent + # same-name file collisions when output_dir is shared (#51) if output_dir: - base_output_dir = Path(output_dir) + base_output_dir = self._unique_output_dir(output_dir, pdf_path) else: base_output_dir = pdf_path.parent / "mineru_output" @@ -1064,9 +1086,10 @@ def parse_image( name_without_suff = image_path.stem - # Prepare output directory + # Prepare output directory — use unique subdirectory to prevent + # same-name file collisions when output_dir is shared (#51) if output_dir: - base_output_dir = Path(output_dir) + base_output_dir = self._unique_output_dir(output_dir, image_path) else: base_output_dir = image_path.parent / "mineru_output" @@ -1303,9 +1326,10 @@ def parse_pdf( name_without_suff = pdf_path.stem - # Prepare output directory + # Prepare output directory — use unique subdirectory to prevent + # same-name file collisions when output_dir is shared (#51) if output_dir: - base_output_dir = Path(output_dir) + base_output_dir = self._unique_output_dir(output_dir, pdf_path) else: base_output_dir = pdf_path.parent / "docling_output" @@ -1622,9 +1646,10 @@ def parse_office_doc( name_without_suff = doc_path.stem - # Prepare output directory + # Prepare output directory — use unique subdirectory to prevent + # same-name file collisions when output_dir is shared (#51) if output_dir: - base_output_dir = Path(output_dir) + base_output_dir = self._unique_output_dir(output_dir, doc_path) else: base_output_dir = doc_path.parent / "docling_output" @@ -1680,9 +1705,10 @@ def parse_html( name_without_suff = 
html_path.stem - # Prepare output directory + # Prepare output directory — use unique subdirectory to prevent + # same-name file collisions when output_dir is shared (#51) if output_dir: - base_output_dir = Path(output_dir) + base_output_dir = self._unique_output_dir(output_dir, html_path) else: base_output_dir = html_path.parent / "docling_output" From 82d8689a0ba86a7d6227a282b61f046fadb51a7f Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 19 Feb 2026 09:44:39 +1100 Subject: [PATCH 2/2] chore: apply ruff formatting --- raganything/parser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/raganything/parser.py b/raganything/parser.py index bcaae2a9..1764c325 100644 --- a/raganything/parser.py +++ b/raganything/parser.py @@ -64,7 +64,9 @@ def __init__(self) -> None: pass @staticmethod - def _unique_output_dir(base_dir: Union[str, Path], file_path: Union[str, Path]) -> Path: + def _unique_output_dir( + base_dir: Union[str, Path], file_path: Union[str, Path] + ) -> Path: """Create a unique output subdirectory for a file to prevent same-name collisions. When multiple files share the same name (e.g. dir1/paper.pdf and dir2/paper.pdf),