-
Notifications
You must be signed in to change notification settings - Fork 1.6k
fix: prevent same-name file collision in parser output directories (#51) #203
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ | |
| from __future__ import annotations | ||
|
|
||
|
|
||
| import hashlib | ||
| import json | ||
| import argparse | ||
| import base64 | ||
|
|
@@ -62,6 +63,26 @@ def __init__(self) -> None: | |
| """Initialize the base parser.""" | ||
| pass | ||
|
|
||
| @staticmethod | ||
| def _unique_output_dir(base_dir: Union[str, Path], file_path: Union[str, Path]) -> Path: | ||
| """Create a unique output subdirectory for a file to prevent same-name collisions. | ||
|
|
||
| When multiple files share the same name (e.g. dir1/paper.pdf and dir2/paper.pdf), | ||
| their parser output would collide in the same output directory. This creates a | ||
| unique subdirectory by appending a short hash of the file's absolute path. (Fixes #51) | ||
|
|
||
| Args: | ||
| base_dir: The base output directory | ||
| file_path: Path to the input file | ||
|
|
||
| Returns: | ||
| Path like base_dir/paper_a1b2c3d4/ unique per absolute file path. | ||
| """ | ||
| file_path = Path(file_path).resolve() | ||
| stem = file_path.stem | ||
| path_hash = hashlib.md5(str(file_path).encode()).hexdigest()[:8] | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Using Useful? React with 👍 / 👎. |
||
| return Path(base_dir) / f"{stem}_{path_hash}" | ||
|
|
||
| @classmethod | ||
| def convert_office_to_pdf( | ||
| cls, doc_path: Union[str, Path], output_dir: Optional[str] = None | ||
|
|
@@ -911,9 +932,10 @@ def parse_pdf( | |
|
|
||
| name_without_suff = pdf_path.stem | ||
|
|
||
| # Prepare output directory | ||
| # Prepare output directory — use unique subdirectory to prevent | ||
| # same-name file collisions when output_dir is shared (#51) | ||
| if output_dir: | ||
| base_output_dir = Path(output_dir) | ||
| base_output_dir = self._unique_output_dir(output_dir, pdf_path) | ||
| else: | ||
| base_output_dir = pdf_path.parent / "mineru_output" | ||
|
|
||
|
|
@@ -1064,9 +1086,10 @@ def parse_image( | |
|
|
||
| name_without_suff = image_path.stem | ||
|
|
||
| # Prepare output directory | ||
| # Prepare output directory — use unique subdirectory to prevent | ||
| # same-name file collisions when output_dir is shared (#51) | ||
| if output_dir: | ||
| base_output_dir = Path(output_dir) | ||
| base_output_dir = self._unique_output_dir(output_dir, image_path) | ||
| else: | ||
| base_output_dir = image_path.parent / "mineru_output" | ||
|
|
||
|
|
@@ -1303,9 +1326,10 @@ def parse_pdf( | |
|
|
||
| name_without_suff = pdf_path.stem | ||
|
|
||
| # Prepare output directory | ||
| # Prepare output directory — use unique subdirectory to prevent | ||
| # same-name file collisions when output_dir is shared (#51) | ||
| if output_dir: | ||
| base_output_dir = Path(output_dir) | ||
| base_output_dir = self._unique_output_dir(output_dir, pdf_path) | ||
| else: | ||
| base_output_dir = pdf_path.parent / "docling_output" | ||
|
|
||
|
|
@@ -1622,9 +1646,10 @@ def parse_office_doc( | |
|
|
||
| name_without_suff = doc_path.stem | ||
|
|
||
| # Prepare output directory | ||
| # Prepare output directory — use unique subdirectory to prevent | ||
| # same-name file collisions when output_dir is shared (#51) | ||
| if output_dir: | ||
| base_output_dir = Path(output_dir) | ||
| base_output_dir = self._unique_output_dir(output_dir, doc_path) | ||
| else: | ||
| base_output_dir = doc_path.parent / "docling_output" | ||
|
|
||
|
|
@@ -1680,9 +1705,10 @@ def parse_html( | |
|
|
||
| name_without_suff = html_path.stem | ||
|
|
||
| # Prepare output directory | ||
| # Prepare output directory — use unique subdirectory to prevent | ||
| # same-name file collisions when output_dir is shared (#51) | ||
| if output_dir: | ||
| base_output_dir = Path(output_dir) | ||
| base_output_dir = self._unique_output_dir(output_dir, html_path) | ||
| else: | ||
| base_output_dir = html_path.parent / "docling_output" | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`_unique_output_dir()` truncates the MD5 digest to 8 hex chars, which gives only 32 bits of space; with larger batch runs, different absolute paths will eventually collide and be mapped to the same `base_output_dir`, causing the same output overwrite/data-loss behavior this patch is trying to eliminate (all updated parse paths now rely on this helper). In practice, birthday collisions become plausible around tens of thousands of files, so this can silently corrupt experiment outputs in high-volume ingestion jobs. Useful? React with 👍 / 👎.