-
Notifications
You must be signed in to change notification settings - Fork 1.6k
feat(parser): add remote URL support for DoclingParser #195
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,10 @@ | |
| import subprocess | ||
| import tempfile | ||
| import logging | ||
| import urllib.parse | ||
| import urllib.request | ||
| import shutil | ||
| import os | ||
| from pathlib import Path | ||
| from typing import ( | ||
| Dict, | ||
|
|
@@ -58,6 +62,55 @@ class Parser: | |
| # Class-level logger | ||
| logger = logging.getLogger(__name__) | ||
|
|
||
| @staticmethod | ||
| def _is_url(path: str) -> bool: | ||
| """Check if the path is a URL.""" | ||
| try: | ||
| result = urllib.parse.urlparse(str(path)) | ||
| return all([result.scheme, result.netloc]) | ||
| except ValueError: | ||
| return False | ||
|
|
||
| def _download_file(self, url: str) -> Path: | ||
| """ | ||
| Download a file from a URL to a temporary file. | ||
| Attempts to preserve the file extension from the URL. | ||
| """ | ||
| try: | ||
| self.logger.info(f"Downloading file from URL: {url}") | ||
|
|
||
| # Parse URL to get path and extension | ||
| parsed_url = urllib.parse.urlparse(url) | ||
| path = Path(parsed_url.path) | ||
| suffix = path.suffix if path.suffix else "" | ||
|
|
||
| # Create a temporary file with the correct extension | ||
| # delete=False is important so we can close it and let other processes open it | ||
| # We must manually delete it later | ||
| fd, tmp_path = tempfile.mkstemp(suffix=suffix) | ||
| os.close(fd) | ||
| tmp_path = Path(tmp_path) | ||
|
|
||
| # Download the file | ||
| # We use a user-agent to avoid 403 Forbidden from some sites | ||
| req = urllib.request.Request( | ||
| url, | ||
| data=None, | ||
| headers={ | ||
| 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36' | ||
| } | ||
| ) | ||
|
|
||
| with urllib.request.urlopen(req) as response, open(tmp_path, 'wb') as out_file: | ||
| shutil.copyfileobj(response, out_file) | ||
|
|
||
| self.logger.info(f"Downloaded to temporary file: {tmp_path} ({tmp_path.stat().st_size} bytes)") | ||
| return tmp_path | ||
|
|
||
| except Exception as e: | ||
| self.logger.error(f"Failed to download file from {url}: {e}") | ||
| raise RuntimeError(f"Failed to download file from {url}: {e}") | ||
|
Comment on lines 130 to 144
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The new URL path creates a temp file before the network read, but on a download failure the `except` branch re-raises without deleting it, so the temp file created by `tempfile.mkstemp` is leaked in the system temp directory. Useful? React with 👍 / 👎.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

The `_download_file` method now properly cleans up the temporary file in the exception handler:
|
||
|
|
||
| def __init__(self) -> None: | ||
| """Initialize the base parser.""" | ||
| pass | ||
|
|
@@ -1339,38 +1392,54 @@ def parse_document( | |
| ) -> List[Dict[str, Any]]: | ||
| """ | ||
| Parse document using Docling based on file extension | ||
|
|
||
| Args: | ||
| file_path: Path to the file to be parsed | ||
| file_path: Path to the file to be parsed or URL | ||
| method: Parsing method | ||
| output_dir: Output directory path | ||
| lang: Document language for optimization | ||
| **kwargs: Additional parameters for docling command | ||
|
|
||
| Returns: | ||
| List[Dict[str, Any]]: List of content blocks | ||
| """ | ||
| # Convert to Path object | ||
| file_path = Path(file_path) | ||
| if not file_path.exists(): | ||
| raise FileNotFoundError(f"File does not exist: {file_path}") | ||
|
|
||
| # Get file extension | ||
| ext = file_path.suffix.lower() | ||
|
|
||
| # Choose appropriate parser based on file type | ||
| if ext == ".pdf": | ||
| return self.parse_pdf(file_path, output_dir, method, lang, **kwargs) | ||
| elif ext in self.OFFICE_FORMATS: | ||
| return self.parse_office_doc(file_path, output_dir, lang, **kwargs) | ||
| elif ext in self.HTML_FORMATS: | ||
| return self.parse_html(file_path, output_dir, lang, **kwargs) | ||
| else: | ||
| raise ValueError( | ||
| f"Unsupported file format: {ext}. " | ||
| f"Docling only supports PDF files, Office formats ({', '.join(self.OFFICE_FORMATS)}) " | ||
| f"and HTML formats ({', '.join(self.HTML_FORMATS)})" | ||
| ) | ||
| downloaded_temp_file = None | ||
|
|
||
| try: | ||
| # Check if input is a URL | ||
| if self._is_url(file_path): | ||
| file_path = self._download_file(file_path) | ||
| downloaded_temp_file = file_path | ||
|
|
||
| # Convert to Path object | ||
| file_path = Path(file_path) | ||
| if not file_path.exists(): | ||
| raise FileNotFoundError(f"File does not exist: {file_path}") | ||
|
|
||
| # Get file extension | ||
| ext = file_path.suffix.lower() | ||
|
|
||
| # Choose appropriate parser based on file type | ||
| if ext == ".pdf": | ||
| return self.parse_pdf(file_path, output_dir, method, lang, **kwargs) | ||
| elif ext in self.OFFICE_FORMATS: | ||
| return self.parse_office_doc(file_path, output_dir, lang, **kwargs) | ||
| elif ext in self.HTML_FORMATS: | ||
| return self.parse_html(file_path, output_dir, lang, **kwargs) | ||
| else: | ||
| raise ValueError( | ||
| f"Unsupported file format: {ext}. " | ||
| f"Docling only supports PDF files, Office formats ({', '.join(self.OFFICE_FORMATS)}) " | ||
| f"and HTML formats ({', '.join(self.HTML_FORMATS)})" | ||
| ) | ||
| finally: | ||
| # Clean up temporary file if we downloaded one | ||
| if downloaded_temp_file and downloaded_temp_file.exists(): | ||
| try: | ||
| downloaded_temp_file.unlink() | ||
| self.logger.debug(f"Removed temporary file: {downloaded_temp_file}") | ||
| except Exception as e: | ||
| self.logger.warning(f"Failed to remove temporary file {downloaded_temp_file}: {e}") | ||
|
|
||
| def _run_docling_command( | ||
| self, | ||
|
|
@@ -1504,13 +1573,15 @@ def read_from_block_recursive( | |
| content_list = [] | ||
| if not block.get("children"): | ||
| cnt += 1 | ||
| content_list.append(self.read_from_block(block, type, output_dir, cnt, num)) | ||
| result = self.read_from_block(block, type, output_dir, cnt, num) | ||
| if result: | ||
| content_list.append(result) | ||
| else: | ||
| if type not in ["groups", "body"]: | ||
| cnt += 1 | ||
| content_list.append( | ||
| self.read_from_block(block, type, output_dir, cnt, num) | ||
| ) | ||
| result = self.read_from_block(block, type, output_dir, cnt, num) | ||
| if result: | ||
| content_list.append(result) | ||
| members = block["children"] | ||
| for member in members: | ||
| cnt += 1 | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The downloaded temp file suffix is derived only from `Path(parsed_url.path).suffix`, so URLs like `/download?id=123` that return a valid PDF get saved without an extension. `parse_document` then dispatches by `file_path.suffix` and raises `Unsupported file format`, so the new remote-URL feature fails for many common signed/download endpoints even though the payload is parseable. Useful? React with 👍 / 👎.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The method now supports extensionless URLs by inspecting the `Content-Type` header:

- If the URL path has no file extension, the response's `Content-Type` header is read.
- `mimetypes.guess_extension()` is used to infer the correct extension (e.g., `application/pdf` → `.pdf`).

This allows URLs like `/download?id=123` to work correctly as long as they return a valid `Content-Type` header.