Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 99 additions & 28 deletions raganything/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
import logging
import mimetypes
import os
import shutil
import subprocess
import tempfile
import urllib.parse
import urllib.request
from pathlib import Path
from typing import (
Dict,
Expand Down Expand Up @@ -58,6 +62,55 @@ class Parser:
# Class-level logger
logger = logging.getLogger(__name__)

@staticmethod
def _is_url(path: str) -> bool:
"""Check if the path is a URL."""
try:
result = urllib.parse.urlparse(str(path))
return all([result.scheme, result.netloc])
except ValueError:
return False

def _download_file(self, url: str) -> Path:
    """
    Download a file from *url* into a local temporary file and return its path.

    The temporary file's suffix is taken from the URL path when present;
    otherwise it is inferred from the response's Content-Type header via
    mimetypes.guess_extension(), so extensionless endpoints such as
    ``/download?id=123`` still dispatch correctly in ``parse_document``.
    The caller is responsible for deleting the returned file.

    Args:
        url: HTTP(S) URL of the document to download.

    Returns:
        Path: Location of the downloaded temporary file.

    Raises:
        RuntimeError: If the download fails for any reason; any partially
            written temporary file is removed before re-raising so failed
            downloads do not leak files in /tmp.
    """
    tmp_path = None
    try:
        self.logger.info(f"Downloading file from URL: {url}")

        # Some sites return 403 Forbidden to urllib's default agent, so
        # present a browser-like User-Agent.
        req = urllib.request.Request(
            url,
            data=None,
            headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
            },
        )

        # Open the connection before creating the temp file so we can use
        # the response headers to pick an extension if the URL lacks one.
        with urllib.request.urlopen(req) as response:
            # Prefer the extension embedded in the URL path.
            suffix = Path(urllib.parse.urlparse(url).path).suffix

            # Fall back to the Content-Type header for extensionless URLs
            # (e.g. application/pdf -> ".pdf").
            if not suffix:
                content_type = response.headers.get_content_type()
                suffix = mimetypes.guess_extension(content_type) or ""

            # mkstemp (rather than a delete-on-close NamedTemporaryFile) so
            # downstream parsers can reopen the file by path; the caller
            # must remove it manually.
            fd, raw_path = tempfile.mkstemp(suffix=suffix)
            os.close(fd)
            tmp_path = Path(raw_path)

            with open(tmp_path, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)

        self.logger.info(
            f"Downloaded to temporary file: {tmp_path} ({tmp_path.stat().st_size} bytes)"
        )
        return tmp_path

    except Exception as e:
        # Clean up the partially written temp file so a failed download
        # does not leak files in /tmp on long-running workers.
        if tmp_path is not None and tmp_path.exists():
            try:
                tmp_path.unlink()
            except OSError:
                pass
        self.logger.error(f"Failed to download file from {url}: {e}")
        raise RuntimeError(f"Failed to download file from {url}: {e}") from e
Comment on lines 130 to 144

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Remove temp file when URL download fails

The new URL path creates a temp file before the network read, but on urlopen/copy errors this except path re-raises without deleting that file. Because parse_document only tracks downloaded_temp_file after _download_file returns, failed downloads leak files in /tmp (reproducible with an unreachable URL), which can accumulate and eventually impact long-running ingestion workers.

Useful? React with 👍 / 👎.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The _download_file method now properly cleans up the temporary file in the exception handler:

  • Initialize tmp_path = None before the try block
  • In the except block, check if tmp_path exists and delete it before re-raising
  • Added a finally block to close the HTTP response
    This ensures that even if urlopen or copyfileobj fail, the temporary file is removed and won't leak in /tmp.


def __init__(self) -> None:
"""Initialize the base parser."""
pass
Expand Down Expand Up @@ -1339,38 +1392,54 @@ def parse_document(
) -> List[Dict[str, Any]]:
"""
Parse document using Docling based on file extension

Args:
file_path: Path to the file to be parsed
file_path: Path to the file to be parsed or URL
method: Parsing method
output_dir: Output directory path
lang: Document language for optimization
**kwargs: Additional parameters for docling command

Returns:
List[Dict[str, Any]]: List of content blocks
"""
# Convert to Path object
file_path = Path(file_path)
if not file_path.exists():
raise FileNotFoundError(f"File does not exist: {file_path}")

# Get file extension
ext = file_path.suffix.lower()

# Choose appropriate parser based on file type
if ext == ".pdf":
return self.parse_pdf(file_path, output_dir, method, lang, **kwargs)
elif ext in self.OFFICE_FORMATS:
return self.parse_office_doc(file_path, output_dir, lang, **kwargs)
elif ext in self.HTML_FORMATS:
return self.parse_html(file_path, output_dir, lang, **kwargs)
else:
raise ValueError(
f"Unsupported file format: {ext}. "
f"Docling only supports PDF files, Office formats ({', '.join(self.OFFICE_FORMATS)}) "
f"and HTML formats ({', '.join(self.HTML_FORMATS)})"
)
downloaded_temp_file = None

try:
# Check if input is a URL
if self._is_url(file_path):
file_path = self._download_file(file_path)
downloaded_temp_file = file_path

# Convert to Path object
file_path = Path(file_path)
if not file_path.exists():
raise FileNotFoundError(f"File does not exist: {file_path}")

# Get file extension
ext = file_path.suffix.lower()

# Choose appropriate parser based on file type
if ext == ".pdf":
return self.parse_pdf(file_path, output_dir, method, lang, **kwargs)
elif ext in self.OFFICE_FORMATS:
return self.parse_office_doc(file_path, output_dir, lang, **kwargs)
elif ext in self.HTML_FORMATS:
return self.parse_html(file_path, output_dir, lang, **kwargs)
else:
raise ValueError(
f"Unsupported file format: {ext}. "
f"Docling only supports PDF files, Office formats ({', '.join(self.OFFICE_FORMATS)}) "
f"and HTML formats ({', '.join(self.HTML_FORMATS)})"
)
finally:
# Clean up temporary file if we downloaded one
if downloaded_temp_file and downloaded_temp_file.exists():
try:
downloaded_temp_file.unlink()
self.logger.debug(f"Removed temporary file: {downloaded_temp_file}")
except Exception as e:
self.logger.warning(f"Failed to remove temporary file {downloaded_temp_file}: {e}")

def _run_docling_command(
self,
Expand Down Expand Up @@ -1504,13 +1573,15 @@ def read_from_block_recursive(
content_list = []
if not block.get("children"):
cnt += 1
content_list.append(self.read_from_block(block, type, output_dir, cnt, num))
result = self.read_from_block(block, type, output_dir, cnt, num)
if result:
content_list.append(result)
else:
if type not in ["groups", "body"]:
cnt += 1
content_list.append(
self.read_from_block(block, type, output_dir, cnt, num)
)
result = self.read_from_block(block, type, output_dir, cnt, num)
if result:
content_list.append(result)
members = block["children"]
for member in members:
cnt += 1
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ tqdm
# - [all]: includes all optional dependencies
#
# Install with: pip install raganything[image,text] or pip install raganything[all]
docling==2.72.0
Loading