Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 99 additions & 28 deletions raganything/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
import logging
import mimetypes
import os
import shutil
import subprocess
import tempfile
import urllib.parse
import urllib.request
from pathlib import Path
from typing import (
Dict,
Expand Down Expand Up @@ -58,6 +62,55 @@ class Parser:
# Class-level logger
logger = logging.getLogger(__name__)

@staticmethod
def _is_url(path: str) -> bool:
"""Check if the path is a URL."""
try:
result = urllib.parse.urlparse(str(path))
return all([result.scheme, result.netloc])
except ValueError:
return False

def _download_file(self, url: str) -> Path:
    """
    Download a file from *url* into a local temporary file and return its path.

    The temporary file's suffix is taken from the URL path when present;
    otherwise it is inferred from the response's Content-Type header via
    mimetypes.guess_extension(), so extensionless endpoints such as
    ``/download?id=123`` still dispatch correctly in ``parse_document``.
    The caller is responsible for deleting the returned file.

    Args:
        url: HTTP(S) URL of the document to download.

    Returns:
        Path: Location of the downloaded temporary file.

    Raises:
        RuntimeError: If the download fails for any reason; any partially
            written temporary file is removed before re-raising so failed
            downloads do not leak files in /tmp.
    """
    tmp_path = None
    try:
        self.logger.info(f"Downloading file from URL: {url}")

        # Some sites return 403 Forbidden to urllib's default agent, so
        # present a browser-like User-Agent.
        req = urllib.request.Request(
            url,
            data=None,
            headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
            },
        )

        # Open the connection before creating the temp file so we can use
        # the response headers to pick an extension if the URL lacks one.
        with urllib.request.urlopen(req) as response:
            # Prefer the extension embedded in the URL path.
            suffix = Path(urllib.parse.urlparse(url).path).suffix

            # Fall back to the Content-Type header for extensionless URLs
            # (e.g. application/pdf -> ".pdf").
            if not suffix:
                content_type = response.headers.get_content_type()
                suffix = mimetypes.guess_extension(content_type) or ""

            # mkstemp (rather than a delete-on-close NamedTemporaryFile) so
            # downstream parsers can reopen the file by path; the caller
            # must remove it manually.
            fd, raw_path = tempfile.mkstemp(suffix=suffix)
            os.close(fd)
            tmp_path = Path(raw_path)

            with open(tmp_path, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)

        self.logger.info(
            f"Downloaded to temporary file: {tmp_path} ({tmp_path.stat().st_size} bytes)"
        )
        return tmp_path

    except Exception as e:
        # Clean up the partially written temp file so a failed download
        # does not leak files in /tmp on long-running workers.
        if tmp_path is not None and tmp_path.exists():
            try:
                tmp_path.unlink()
            except OSError:
                pass
        self.logger.error(f"Failed to download file from {url}: {e}")
        raise RuntimeError(f"Failed to download file from {url}: {e}") from e
Comment on lines 130 to 144

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Remove temp file when URL download fails

The new URL path creates a temp file before the network read, but on urlopen/copy errors this except path re-raises without deleting that file. Because parse_document only tracks downloaded_temp_file after _download_file returns, failed downloads leak files in /tmp (reproducible with an unreachable URL), which can accumulate and eventually impact long-running ingestion workers.

Useful? React with 👍 / 👎.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The _download_file method now properly cleans up the temporary file in the exception handler:

  • Initialize tmp_path = None before the try block
  • In the except block, check if tmp_path exists and delete it before re-raising
  • Added a finally block to close the HTTP response
    This ensures that even if urlopen or copyfileobj fail, the temporary file is removed and won't leak in /tmp.


def __init__(self) -> None:
"""Initialize the base parser."""
pass
Expand Down Expand Up @@ -1339,38 +1392,54 @@ def parse_document(
) -> List[Dict[str, Any]]:
"""
Parse document using Docling based on file extension

Args:
file_path: Path to the file to be parsed
file_path: Path to the file to be parsed or URL
method: Parsing method
output_dir: Output directory path
lang: Document language for optimization
**kwargs: Additional parameters for docling command

Returns:
List[Dict[str, Any]]: List of content blocks
"""
# Convert to Path object
file_path = Path(file_path)
if not file_path.exists():
raise FileNotFoundError(f"File does not exist: {file_path}")

# Get file extension
ext = file_path.suffix.lower()

# Choose appropriate parser based on file type
if ext == ".pdf":
return self.parse_pdf(file_path, output_dir, method, lang, **kwargs)
elif ext in self.OFFICE_FORMATS:
return self.parse_office_doc(file_path, output_dir, lang, **kwargs)
elif ext in self.HTML_FORMATS:
return self.parse_html(file_path, output_dir, lang, **kwargs)
else:
raise ValueError(
f"Unsupported file format: {ext}. "
f"Docling only supports PDF files, Office formats ({', '.join(self.OFFICE_FORMATS)}) "
f"and HTML formats ({', '.join(self.HTML_FORMATS)})"
)
downloaded_temp_file = None

try:
# Check if input is a URL
if self._is_url(file_path):
file_path = self._download_file(file_path)
downloaded_temp_file = file_path

# Convert to Path object
file_path = Path(file_path)
if not file_path.exists():
raise FileNotFoundError(f"File does not exist: {file_path}")

# Get file extension
ext = file_path.suffix.lower()

# Choose appropriate parser based on file type
if ext == ".pdf":
return self.parse_pdf(file_path, output_dir, method, lang, **kwargs)
elif ext in self.OFFICE_FORMATS:
return self.parse_office_doc(file_path, output_dir, lang, **kwargs)
elif ext in self.HTML_FORMATS:
return self.parse_html(file_path, output_dir, lang, **kwargs)
else:
raise ValueError(
f"Unsupported file format: {ext}. "
f"Docling only supports PDF files, Office formats ({', '.join(self.OFFICE_FORMATS)}) "
f"and HTML formats ({', '.join(self.HTML_FORMATS)})"
)
finally:
# Clean up temporary file if we downloaded one
if downloaded_temp_file and downloaded_temp_file.exists():
try:
downloaded_temp_file.unlink()
self.logger.debug(f"Removed temporary file: {downloaded_temp_file}")
except Exception as e:
self.logger.warning(f"Failed to remove temporary file {downloaded_temp_file}: {e}")

def _run_docling_command(
self,
Expand Down Expand Up @@ -1504,13 +1573,15 @@ def read_from_block_recursive(
content_list = []
if not block.get("children"):
cnt += 1
content_list.append(self.read_from_block(block, type, output_dir, cnt, num))
result = self.read_from_block(block, type, output_dir, cnt, num)
if result:
content_list.append(result)
else:
if type not in ["groups", "body"]:
cnt += 1
content_list.append(
self.read_from_block(block, type, output_dir, cnt, num)
)
result = self.read_from_block(block, type, output_dir, cnt, num)
if result:
content_list.append(result)
members = block["children"]
for member in members:
cnt += 1
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ tqdm
# - [all]: includes all optional dependencies
#
# Install with: pip install raganything[image,text] or pip install raganything[all]
docling==2.72.0
Loading