From fc6fca793431d60ed1b52a666a6394411977bc6f Mon Sep 17 00:00:00 2001 From: RinZ27 <222222878+RinZ27@users.noreply.github.com> Date: Sat, 14 Feb 2026 15:32:29 +0700 Subject: [PATCH 1/2] Fix potential path traversal and local file read vulnerabilities I noticed that image paths extracted from document parsing results and retrieval context weren't being properly sanitized before file operations. This could allow a malicious document to trick the VLM into reading sensitive system files via indirect prompt injection or crafted parser output. Added directory boundary checks in the MinerU parser, path validation in the VLM query mixin, and symlink blocking in the image validation utility. --- raganything/parser.py | 12 +++++++++--- raganything/query.py | 28 +++++++++++++++++++++++++--- raganything/utils.py | 6 +++++- 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/raganything/parser.py b/raganything/parser.py index a07443e2..adad7d95 100644 --- a/raganything/parser.py +++ b/raganything/parser.py @@ -872,10 +872,16 @@ def _read_output_files( absolute_img_path = ( images_base_dir / img_path ).resolve() + + # Security check: ensure the image path is within the base directory + resolved_base = images_base_dir.resolve() + if not absolute_img_path.is_relative_to(resolved_base): + cls.logger.warning( + f"Potential path traversal detected in {field_name}: {img_path}. Skipping." + ) + continue + item[field_name] = str(absolute_img_path) - cls.logger.debug( - f"Updated {field_name}: {img_path} -> {item[field_name]}" - ) except Exception as e: cls.logger.warning(f"Could not read JSON file {json_file}: {e}") diff --git a/raganything/query.py b/raganything/query.py index 7c55d070..04627cb4 100644 --- a/raganything/query.py +++ b/raganything/query.py @@ -568,12 +568,34 @@ def replace_image_path(match): return match.group(0) # Keep original # Use utility function to validate image file - self.logger.debug(f"Calling validate_image_file for: {image_path}") is_valid = validate_image_file(image_path) - self.logger.debug(f"Validation result for {image_path}: {is_valid}") + + # Security check: only allow images from the workspace or output directories + # to prevent indirect prompt injection from reading arbitrary system files. + if is_valid: + abs_image_path = Path(image_path).resolve() + # Check if it's in the current working directory or subdirectories + try: + is_in_cwd = abs_image_path.is_relative_to(Path.cwd()) + except ValueError: + is_in_cwd = False + + # If a config is available, check against working_dir and parser_output_dir + is_in_safe_dir = is_in_cwd + if hasattr(self, "config") and self.config: + try: + is_in_working = abs_image_path.is_relative_to(Path(self.config.working_dir).resolve()) + is_in_output = abs_image_path.is_relative_to(Path(self.config.parser_output_dir).resolve()) + is_in_safe_dir = is_in_safe_dir or is_in_working or is_in_output + except Exception: + pass + + if not is_in_safe_dir: + self.logger.warning(f"Blocking image path outside safe directories: {image_path}") + is_valid = False if not is_valid: - self.logger.warning(f"Image validation failed for: {image_path}") + self.logger.warning(f"Image validation failed or path unsafe for: {image_path}") return match.group(0) # Keep original if validation fails try: diff --git a/raganything/utils.py b/raganything/utils.py index 3d65e3a2..2829d2cf 100644 --- a/raganything/utils.py +++ b/raganything/utils.py @@ -93,10 +93,14 @@ def validate_image_file(image_path: str, max_size_mb: int = 50) -> bool: logger.debug(f"Resolved path object: {path}") logger.debug(f"Path exists check: {path.exists()}") - # Check if file exists + # Check if file exists and is not a symlink (for security) if not path.exists(): logger.warning(f"Image file not found: {image_path}") return False + + if path.is_symlink(): + logger.warning(f"Blocking symlink for security: {image_path}") + return False # Check file extension image_extensions = [ From 3979abdc3a74b17cc12f584bbce3054d2185f5e1 Mon Sep 17 00:00:00 2001 From: RinZ27 <222222878+RinZ27@users.noreply.github.com> Date: Thu, 19 Feb 2026 10:36:36 +0700 Subject: [PATCH 2/2] Fix: harden path traversal guards and support extra safe directories --- raganything/parser.py | 1 + raganything/query.py | 25 ++++++++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/raganything/parser.py b/raganything/parser.py index adad7d95..e34792e0 100644 --- a/raganything/parser.py +++ b/raganything/parser.py @@ -879,6 +879,7 @@ def _read_output_files( cls.logger.warning( f"Potential path traversal detected in {field_name}: {img_path}. Skipping." ) + item[field_name] = "" # Clear unsafe path continue item[field_name] = str(absolute_img_path) diff --git a/raganything/query.py b/raganything/query.py index 04627cb4..2bc6f8ce 100644 --- a/raganything/query.py +++ b/raganything/query.py @@ -301,7 +301,12 @@ async def aquery_with_multimodal( return result async def aquery_vlm_enhanced( - self, query: str, mode: str = "mix", system_prompt: str | None = None, **kwargs + self, + query: str, + mode: str = "mix", + system_prompt: str | None = None, + extra_safe_dirs: List[str] = None, + **kwargs, ) -> str: """ VLM enhanced query - replaces image paths in retrieved context with base64 encoded images for VLM processing @@ -310,6 +315,7 @@ async def aquery_vlm_enhanced( query: User query mode: Underlying LightRAG query mode system_prompt: Optional system prompt to include + extra_safe_dirs: Optional list of additional safe directories to allow images from **kwargs: Other query parameters Returns: @@ -339,7 +345,7 @@ async def aquery_vlm_enhanced( # 2. Extract and process image paths enhanced_prompt, images_found = await self._process_image_paths_for_vlm( - raw_prompt + raw_prompt, extra_safe_dirs=extra_safe_dirs ) if not images_found: @@ -530,12 +536,15 @@ async def _describe_generic_for_query( return description - async def _process_image_paths_for_vlm(self, prompt: str) -> tuple[str, int]: + async def _process_image_paths_for_vlm( + self, prompt: str, extra_safe_dirs: List[str] = None + ) -> tuple[str, int]: """ Process image paths in prompt, keeping original paths and adding VLM markers Args: prompt: Original prompt + extra_safe_dirs: Optional list of additional safe directories Returns: tuple: (processed prompt, image count) @@ -589,6 +598,16 @@ def replace_image_path(match): is_in_safe_dir = is_in_safe_dir or is_in_working or is_in_output except Exception: pass + + # Check against extra safe directories if provided + if not is_in_safe_dir and extra_safe_dirs: + for safe_dir in extra_safe_dirs: + try: + if abs_image_path.is_relative_to(Path(safe_dir).resolve()): + is_in_safe_dir = True + break + except Exception: + continue if not is_in_safe_dir: self.logger.warning(f"Blocking image path outside safe directories: {image_path}")