diff --git a/raganything/parser.py b/raganything/parser.py index a07443e2..e34792e0 100644 --- a/raganything/parser.py +++ b/raganything/parser.py @@ -872,10 +872,17 @@ def _read_output_files( absolute_img_path = ( images_base_dir / img_path ).resolve() + + # Security check: ensure the image path is within the base directory + resolved_base = images_base_dir.resolve() + if not absolute_img_path.is_relative_to(resolved_base): + cls.logger.warning( + f"Potential path traversal detected in {field_name}: {img_path}. Skipping." + ) + item[field_name] = "" # Clear unsafe path + continue + item[field_name] = str(absolute_img_path) - cls.logger.debug( - f"Updated {field_name}: {img_path} -> {item[field_name]}" - ) except Exception as e: cls.logger.warning(f"Could not read JSON file {json_file}: {e}") diff --git a/raganything/query.py b/raganything/query.py index 7c55d070..2bc6f8ce 100644 --- a/raganything/query.py +++ b/raganything/query.py @@ -301,7 +301,12 @@ async def aquery_with_multimodal( return result async def aquery_vlm_enhanced( - self, query: str, mode: str = "mix", system_prompt: str | None = None, **kwargs + self, + query: str, + mode: str = "mix", + system_prompt: str | None = None, + extra_safe_dirs: List[str] = None, + **kwargs, ) -> str: """ VLM enhanced query - replaces image paths in retrieved context with base64 encoded images for VLM processing @@ -310,6 +315,7 @@ async def aquery_vlm_enhanced( query: User query mode: Underlying LightRAG query mode system_prompt: Optional system prompt to include + extra_safe_dirs: Optional list of additional safe directories to allow images from **kwargs: Other query parameters Returns: @@ -339,7 +345,7 @@ async def aquery_vlm_enhanced( # 2. Extract and process image paths enhanced_prompt, images_found = await self._process_image_paths_for_vlm( - raw_prompt + raw_prompt, extra_safe_dirs=extra_safe_dirs ) if not images_found: @@ -530,12 +536,15 @@ async def _describe_generic_for_query( return description - async def _process_image_paths_for_vlm(self, prompt: str) -> tuple[str, int]: + async def _process_image_paths_for_vlm( + self, prompt: str, extra_safe_dirs: List[str] = None + ) -> tuple[str, int]: """ Process image paths in prompt, keeping original paths and adding VLM markers Args: prompt: Original prompt + extra_safe_dirs: Optional list of additional safe directories Returns: tuple: (processed prompt, image count) @@ -568,12 +577,44 @@ def replace_image_path(match): return match.group(0) # Keep original # Use utility function to validate image file - self.logger.debug(f"Calling validate_image_file for: {image_path}") is_valid = validate_image_file(image_path) - self.logger.debug(f"Validation result for {image_path}: {is_valid}") + + # Security check: only allow images from the workspace or output directories + # to prevent indirect prompt injection from reading arbitrary system files. + if is_valid: + abs_image_path = Path(image_path).resolve() + # Check if it's in the current working directory or subdirectories + try: + is_in_cwd = abs_image_path.is_relative_to(Path.cwd()) + except ValueError: + is_in_cwd = False + + # If a config is available, check against working_dir and parser_output_dir + is_in_safe_dir = is_in_cwd + if hasattr(self, "config") and self.config: + try: + is_in_working = abs_image_path.is_relative_to(Path(self.config.working_dir).resolve()) + is_in_output = abs_image_path.is_relative_to(Path(self.config.parser_output_dir).resolve()) + is_in_safe_dir = is_in_safe_dir or is_in_working or is_in_output + except Exception: + pass + + # Check against extra safe directories if provided + if not is_in_safe_dir and extra_safe_dirs: + for safe_dir in extra_safe_dirs: + try: + if abs_image_path.is_relative_to(Path(safe_dir).resolve()): + is_in_safe_dir = True + break + except Exception: + continue + + if not is_in_safe_dir: + self.logger.warning(f"Blocking image path outside safe directories: {image_path}") + is_valid = False if not is_valid: - self.logger.warning(f"Image validation failed for: {image_path}") + self.logger.warning(f"Image validation failed or path unsafe for: {image_path}") return match.group(0) # Keep original if validation fails try: diff --git a/raganything/utils.py b/raganything/utils.py index 3d65e3a2..2829d2cf 100644 --- a/raganything/utils.py +++ b/raganything/utils.py @@ -93,10 +93,14 @@ def validate_image_file(image_path: str, max_size_mb: int = 50) -> bool: logger.debug(f"Resolved path object: {path}") logger.debug(f"Path exists check: {path.exists()}") - # Check if file exists + # Check if file exists and is not a symlink (for security) if not path.exists(): logger.warning(f"Image file not found: {image_path}") return False + + if path.is_symlink(): + logger.warning(f"Blocking symlink for security: {image_path}") + return False # Check file extension image_extensions = [