From 37f17674cce5b0fcdcd2144d2dcee0f2d33e704b Mon Sep 17 00:00:00 2001 From: 13ernkastel Date: Tue, 31 Mar 2026 21:04:38 +0800 Subject: [PATCH 1/3] Harden HTTP resource ingestion against private-network SSRF --- openviking/parse/parsers/html.py | 71 ++++++++++-- openviking/server/local_input_guard.py | 2 + openviking/server/routers/resources.py | 1 + openviking/service/resource_service.py | 8 ++ openviking/utils/media_processor.py | 6 +- openviking/utils/network_guard.py | 102 ++++++++++++++++++ openviking/utils/resource_processor.py | 3 + tests/server/test_api_local_input_security.py | 85 +++++++++++++++ 8 files changed, 266 insertions(+), 12 deletions(-) create mode 100644 openviking/utils/network_guard.py diff --git a/openviking/parse/parsers/html.py b/openviking/parse/parsers/html.py index f1fc737e9..97d3ece62 100644 --- a/openviking/parse/parsers/html.py +++ b/openviking/parse/parsers/html.py @@ -29,6 +29,8 @@ ) from openviking.parse.parsers.base_parser import BaseParser from openviking.parse.parsers.constants import CODE_EXTENSIONS +from openviking.utils.network_guard import build_httpx_request_validation_hooks +from openviking_cli.exceptions import PermissionDeniedError from openviking_cli.utils.config import get_openviking_config @@ -77,7 +79,12 @@ class URLTypeDetector: "application/xhtml+xml": URLType.WEBPAGE, } - async def detect(self, url: str, timeout: float = 10.0) -> Tuple[URLType, Dict[str, Any]]: + async def detect( + self, + url: str, + timeout: float = 10.0, + request_validator=None, + ) -> Tuple[URLType, Dict[str, Any]]: """ Detect URL content type. @@ -107,7 +114,16 @@ async def detect(self, url: str, timeout: float = 10.0) -> Tuple[URLType, Dict[s # 2. Send HEAD request to check Content-Type try: httpx = lazy_import("httpx") - async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client: + client_kwargs = { + "timeout": timeout, + "follow_redirects": True, + } + event_hooks = build_httpx_request_validation_hooks(request_validator) + if event_hooks: + client_kwargs["event_hooks"] = event_hooks + client_kwargs["trust_env"] = False + + async with httpx.AsyncClient(**client_kwargs) as client: response = await client.head(url) content_type = response.headers.get("content-type", "").lower() @@ -128,6 +144,8 @@ async def detect(self, url: str, timeout: float = 10.0) -> Tuple[URLType, Dict[s if "html" in content_type or "xml" in content_type: return URLType.WEBPAGE, meta + except PermissionDeniedError: + raise except Exception as e: meta["detection_error"] = str(e) @@ -271,7 +289,12 @@ async def _parse_url(self, url: str, start_time: float, **kwargs) -> ParseResult ParseResult """ # Detect URL type - url_type, meta = await self._url_detector.detect(url, timeout=self.timeout) + request_validator = kwargs.get("request_validator") + url_type, meta = await self._url_detector.detect( + url, + timeout=self.timeout, + request_validator=request_validator, + ) if url_type == URLType.WEBPAGE: # Fetch and parse as webpage @@ -317,7 +340,10 @@ async def _parse_webpage( """ try: # Fetch HTML - html_content = await self._fetch_html(url) + html_content = await self._fetch_html( + url, + request_validator=kwargs.get("request_validator"), + ) # Convert to Markdown markdown_content = self._html_to_markdown(html_content, base_url=url) @@ -339,6 +365,8 @@ async def _parse_webpage( return result + except PermissionDeniedError: + raise except Exception as e: return create_parse_result( root=ResourceNode(type=NodeType.ROOT, content_path=None), @@ -385,7 +413,10 @@ async def _handle_download_link( temp_path = None try: # Download to temporary file - temp_path = await self._download_file(url) + temp_path = await self._download_file( + url, + request_validator=kwargs.get("request_validator"), + ) # Extract original filename from URL for use as source_path, # so parsers use it instead of the temp file name. @@ -422,6 +453,8 @@ async def _handle_download_link( result.meta["url_type"] = f"download_{file_type}" return result + except PermissionDeniedError: + raise except Exception as e: return create_parse_result( root=ResourceNode(type=NodeType.ROOT, content_path=None), @@ -457,6 +490,8 @@ async def _handle_code_repository( return result + except PermissionDeniedError: + raise except Exception as e: return create_parse_result( root=ResourceNode(type=NodeType.ROOT, content_path=None), @@ -499,7 +534,7 @@ async def _parse_local_file(self, path: Path, start_time: float, **kwargs) -> Pa warnings=[f"Failed to read HTML: {e}"], ) - async def _fetch_html(self, url: str) -> str: + async def _fetch_html(self, url: str, request_validator=None) -> str: """ Fetch HTML content from URL. @@ -514,7 +549,16 @@ async def _fetch_html(self, url: str) -> str: """ httpx = lazy_import("httpx") - async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=True) as client: + client_kwargs = { + "timeout": self.timeout, + "follow_redirects": True, + } + event_hooks = build_httpx_request_validation_hooks(request_validator) + if event_hooks: + client_kwargs["event_hooks"] = event_hooks + client_kwargs["trust_env"] = False + + async with httpx.AsyncClient(**client_kwargs) as client: headers = {"User-Agent": self.user_agent} response = await client.get(url, headers=headers) response.raise_for_status() @@ -591,7 +635,7 @@ async def _save_downloaded_text( result.temp_dir_path = temp_uri return result - async def _download_file(self, url: str) -> str: + async def _download_file(self, url: str, request_validator=None) -> str: """ Download file from URL to temporary location. @@ -619,7 +663,16 @@ async def _download_file(self, url: str) -> str: temp_file.close() # Download - async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=True) as client: + client_kwargs = { + "timeout": self.timeout, + "follow_redirects": True, + } + event_hooks = build_httpx_request_validation_hooks(request_validator) + if event_hooks: + client_kwargs["event_hooks"] = event_hooks + client_kwargs["trust_env"] = False + + async with httpx.AsyncClient(**client_kwargs) as client: headers = {"User-Agent": self.user_agent} response = await client.get(url, headers=headers) response.raise_for_status() diff --git a/openviking/server/local_input_guard.py b/openviking/server/local_input_guard.py index ee0e2d091..d7a08a360 100644 --- a/openviking/server/local_input_guard.py +++ b/openviking/server/local_input_guard.py @@ -7,6 +7,7 @@ import re from pathlib import Path +from openviking.utils.network_guard import ensure_public_remote_target from openviking_cli.exceptions import PermissionDeniedError _WINDOWS_DRIVE_RE = re.compile(r"^[A-Za-z]:[\\/]") @@ -37,6 +38,7 @@ def require_remote_resource_source(source: str) -> str: "HTTP server only accepts remote resource URLs or temp-uploaded files; " "direct host filesystem paths are not allowed." ) + ensure_public_remote_target(source) return source diff --git a/openviking/server/routers/resources.py b/openviking/server/routers/resources.py index f9c3275f1..20c1ad821 100644 --- a/openviking/server/routers/resources.py +++ b/openviking/server/routers/resources.py @@ -212,6 +212,7 @@ async def add_resource( wait=request.wait, timeout=request.timeout, allow_local_path_resolution=allow_local_path_resolution, + enforce_public_remote_targets=True, **kwargs, ), ) diff --git a/openviking/service/resource_service.py b/openviking/service/resource_service.py index 056a09fd0..e55fc0deb 100644 --- a/openviking/service/resource_service.py +++ b/openviking/service/resource_service.py @@ -11,6 +11,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional from openviking.server.identity import RequestContext +from openviking.server.local_input_guard import is_remote_resource_source, require_remote_resource_source from openviking.storage import VikingDBManager from openviking.storage.queuefs import get_queue_manager from openviking.storage.viking_fs import VikingFS @@ -29,6 +30,7 @@ InvalidArgumentError, NotInitializedError, ) +from openviking.utils.network_guard import ensure_public_remote_target from openviking_cli.utils import get_logger from openviking_cli.utils.uri import VikingURI @@ -110,6 +112,7 @@ async def add_resource( watch_interval: float = 0, skip_watch_management: bool = False, allow_local_path_resolution: bool = True, + enforce_public_remote_targets: bool = False, **kwargs, ) -> Dict[str, Any]: """Add resource to OpenViking (only supports resources scope). @@ -137,6 +140,8 @@ async def add_resource( creating a new one. skip_watch_management: If True, skip watch task management (used by scheduler to avoid recursive watch task creation during scheduled execution) + enforce_public_remote_targets: When True, reject non-public remote hosts and + validate each outbound HTTP request URL during fetch. **kwargs: Extra options forwarded to the parser chain Returns: @@ -178,6 +183,9 @@ async def add_resource( raise InvalidArgumentError( "watch_interval > 0 requires 'to' to be specified (target URI to watch)" ) + if enforce_public_remote_targets and is_remote_resource_source(path): + path = require_remote_resource_source(path) + kwargs.setdefault("request_validator", ensure_public_remote_target) result = await self._resource_processor.process_resource( path=path, diff --git a/openviking/utils/media_processor.py b/openviking/utils/media_processor.py index 7fa9e7cef..1fb13b8ce 100644 --- a/openviking/utils/media_processor.py +++ b/openviking/utils/media_processor.py @@ -105,19 +105,19 @@ async def _process_url(self, url: str, instruction: str, **kwargs) -> ParseResul "FeishuParser not available. " "Install lark-oapi: pip install 'openviking[bot-feishu]'" ) - return await parser.parse(url, instruction=instruction) + return await parser.parse(url, instruction=instruction, **kwargs) # Route git protocols and repo URLs to CodeRepositoryParser if url.startswith(("git@", "git://", "ssh://")) or is_git_repo_url(url): from openviking.parse.parsers.code.code import CodeRepositoryParser parser = CodeRepositoryParser() - return await parser.parse(url, instruction=instruction) + return await parser.parse(url, instruction=instruction, **kwargs) from openviking.parse.parsers.html import HTMLParser parser = HTMLParser() - return await parser.parse(url, instruction=instruction) + return await parser.parse(url, instruction=instruction, **kwargs) @staticmethod def _is_feishu_url(url: str) -> bool: diff --git a/openviking/utils/network_guard.py b/openviking/utils/network_guard.py new file mode 100644 index 000000000..a9dad2c68 --- /dev/null +++ b/openviking/utils/network_guard.py @@ -0,0 +1,102 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +"""Network target validation helpers for server-side remote fetches.""" + +from __future__ import annotations + +import ipaddress +import socket +from collections.abc import Callable +from typing import Optional +from urllib.parse import urlparse + +from openviking_cli.exceptions import PermissionDeniedError + +RequestValidator = Callable[[str], None] + +_LOCAL_HOSTNAMES = { + "localhost", + "localhost.localdomain", +} + + +def extract_remote_host(source: str) -> Optional[str]: + """Extract the destination host from a remote resource source.""" + if source.startswith("git@"): + rest = source[4:] + if ":" not in rest: + return None + return rest.split(":", 1)[0].strip().strip("[]") + + parsed = urlparse(source) + if parsed.hostname is None: + return None + return parsed.hostname.strip().strip("[]") + + +def _normalize_host(host: str) -> str: + return host.rstrip(".").lower() + + +def _resolve_host_addresses(host: str) -> set[str]: + try: + infos = socket.getaddrinfo(host, None, type=socket.SOCK_STREAM) + except (socket.gaierror, UnicodeError, OSError): + return set() + + addresses: set[str] = set() + for family, _, _, _, sockaddr in infos: + if family not in {socket.AF_INET, socket.AF_INET6}: + continue + addr = sockaddr[0] + if "%" in addr: + addr = addr.split("%", 1)[0] + addresses.add(addr) + return addresses + + +def _is_public_ip(address: str) -> bool: + try: + return ipaddress.ip_address(address).is_global + except ValueError: + return False + + +def ensure_public_remote_target(source: str) -> None: + """Reject loopback, link-local, private, and other non-public targets.""" + host = extract_remote_host(source) + if not host: + raise PermissionDeniedError( + "HTTP server only accepts remote resource URLs with a valid destination host." + ) + + normalized_host = _normalize_host(host) + if normalized_host in _LOCAL_HOSTNAMES or normalized_host.endswith(".localhost"): + raise PermissionDeniedError( + "HTTP server only accepts public remote resource targets; " + "loopback, link-local, private, and otherwise non-public destinations are not allowed." + ) + + resolved_addresses = _resolve_host_addresses(host) + if not resolved_addresses: + return + + non_public = sorted(addr for addr in resolved_addresses if not _is_public_ip(addr)) + if non_public: + raise PermissionDeniedError( + "HTTP server only accepts public remote resource targets; " + f"host '{host}' resolves to non-public address '{non_public[0]}'." + ) + + +def build_httpx_request_validation_hooks( + request_validator: Optional[RequestValidator], +) -> Optional[dict[str, list[Callable]]]: + """Build httpx request hooks that validate every outbound request URL.""" + if request_validator is None: + return None + + async def _validate_request(request) -> None: + request_validator(str(request.url)) + + return {"request": [_validate_request]} diff --git a/openviking/utils/resource_processor.py b/openviking/utils/resource_processor.py index 320c6e51d..c5f188c7f 100644 --- a/openviking/utils/resource_processor.py +++ b/openviking/utils/resource_processor.py @@ -18,6 +18,7 @@ from openviking.telemetry import get_current_telemetry from openviking.utils.embedding_utils import index_resource from openviking.utils.summarizer import Summarizer +from openviking_cli.exceptions import OpenVikingError from openviking_cli.utils import get_logger from openviking_cli.utils.storage import StoragePath @@ -161,6 +162,8 @@ async def process_resource( ) telemetry.set("resource.parse.warnings_count", len(parse_result.warnings or [])) + except OpenVikingError: + raise except Exception as e: result["status"] = "error" result["errors"].append(f"Parse error: {e}") diff --git a/tests/server/test_api_local_input_security.py b/tests/server/test_api_local_input_security.py index 8d7ec7c10..351f2be39 100644 --- a/tests/server/test_api_local_input_security.py +++ b/tests/server/test_api_local_input_security.py @@ -4,9 +4,16 @@ """Security tests for HTTP server local input handling.""" import io +import threading import zipfile +from http.server import BaseHTTPRequestHandler, HTTPServer import httpx +import pytest + +from openviking.parse.parsers.html import HTMLParser, URLTypeDetector +from openviking.utils.network_guard import ensure_public_remote_target +from openviking_cli.exceptions import PermissionDeniedError async def test_add_skill_accepts_temp_uploaded_file( @@ -80,6 +87,40 @@ def _build_ovpack_bytes() -> bytes: return buffer.getvalue() +@pytest.fixture +def loopback_http_url(): + body = b"loopback secret" + + class Handler(BaseHTTPRequestHandler): + def _write_headers(self) -> None: + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + + def do_HEAD(self): + self._write_headers() + + def do_GET(self): + self._write_headers() + self.wfile.write(body) + + def log_message(self, format, *args): + return + + server = HTTPServer(("127.0.0.1", 0), Handler) + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + + try: + host, port = server.server_address + yield f"http://{host}:{port}/" + finally: + server.shutdown() + server.server_close() + thread.join(timeout=3) + + async def test_import_ovpack_accepts_temp_uploaded_file( client: httpx.AsyncClient, upload_temp_dir, @@ -152,3 +193,47 @@ async def test_add_resource_rejects_legacy_temp_path_field(client: httpx.AsyncCl json={"temp_path": "upload_resource.md", "reason": "legacy field"}, ) assert resp.status_code == 422 + + +async def test_add_resource_rejects_loopback_remote_url(client: httpx.AsyncClient): + resp = await client.post( + "/api/v1/resources", + json={"path": "http://127.0.0.1:8765/", "reason": "ssrf probe"}, + ) + assert resp.status_code == 403 + body = resp.json() + assert body["status"] == "error" + assert body["error"]["code"] == "PERMISSION_DENIED" + assert "public remote resource targets" in body["error"]["message"] + + +async def test_add_resource_rejects_private_git_ssh_url(client: httpx.AsyncClient): + resp = await client.post( + "/api/v1/resources", + json={"path": "git@127.0.0.1:org/repo.git", "reason": "internal git"}, + ) + assert resp.status_code == 403 + body = resp.json() + assert body["status"] == "error" + assert body["error"]["code"] == "PERMISSION_DENIED" + + +async def test_url_detector_request_validator_blocks_loopback_head(loopback_http_url: str): + detector = URLTypeDetector() + + with pytest.raises(PermissionDeniedError): + await detector.detect( + loopback_http_url, + timeout=2.0, + request_validator=ensure_public_remote_target, + ) + + +async def test_html_parser_request_validator_blocks_loopback_fetch(loopback_http_url: str): + parser = HTMLParser(timeout=2.0) + + with pytest.raises(PermissionDeniedError): + await parser._fetch_html( + loopback_http_url, + request_validator=ensure_public_remote_target, + ) From 94e30e2413229c5144eabef1834b4859202d0c9e Mon Sep 17 00:00:00 2001 From: 13ernkastel Date: Tue, 31 Mar 2026 21:28:22 +0800 Subject: [PATCH 2/3] chore(ci): retrigger checks From e4d179f7f110c9d149f4adceb765d618099da7f0 Mon Sep 17 00:00:00 2001 From: 13ernkastel Date: Wed, 1 Apr 2026 08:26:21 +0800 Subject: [PATCH 3/3] style: fix resource service import order --- openviking/service/resource_service.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/openviking/service/resource_service.py b/openviking/service/resource_service.py index e55fc0deb..5c7796079 100644 --- a/openviking/service/resource_service.py +++ b/openviking/service/resource_service.py @@ -11,7 +11,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional from openviking.server.identity import RequestContext -from openviking.server.local_input_guard import is_remote_resource_source, require_remote_resource_source +from openviking.server.local_input_guard import ( + is_remote_resource_source, + require_remote_resource_source, +) from openviking.storage import VikingDBManager from openviking.storage.queuefs import get_queue_manager from openviking.storage.viking_fs import VikingFS @@ -22,6 +25,7 @@ register_wait_telemetry, unregister_wait_telemetry, ) +from openviking.utils.network_guard import ensure_public_remote_target from openviking.utils.resource_processor import ResourceProcessor from openviking.utils.skill_processor import SkillProcessor from openviking_cli.exceptions import ( @@ -30,7 +34,6 @@ InvalidArgumentError, NotInitializedError, ) -from openviking.utils.network_guard import ensure_public_remote_target from openviking_cli.utils import get_logger from openviking_cli.utils.uri import VikingURI