Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 62 additions & 9 deletions openviking/parse/parsers/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
)
from openviking.parse.parsers.base_parser import BaseParser
from openviking.parse.parsers.constants import CODE_EXTENSIONS
from openviking.utils.network_guard import build_httpx_request_validation_hooks
from openviking_cli.exceptions import PermissionDeniedError
from openviking_cli.utils.config import get_openviking_config


Expand Down Expand Up @@ -77,7 +79,12 @@ class URLTypeDetector:
"application/xhtml+xml": URLType.WEBPAGE,
}

async def detect(self, url: str, timeout: float = 10.0) -> Tuple[URLType, Dict[str, Any]]:
async def detect(
self,
url: str,
timeout: float = 10.0,
request_validator=None,
) -> Tuple[URLType, Dict[str, Any]]:
"""
Detect URL content type.

Expand Down Expand Up @@ -107,7 +114,16 @@ async def detect(self, url: str, timeout: float = 10.0) -> Tuple[URLType, Dict[s
# 2. Send HEAD request to check Content-Type
try:
httpx = lazy_import("httpx")
async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
client_kwargs = {
"timeout": timeout,
"follow_redirects": True,
}
event_hooks = build_httpx_request_validation_hooks(request_validator)
if event_hooks:
client_kwargs["event_hooks"] = event_hooks
client_kwargs["trust_env"] = False

async with httpx.AsyncClient(**client_kwargs) as client:
response = await client.head(url)
content_type = response.headers.get("content-type", "").lower()

Expand All @@ -128,6 +144,8 @@ async def detect(self, url: str, timeout: float = 10.0) -> Tuple[URLType, Dict[s
if "html" in content_type or "xml" in content_type:
return URLType.WEBPAGE, meta

except PermissionDeniedError:
raise
except Exception as e:
meta["detection_error"] = str(e)

Expand Down Expand Up @@ -271,7 +289,12 @@ async def _parse_url(self, url: str, start_time: float, **kwargs) -> ParseResult
ParseResult
"""
# Detect URL type
url_type, meta = await self._url_detector.detect(url, timeout=self.timeout)
request_validator = kwargs.get("request_validator")
url_type, meta = await self._url_detector.detect(
url,
timeout=self.timeout,
request_validator=request_validator,
)

if url_type == URLType.WEBPAGE:
# Fetch and parse as webpage
Expand Down Expand Up @@ -317,7 +340,10 @@ async def _parse_webpage(
"""
try:
# Fetch HTML
html_content = await self._fetch_html(url)
html_content = await self._fetch_html(
url,
request_validator=kwargs.get("request_validator"),
)

# Convert to Markdown
markdown_content = self._html_to_markdown(html_content, base_url=url)
Expand All @@ -339,6 +365,8 @@ async def _parse_webpage(

return result

except PermissionDeniedError:
raise
except Exception as e:
return create_parse_result(
root=ResourceNode(type=NodeType.ROOT, content_path=None),
Expand Down Expand Up @@ -385,7 +413,10 @@ async def _handle_download_link(
temp_path = None
try:
# Download to temporary file
temp_path = await self._download_file(url)
temp_path = await self._download_file(
url,
request_validator=kwargs.get("request_validator"),
)

# Extract original filename from URL for use as source_path,
# so parsers use it instead of the temp file name.
Expand Down Expand Up @@ -422,6 +453,8 @@ async def _handle_download_link(
result.meta["url_type"] = f"download_{file_type}"
return result

except PermissionDeniedError:
raise
except Exception as e:
return create_parse_result(
root=ResourceNode(type=NodeType.ROOT, content_path=None),
Expand Down Expand Up @@ -457,6 +490,8 @@ async def _handle_code_repository(

return result

except PermissionDeniedError:
raise
except Exception as e:
return create_parse_result(
root=ResourceNode(type=NodeType.ROOT, content_path=None),
Expand Down Expand Up @@ -499,7 +534,7 @@ async def _parse_local_file(self, path: Path, start_time: float, **kwargs) -> Pa
warnings=[f"Failed to read HTML: {e}"],
)

async def _fetch_html(self, url: str) -> str:
async def _fetch_html(self, url: str, request_validator=None) -> str:
"""
Fetch HTML content from URL.

Expand All @@ -514,7 +549,16 @@ async def _fetch_html(self, url: str) -> str:
"""
httpx = lazy_import("httpx")

async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=True) as client:
client_kwargs = {
"timeout": self.timeout,
"follow_redirects": True,
}
event_hooks = build_httpx_request_validation_hooks(request_validator)
if event_hooks:
client_kwargs["event_hooks"] = event_hooks
client_kwargs["trust_env"] = False

async with httpx.AsyncClient(**client_kwargs) as client:
headers = {"User-Agent": self.user_agent}
response = await client.get(url, headers=headers)
response.raise_for_status()
Expand Down Expand Up @@ -591,7 +635,7 @@ async def _save_downloaded_text(
result.temp_dir_path = temp_uri
return result

async def _download_file(self, url: str) -> str:
async def _download_file(self, url: str, request_validator=None) -> str:
"""
Download file from URL to temporary location.

Expand Down Expand Up @@ -619,7 +663,16 @@ async def _download_file(self, url: str) -> str:
temp_file.close()

# Download
async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=True) as client:
client_kwargs = {
"timeout": self.timeout,
"follow_redirects": True,
}
event_hooks = build_httpx_request_validation_hooks(request_validator)
if event_hooks:
client_kwargs["event_hooks"] = event_hooks
client_kwargs["trust_env"] = False

async with httpx.AsyncClient(**client_kwargs) as client:
headers = {"User-Agent": self.user_agent}
response = await client.get(url, headers=headers)
response.raise_for_status()
Expand Down
2 changes: 2 additions & 0 deletions openviking/server/local_input_guard.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import re
from pathlib import Path

from openviking.utils.network_guard import ensure_public_remote_target
from openviking_cli.exceptions import PermissionDeniedError

_WINDOWS_DRIVE_RE = re.compile(r"^[A-Za-z]:[\\/]")
Expand Down Expand Up @@ -37,6 +38,7 @@ def require_remote_resource_source(source: str) -> str:
"HTTP server only accepts remote resource URLs or temp-uploaded files; "
"direct host filesystem paths are not allowed."
)
ensure_public_remote_target(source)
return source


Expand Down
1 change: 1 addition & 0 deletions openviking/server/routers/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ async def add_resource(
wait=request.wait,
timeout=request.timeout,
allow_local_path_resolution=allow_local_path_resolution,
enforce_public_remote_targets=True,
**kwargs,
),
)
Expand Down
11 changes: 11 additions & 0 deletions openviking/service/resource_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
from typing import TYPE_CHECKING, Any, Dict, List, Optional

from openviking.server.identity import RequestContext
from openviking.server.local_input_guard import (
is_remote_resource_source,
require_remote_resource_source,
)
from openviking.storage import VikingDBManager
from openviking.storage.queuefs import get_queue_manager
from openviking.storage.viking_fs import VikingFS
Expand All @@ -21,6 +25,7 @@
register_wait_telemetry,
unregister_wait_telemetry,
)
from openviking.utils.network_guard import ensure_public_remote_target
from openviking.utils.resource_processor import ResourceProcessor
from openviking.utils.skill_processor import SkillProcessor
from openviking_cli.exceptions import (
Expand Down Expand Up @@ -110,6 +115,7 @@ async def add_resource(
watch_interval: float = 0,
skip_watch_management: bool = False,
allow_local_path_resolution: bool = True,
enforce_public_remote_targets: bool = False,
**kwargs,
) -> Dict[str, Any]:
"""Add resource to OpenViking (only supports resources scope).
Expand Down Expand Up @@ -137,6 +143,8 @@ async def add_resource(
creating a new one.
skip_watch_management: If True, skip watch task management (used by scheduler to
avoid recursive watch task creation during scheduled execution)
enforce_public_remote_targets: When True, reject non-public remote hosts and
validate each outbound HTTP request URL during fetch.
**kwargs: Extra options forwarded to the parser chain

Returns:
Expand Down Expand Up @@ -178,6 +186,9 @@ async def add_resource(
raise InvalidArgumentError(
"watch_interval > 0 requires 'to' to be specified (target URI to watch)"
)
if enforce_public_remote_targets and is_remote_resource_source(path):
path = require_remote_resource_source(path)
kwargs.setdefault("request_validator", ensure_public_remote_target)

result = await self._resource_processor.process_resource(
path=path,
Expand Down
6 changes: 3 additions & 3 deletions openviking/utils/media_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,19 +105,19 @@ async def _process_url(self, url: str, instruction: str, **kwargs) -> ParseResul
"FeishuParser not available. "
"Install lark-oapi: pip install 'openviking[bot-feishu]'"
)
return await parser.parse(url, instruction=instruction)
return await parser.parse(url, instruction=instruction, **kwargs)

# Route git protocols and repo URLs to CodeRepositoryParser
if url.startswith(("git@", "git://", "ssh://")) or is_git_repo_url(url):
from openviking.parse.parsers.code.code import CodeRepositoryParser

parser = CodeRepositoryParser()
return await parser.parse(url, instruction=instruction)
return await parser.parse(url, instruction=instruction, **kwargs)

from openviking.parse.parsers.html import HTMLParser

parser = HTMLParser()
return await parser.parse(url, instruction=instruction)
return await parser.parse(url, instruction=instruction, **kwargs)

@staticmethod
def _is_feishu_url(url: str) -> bool:
Expand Down
102 changes: 102 additions & 0 deletions openviking/utils/network_guard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
# SPDX-License-Identifier: AGPL-3.0
"""Network target validation helpers for server-side remote fetches."""

from __future__ import annotations

import ipaddress
import socket
from collections.abc import Callable
from typing import Optional
from urllib.parse import urlparse

from openviking_cli.exceptions import PermissionDeniedError

# Signature shared by request validators: the callable receives the outbound
# URL as a string and returns nothing. ensure_public_remote_target in this
# module has this shape and rejects a target by raising PermissionDeniedError.
RequestValidator = Callable[[str], None]

# Hostnames that ensure_public_remote_target rejects by name, before any
# address resolution happens. Entries are lower-case without a trailing dot
# because they are compared against the output of _normalize_host.
_LOCAL_HOSTNAMES = {
"localhost",
"localhost.localdomain",
}


def extract_remote_host(source: str) -> Optional[str]:
    """Extract the destination host from a remote resource source.

    Two shapes are recognised: scp-like git remotes (``git@host:path``,
    including bracketed IPv6 literals such as ``git@[2001:db8::1]:path``) and
    anything ``urllib.parse.urlparse`` can split into a network location.

    Args:
        source: Remote resource locator supplied by the caller.

    Returns:
        The host with surrounding whitespace and square brackets removed, or
        ``None`` when no host can be determined (including sources that
        ``urlparse`` rejects as malformed). Degenerate input such as
        ``git@:path`` yields an empty string, so callers should treat any
        falsy result as "no host".
    """
    if source.startswith("git@"):
        rest = source[4:]
        if rest.startswith("["):
            # Bracketed IPv6 literal: the host ends at the closing bracket,
            # not at the first colon, and must be followed by the path colon.
            closing = rest.find("]")
            if closing == -1 or not rest[closing + 1 :].startswith(":"):
                return None
            return rest[1:closing].strip()
        host, separator, _ = rest.partition(":")
        if not separator:
            return None
        return host.strip().strip("[]")

    try:
        parsed = urlparse(source)
    except ValueError:
        # urlparse raises on malformed bracketed netlocs (e.g. "http://[::1");
        # report that as "no host" so callers can reject it uniformly.
        return None
    if parsed.hostname is None:
        return None
    return parsed.hostname.strip().strip("[]")


def _normalize_host(host: str) -> str:
    """Return *host* in lower case with any trailing dots removed."""
    without_trailing_dots = host.rstrip(".")
    return without_trailing_dots.lower()


def _resolve_host_addresses(host: str) -> set[str]:
    """Resolve *host* to the set of IPv4/IPv6 address strings it maps to.

    Args:
        host: Hostname or IP literal to look up with ``socket.getaddrinfo``.

    Returns:
        The distinct addresses found, with any IPv6 zone suffix (the part
        from ``%`` onwards) removed. An empty set is returned when the lookup
        raises ``OSError`` (which covers ``socket.gaierror``) or
        ``UnicodeError``.
    """
    try:
        records = socket.getaddrinfo(host, None, type=socket.SOCK_STREAM)
    except (OSError, UnicodeError):
        return set()

    ip_families = (socket.AF_INET, socket.AF_INET6)
    # Each record is (family, type, proto, canonname, sockaddr); the address
    # string is the first element of sockaddr.
    return {
        record[4][0].partition("%")[0]
        for record in records
        if record[0] in ip_families
    }


def _is_public_ip(address: str) -> bool:
    """Return whether *address* is a globally reachable IP address.

    IPv4-mapped IPv6 addresses (``::ffff:a.b.c.d``) are judged by the IPv4
    address they embed. That is the address actually reached, and checking it
    explicitly keeps the verdict independent of whether the running
    interpreter's ``is_global`` looks through the mapping.

    Args:
        address: Textual IPv4 or IPv6 address.

    Returns:
        ``True`` when the (unwrapped) address reports ``is_global``; ``False``
        for every other address and for text that is not a valid IP address.
    """
    try:
        parsed = ipaddress.ip_address(address)
    except ValueError:
        return False

    # Only IPv6Address exposes ipv4_mapped; it is None unless the address
    # lies in ::ffff:0:0/96.
    embedded_v4 = getattr(parsed, "ipv4_mapped", None)
    if embedded_v4 is not None:
        parsed = embedded_v4
    return parsed.is_global


def ensure_public_remote_target(source: str) -> None:
    """Reject loopback, link-local, private, and other non-public targets.

    The host is taken from *source*, checked by name against the known local
    hostnames (and any ``*.localhost`` name), then resolved; every resolved
    address must be public.

    Args:
        source: Remote resource locator whose destination host is checked.

    Raises:
        PermissionDeniedError: If no host can be extracted, the host is a
            local hostname, or any resolved address is not public.

    Note:
        A host that resolves to no addresses at all is accepted here; nothing
        in this function rejects it.
    """
    host = extract_remote_host(source)
    if not host:
        raise PermissionDeniedError(
            "HTTP server only accepts remote resource URLs with a valid destination host."
        )

    hostname = _normalize_host(host)
    names_local_machine = hostname in _LOCAL_HOSTNAMES or hostname.endswith(".localhost")
    if names_local_machine:
        raise PermissionDeniedError(
            "HTTP server only accepts public remote resource targets; "
            "loopback, link-local, private, and otherwise non-public destinations are not allowed."
        )

    # Resolution uses the host as extracted (not the normalized form).
    rejected = [
        candidate
        for candidate in _resolve_host_addresses(host)
        if not _is_public_ip(candidate)
    ]
    if rejected:
        # Report the lexicographically smallest offender so the message is
        # deterministic regardless of set iteration order.
        offender = min(rejected)
        raise PermissionDeniedError(
            "HTTP server only accepts public remote resource targets; "
            f"host '{host}' resolves to non-public address '{offender}'."
        )


def build_httpx_request_validation_hooks(
    request_validator: Optional[RequestValidator],
) -> Optional[dict[str, list[Callable]]]:
    """Build httpx request hooks that validate every outbound request URL.

    Args:
        request_validator: Callable invoked with each outbound request URL as
            a string; it rejects a request by raising. ``None`` disables
            validation.

    Returns:
        ``None`` when no validator is given, otherwise a mapping with a single
        ``"request"`` entry holding one async hook, suitable for an httpx
        client's ``event_hooks`` argument.

    Note:
        The validator runs in a worker thread via ``asyncio.to_thread`` so a
        synchronous validator cannot stall the event loop. Exceptions it
        raises propagate to the caller of the hook unchanged.
    """
    if request_validator is None:
        return None

    # Imported here so the hook builder carries its own dependency.
    import asyncio

    async def _validate_request(request) -> None:
        # Validators such as ensure_public_remote_target perform a blocking
        # socket.getaddrinfo lookup; keep that off the event loop thread.
        await asyncio.to_thread(request_validator, str(request.url))

    return {"request": [_validate_request]}
3 changes: 3 additions & 0 deletions openviking/utils/resource_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from openviking.telemetry import get_current_telemetry
from openviking.utils.embedding_utils import index_resource
from openviking.utils.summarizer import Summarizer
from openviking_cli.exceptions import OpenVikingError
from openviking_cli.utils import get_logger
from openviking_cli.utils.storage import StoragePath

Expand Down Expand Up @@ -161,6 +162,8 @@ async def process_resource(
)
telemetry.set("resource.parse.warnings_count", len(parse_result.warnings or []))

except OpenVikingError:
raise
except Exception as e:
result["status"] = "error"
result["errors"].append(f"Parse error: {e}")
Expand Down
Loading
Loading