Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/network-isolation-for-kubernetes.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,16 @@ sandbox = await Sandbox.create(
| Cluster CIDR exposure | Not exposed to users | Must be exposed to users |
| Use case | Platform-wide default isolation, recommended | Whitelist mode, fine-grained control |

## Runtime Compatibility

Both Approach 1 (`deny.always` via egress sidecar) and Approach 2 (per-sandbox `network_policy`) depend on the egress sidecar, which uses an iptables `nat` table REDIRECT rule for DNS interception. This works with `runc` (default) and all Kata Containers variants (`kata-qemu`, `kata-clh`, `kata-fc`), but **not with gVisor** — gVisor's netstack does not implement the `nat` table.

If you need both gVisor's syscall isolation and FQDN egress control:
- Use `kata-qemu` instead — it provides comparable security isolation and supports the egress sidecar.
- Alternatively, use a CNI-level FQDN policy (e.g., Cilium `toFQDNs`) for network isolation alongside gVisor.

See the [Compatibility Matrix](secure-container.md#compatibility-matrix) in the Secure Container Runtime Guide for the full feature support table.

## Recommendations

1. **Default full isolation**: use `deny.always` to block the cluster's internal CIDR ranges as the platform's default security baseline.
Expand Down
21 changes: 21 additions & 0 deletions docs/secure-container.md
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,26 @@ sudo containerd config dump
sudo systemctl restart containerd
```

#### 5. Egress Sidecar Incompatible with gVisor

**Error**: Sandbox pods enter `CrashLoopBackOff`, and the egress container log shows:
```
iptables: Failed to initialize nft: Protocol not supported
```
Or with iptables-legacy:
```
iptables v1.8.9 (legacy): can't initialize iptables table 'nat': Table does not exist (do you need to insmod?)
```

**Cause**: gVisor's netstack implements the `filter` and `mangle` iptables tables but does not implement the `nat` table. The egress sidecar uses a REDIRECT rule in the `nat` table to intercept DNS queries (port 53 → 15353), so it cannot start under gVisor. This is an upstream gVisor limitation ([gvisor#170](https://github.com/google/gvisor/issues/170)).

**Solution**:
- Use `secure_runtime.type = "kata"` with `k8s_runtime_class = "kata-qemu"` — Kata provides a full Linux kernel per pod, so the `nat` table is available and the egress sidecar works unchanged.
- Use a CNI-level FQDN policy (e.g., Cilium `toFQDNs`) instead of the egress sidecar for network isolation under gVisor.
- Remove `network_policy` from sandbox creation requests if egress control is not required.

> **Note**: The server validates this combination at request time and returns HTTP 400 with a clear error message when `secure_runtime.type = "gvisor"` and `network_policy` are used together.

### Compatibility Matrix

| Feature | runc | gVisor | Kata (QEMU) | Kata (CLH) | Kata (FC) |
Expand All @@ -724,6 +744,7 @@ sudo systemctl restart containerd
| Privileged Mode | Yes | No | Yes | Yes | No |
| Docker Volume | Yes | Yes | Yes | Yes | Yes |
| Systemd | Yes | No | Yes | Yes | No |
| iptables `nat` table (egress sidecar) | Yes | **No** | Yes | Yes | Yes |

### Getting Help

Expand Down
74 changes: 74 additions & 0 deletions kubernetes/test/e2e_runtime/gvisor/gvisor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,80 @@ spec:
})
})

// Verifies the documented limitation that the egress sidecar cannot start
// under gVisor: the sidecar container must terminate with a non-zero exit code
// and its logs must point at the iptables/nft initialisation failure.
Context("gVisor + egress sidecar incompatibility", func() {
	var podName string

	BeforeEach(func() {
		// Nanosecond suffix keeps pod names unique across repeated runs.
		podName = fmt.Sprintf("test-gvisor-egress-%d", time.Now().UnixNano())
	})

	AfterEach(func() {
		By("cleaning up Pod")
		if podName != "" {
			// Best-effort cleanup: errors are deliberately ignored so a missing
			// pod does not fail the spec.
			_, _ = runKubectl("delete", "pod", podName, "-n", testNamespace,
				"--ignore-not-found=true", "--grace-period=0", "--force")
		}
	})

	It("should fail to start the egress sidecar under gVisor due to missing iptables nat table", func() {
		egressImage := os.Getenv("EGRESS_IMG")
		if egressImage == "" {
			Skip("EGRESS_IMG not set; skipping gVisor + egress incompatibility test")
		}

		By("creating a Pod with gVisor runtimeClassName and egress sidecar")
		// restartPolicy: Never keeps the failed sidecar in the "terminated"
		// state so its exit code can be read below.
		podYAML := fmt.Sprintf(`apiVersion: v1
kind: Pod
metadata:
  name: %s
  namespace: %s
spec:
  runtimeClassName: %s
  restartPolicy: Never
  containers:
  - name: workload
    image: %s
    command: ["sleep", "300"]
  - name: egress
    image: %s
    securityContext:
      capabilities:
        add: ["NET_ADMIN"]
    env:
    - name: OPENSANDBOX_EGRESS_MODE
      value: "dns+nft"
    - name: OPENSANDBOX_EGRESS_RULES
      value: '{"defaultAction":"deny","egress":[]}'
`, podName, testNamespace, RuntimeClassName, utils.SandboxImage, egressImage)

		podFile := filepath.Join("/tmp", fmt.Sprintf("test-pod-%s.yaml", podName))
		err := os.WriteFile(podFile, []byte(podYAML), 0644)
		Expect(err).NotTo(HaveOccurred())
		defer os.Remove(podFile)

		_, err = runKubectl("apply", "-f", podFile)
		Expect(err).NotTo(HaveOccurred(), "Failed to create Pod")

		By("verifying the egress container terminates with an error")
		// Explicit polling interval: every poll shells out to kubectl, so do
		// not rely on Gomega's much shorter default interval.
		Eventually(func(g Gomega) {
			output, err := runKubectl("get", "pod", podName, "-n", testNamespace,
				"-o", "jsonpath={.status.containerStatuses[?(@.name==\"egress\")].state.terminated.exitCode}")
			g.Expect(err).NotTo(HaveOccurred())
			g.Expect(output).NotTo(BeEmpty(), "egress container should have terminated")
			g.Expect(output).NotTo(Equal("0"), "egress container should exit with non-zero code")
		}, 2*time.Minute, 2*time.Second).Should(Succeed())

		By("verifying egress logs mention iptables nat failure")
		output, err := runKubectl("logs", podName, "-n", testNamespace, "-c", "egress")
		Expect(err).NotTo(HaveOccurred())
		Expect(output).To(SatisfyAny(
			ContainSubstring("Failed to initialize nft"),
			ContainSubstring("can't initialize iptables table"),
			// Match "nat" only as a whole word; a plain substring check would
			// also accept unrelated text such as "destination".
			MatchRegexp(`\bnat\b`),
		))
	})
})

Context("Pool with gVisor RuntimeClass", func() {
var poolName string
var batchSandboxName string
Expand Down
3 changes: 2 additions & 1 deletion server/opensandbox_server/services/docker/networking.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
build_egress_auth_headers,
merge_endpoint_headers,
)
from opensandbox_server.services.validators import ensure_egress_configured
from opensandbox_server.services.validators import ensure_egress_configured, ensure_egress_runtime_compatible

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -139,6 +139,7 @@ def _ensure_network_policy_support(self, request) -> None:

# Common validation: egress.image must be configured
ensure_egress_configured(request.network_policy, self.app_config.egress)
ensure_egress_runtime_compatible(request.network_policy, self.app_config.secure_runtime)

def _ensure_secure_access_support(self, request) -> None:
"""Validate that secure access can be honored under the current Docker runtime."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from opensandbox_server.services.helpers import format_ingress_endpoint
from opensandbox_server.api.schema import Endpoint, ImageSpec, NetworkPolicy, PlatformSpec, Volume
from opensandbox_server.services.k8s.agent_sandbox_template import AgentSandboxTemplateManager
from opensandbox_server.services.validators import ensure_egress_runtime_compatible
from opensandbox_server.services.k8s.client import K8sClient
from opensandbox_server.services.k8s.egress_helper import apply_egress_to_spec
from opensandbox_server.services.k8s.provider_common import (
Expand Down Expand Up @@ -196,8 +197,12 @@ def create_workload(
sandbox["spec"].pop("shutdownTime", None)
else:
sandbox["spec"]["shutdownTime"] = expires_at.isoformat()
merged_pod_spec = sandbox.get("spec", {}).get("podTemplate", {}).get("spec", {})
ensure_egress_runtime_compatible(
network_policy,
effective_runtime_class=merged_pod_spec.get("runtimeClassName"),
)
if platform is not None:
merged_pod_spec = sandbox.get("spec", {}).get("podTemplate", {}).get("spec", {})
WorkloadProvider.ensure_platform_compatible_with_affinity(merged_pod_spec, platform)

created = self.k8s_client.create_custom_object(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from opensandbox_server.services.k8s.batchsandbox_template import BatchSandboxTemplateManager
from opensandbox_server.services.k8s.client import K8sClient
from opensandbox_server.services.k8s.egress_helper import apply_egress_to_spec
from opensandbox_server.services.validators import ensure_egress_runtime_compatible
from opensandbox_server.services.k8s.provider_common import (
DEFAULT_ENTRYPOINT,
_build_execd_init_container,
Expand Down Expand Up @@ -260,8 +261,12 @@ def create_workload(
else:
batchsandbox["spec"]["expireTime"] = expires_at.isoformat()
self._merge_pod_spec_extras(batchsandbox, extra_volumes, extra_mounts)
merged_pod_spec = batchsandbox.get("spec", {}).get("template", {}).get("spec", {})
ensure_egress_runtime_compatible(
network_policy,
effective_runtime_class=merged_pod_spec.get("runtimeClassName"),
)
if platform is not None and not windows_profile:
merged_pod_spec = batchsandbox.get("spec", {}).get("template", {}).get("spec", {})
WorkloadProvider.ensure_platform_compatible_with_affinity(merged_pod_spec, platform)

created = self.k8s_client.create_custom_object(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
from opensandbox_server.services.validators import (
ensure_entrypoint,
ensure_egress_configured,
ensure_egress_runtime_compatible,
ensure_future_expiration,
ensure_metadata_labels,
ensure_platform_valid,
Expand Down Expand Up @@ -254,10 +255,12 @@ async def _wait_for_sandbox_ready(
def _ensure_network_policy_support(self, request: CreateSandboxRequest) -> None:
"""
Validate that network policy can be honored under the current runtime config.

This validates that egress.image is configured when network_policy is provided.

This validates that egress.image is configured when network_policy is provided,
and that the secure runtime supports the iptables nat table needed by the sidecar.
"""
ensure_egress_configured(request.network_policy, self.app_config.egress)
ensure_egress_runtime_compatible(request.network_policy, self.app_config.secure_runtime)
Comment thread
Pangjiping marked this conversation as resolved.

def _ensure_image_auth_support(self, request: CreateSandboxRequest) -> None:
"""
Expand Down
18 changes: 17 additions & 1 deletion server/opensandbox_server/services/runtime_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ async def validate_secure_runtime_on_startup(
return

if config.runtime.type == "docker":
await _validate_docker_runtime(resolver, docker_client)
await _validate_docker_runtime(resolver, docker_client, config)
elif config.runtime.type == "kubernetes":
await _validate_k8s_runtime_class(resolver, k8s_client, config)
else:
Expand All @@ -169,6 +169,7 @@ async def validate_secure_runtime_on_startup(
async def _validate_docker_runtime(
resolver: SecureRuntimeResolver,
docker_client: Optional["DockerClient"],
config: "AppConfig",
) -> None:
"""Validate that the Docker OCI runtime exists."""
runtime_name = resolver.get_docker_runtime()
Expand Down Expand Up @@ -210,6 +211,8 @@ async def _validate_docker_runtime(
logger.error("Failed to validate Docker runtime: %s", exc)
raise

_warn_gvisor_egress_incompatibility(config)


async def _validate_k8s_runtime_class(
resolver: SecureRuntimeResolver,
Expand Down Expand Up @@ -249,6 +252,19 @@ async def _validate_k8s_runtime_class(
logger.error("Failed to validate RuntimeClass: %s", exc)
raise

_warn_gvisor_egress_incompatibility(config)


def _warn_gvisor_egress_incompatibility(config: "AppConfig") -> None:
    """
    Emit a startup warning if gVisor and an egress sidecar image are both configured.

    Nothing is logged unless ``config.secure_runtime.type`` is ``"gvisor"`` and
    ``config.egress.image`` is set. This function only warns; it never raises.
    """
    # ``egress`` may be missing or unset on the config, hence the getattr.
    egress_cfg = getattr(config, "egress", None)
    sidecar_image = egress_cfg.image if egress_cfg else None

    secure_runtime = config.secure_runtime
    if not secure_runtime or secure_runtime.type != "gvisor":
        return
    if not sidecar_image:
        return

    logger.warning(
        "gVisor runtime is configured with egress sidecar image. "
        "The egress sidecar's iptables nat-based DNS redirect is incompatible with gVisor. "
        "Sandboxes created with network_policy will be rejected at creation time."
    )


__all__ = [
"SecureRuntimeResolver",
Expand Down
39 changes: 38 additions & 1 deletion server/opensandbox_server/services/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

if TYPE_CHECKING:
from opensandbox_server.api.schema import NetworkPolicy, OSSFS, PlatformSpec, Volume
from opensandbox_server.config import EgressConfig
from opensandbox_server.config import EgressConfig, SecureRuntimeConfig


def ensure_entrypoint(entrypoint: Sequence[str]) -> None:
Expand Down Expand Up @@ -600,6 +600,43 @@ def ensure_egress_configured(
)


# Runtime identifiers that cannot host the egress sidecar.
_GVISOR_NAT_INCOMPATIBLE_RUNTIMES = frozenset({"gvisor"})


def ensure_egress_runtime_compatible(
    network_policy: Optional["NetworkPolicy"],
    secure_runtime: Optional["SecureRuntimeConfig"] = None,
    effective_runtime_class: Optional[str] = None,
) -> None:
    """
    Refuse a network policy when the target runtime cannot run the egress sidecar.

    The egress sidecar redirects DNS with an iptables nat-table REDIRECT rule
    (port 53), and gVisor's netstack does not implement the nat table.

    Args:
        network_policy: Requested policy; when falsy, nothing is checked.
        secure_runtime: Secure runtime configuration. Its ``type`` takes
            precedence whenever it is set.
        effective_runtime_class: Runtime class name used as a fallback when no
            secure runtime type is available.

    Raises:
        HTTPException: 400 Bad Request when the resolved runtime is listed in
            ``_GVISOR_NAT_INCOMPATIBLE_RUNTIMES``.
    """
    if not network_policy:
        return

    # Configured runtime type wins; the runtime class is only a fallback.
    configured_type = secure_runtime.type if secure_runtime is not None else None
    runtime_type = configured_type or effective_runtime_class

    if not runtime_type or runtime_type not in _GVISOR_NAT_INCOMPATIBLE_RUNTIMES:
        return

    message = (
        f"networkPolicy is not compatible with runtime '{runtime_type}': "
        "gVisor does not support the iptables nat table required by the egress sidecar. "
        "Use a compatible runtime (e.g. kata) or remove networkPolicy."
    )
    raise HTTPException(
        status_code=status.HTTP_400_BAD_REQUEST,
        detail={
            "code": SandboxErrorCodes.INVALID_PARAMETER,
            "message": message,
        },
    )


def ensure_volumes_valid(
volumes: Optional[List["Volume"]],
allowed_host_prefixes: Optional[List[str]] = None,
Expand Down
57 changes: 55 additions & 2 deletions server/tests/test_runtime_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest.mock
from types import SimpleNamespace
from unittest.mock import MagicMock

import pytest
from kubernetes.client.exceptions import ApiException

from opensandbox_server.config import AppConfig, RuntimeConfig, SecureRuntimeConfig
from opensandbox_server.config import AppConfig, EgressConfig, RuntimeConfig, SecureRuntimeConfig
from opensandbox_server.services.runtime_resolver import (
SecureRuntimeResolver,
validate_secure_runtime_on_startup,
)


def _config(runtime_type: str = "docker", secure_runtime=None):
def _config(runtime_type: str = "docker", secure_runtime=None, egress=None):
return AppConfig(
runtime=RuntimeConfig(type=runtime_type, execd_image="opensandbox/execd:test"),
secure_runtime=secure_runtime,
egress=egress,
)


Expand Down Expand Up @@ -171,3 +173,54 @@ async def test_validate_secure_runtime_skips_unknown_runtime_type() -> None:
)

await validate_secure_runtime_on_startup(config)


@pytest.mark.asyncio
async def test_validate_startup_warns_gvisor_with_egress() -> None:
    """Startup validation logs exactly one warning for gVisor plus an egress image."""
    gvisor_runtime = SecureRuntimeConfig(type="gvisor", k8s_runtime_class="gvisor")
    sidecar = EgressConfig(image="opensandbox/egress:latest")
    app_config = _config(
        runtime_type="kubernetes",
        secure_runtime=gvisor_runtime,
        egress=sidecar,
    )
    fake_k8s = MagicMock()
    logger_patch = unittest.mock.patch(
        "opensandbox_server.services.runtime_resolver.logger"
    )

    with logger_patch as patched_logger:
        await validate_secure_runtime_on_startup(app_config, k8s_client=fake_k8s)

    patched_logger.warning.assert_called_once()
    # The first positional argument of the warning call is the message text.
    positional_args, _ = patched_logger.warning.call_args
    assert "iptables nat" in positional_args[0]


@pytest.mark.asyncio
async def test_validate_startup_no_warn_gvisor_without_egress() -> None:
    """No warning is logged for gVisor when no egress config is supplied."""
    gvisor_runtime = SecureRuntimeConfig(type="gvisor", k8s_runtime_class="gvisor")
    app_config = _config(runtime_type="kubernetes", secure_runtime=gvisor_runtime)
    fake_k8s = MagicMock()
    logger_patch = unittest.mock.patch(
        "opensandbox_server.services.runtime_resolver.logger"
    )

    with logger_patch as patched_logger:
        await validate_secure_runtime_on_startup(app_config, k8s_client=fake_k8s)

    patched_logger.warning.assert_not_called()


@pytest.mark.asyncio
async def test_validate_startup_no_warn_kata_with_egress() -> None:
    """No warning is logged when Kata is configured alongside an egress image."""
    kata_runtime = SecureRuntimeConfig(type="kata", k8s_runtime_class="kata-qemu")
    sidecar = EgressConfig(image="opensandbox/egress:latest")
    app_config = _config(
        runtime_type="kubernetes",
        secure_runtime=kata_runtime,
        egress=sidecar,
    )
    fake_k8s = MagicMock()
    logger_patch = unittest.mock.patch(
        "opensandbox_server.services.runtime_resolver.logger"
    )

    with logger_patch as patched_logger:
        await validate_secure_runtime_on_startup(app_config, k8s_client=fake_k8s)

    patched_logger.warning.assert_not_called()
Loading
Loading