diff --git a/docs/network-isolation-for-kubernetes.md b/docs/network-isolation-for-kubernetes.md index 512dea743..0e58d903c 100644 --- a/docs/network-isolation-for-kubernetes.md +++ b/docs/network-isolation-for-kubernetes.md @@ -140,6 +140,16 @@ sandbox = await Sandbox.create( | Cluster CIDR exposure | Not exposed to users | Must be exposed to users | | Use case | Platform-wide default isolation, recommended | Whitelist mode, fine-grained control | +## Runtime Compatibility + +Both Approach 1 (`deny.always` via egress sidecar) and Approach 2 (per-sandbox `network_policy`) depend on the egress sidecar, which uses an iptables `nat` table REDIRECT rule for DNS interception. The sidecar works with `runc` (default) and all Kata Containers variants (`kata-qemu`, `kata-clh`, `kata-fc`), but **not with gVisor** — gVisor's netstack does not implement the `nat` table. + +If you need both strong runtime isolation (the reason to choose gVisor) and FQDN egress control, pick one of the following: +- Use `kata-qemu` instead of gVisor — it provides comparable security isolation and supports the egress sidecar. +- Alternatively, keep gVisor and enforce FQDN egress rules with a CNI-level policy (e.g., Cilium `toFQDNs`) instead of the egress sidecar. + +See the [Compatibility Matrix](secure-container.md#compatibility-matrix) in the Secure Container Runtime Guide for the full feature support table. + ## Recommendations 1. **Default full isolation**: use `deny.always` to block the cluster's internal CIDR ranges as the platform's default security baseline. diff --git a/docs/secure-container.md b/docs/secure-container.md index 032ac79ee..b9965c216 100644 --- a/docs/secure-container.md +++ b/docs/secure-container.md @@ -714,6 +714,26 @@ sudo containerd config dump sudo systemctl restart containerd ``` +#### 5. 
Egress Sidecar Incompatible with gVisor + +**Error**: Sandbox pods enter `CrashLoopBackOff` and the egress container logs: +``` +iptables: Failed to initialize nft: Protocol not supported +``` +Or, with `iptables-legacy`: +``` +iptables v1.8.9 (legacy): can't initialize iptables table 'nat': Table does not exist (do you need to insmod?) +``` + +**Cause**: gVisor's netstack implements the `filter` and `mangle` iptables tables but does not implement the `nat` table. The egress sidecar uses a REDIRECT rule in the `nat` table to intercept DNS queries (port 53 → 15353), so it cannot start under gVisor. This is an upstream gVisor limitation ([gvisor#170](https://github.com/google/gvisor/issues/170)). + +**Solution** (choose one): +- Use `secure_runtime.type = "kata"` with `k8s_runtime_class = "kata-qemu"` — Kata provides a full Linux kernel per pod, so the `nat` table is available and the egress sidecar works unchanged. +- Use a CNI-level FQDN policy (e.g., Cilium `toFQDNs`) instead of the egress sidecar for network isolation under gVisor. +- Remove `network_policy` from sandbox creation requests if egress control is not required. + +> **Note**: The server rejects this combination at request time: a sandbox creation request that includes `network_policy` while `secure_runtime.type = "gvisor"` is configured returns HTTP 400 with an explanatory error message. 
+ ### Compatibility Matrix | Feature | runc | gVisor | Kata (QEMU) | Kata (CLH) | Kata (FC) | @@ -724,6 +744,7 @@ sudo systemctl restart containerd | Privileged Mode | Yes | No | Yes | Yes | No | | Docker Volume | Yes | Yes | Yes | Yes | Yes | | Systemd | Yes | No | Yes | Yes | No | +| iptables `nat` table (egress sidecar) | Yes | **No** | Yes | Yes | Yes | ### Getting Help diff --git a/kubernetes/test/e2e_runtime/gvisor/gvisor_test.go b/kubernetes/test/e2e_runtime/gvisor/gvisor_test.go index 89fe218b2..1fdfe6f6e 100644 --- a/kubernetes/test/e2e_runtime/gvisor/gvisor_test.go +++ b/kubernetes/test/e2e_runtime/gvisor/gvisor_test.go @@ -124,6 +124,80 @@ spec: }) }) + Context("gVisor + egress sidecar incompatibility", func() { + var podName string + + BeforeEach(func() { + podName = fmt.Sprintf("test-gvisor-egress-%d", time.Now().UnixNano()) + }) + + AfterEach(func() { + By("cleaning up Pod") + if podName != "" { + _, _ = runKubectl("delete", "pod", podName, "-n", testNamespace, + "--ignore-not-found=true", "--grace-period=0", "--force") + } + }) + + It("should fail to start the egress sidecar under gVisor due to missing iptables nat table", func() { + egressImage := os.Getenv("EGRESS_IMG") + if egressImage == "" { + Skip("EGRESS_IMG not set; skipping gVisor + egress incompatibility test") + } + + By("creating a Pod with gVisor runtimeClassName and egress sidecar") + podYAML := fmt.Sprintf(`apiVersion: v1 +kind: Pod +metadata: + name: %s + namespace: %s +spec: + runtimeClassName: %s + restartPolicy: Never + containers: + - name: workload + image: %s + command: ["sleep", "300"] + - name: egress + image: %s + securityContext: + capabilities: + add: ["NET_ADMIN"] + env: + - name: OPENSANDBOX_EGRESS_MODE + value: "dns+nft" + - name: OPENSANDBOX_EGRESS_RULES + value: '{"defaultAction":"deny","egress":[]}' +`, podName, testNamespace, RuntimeClassName, utils.SandboxImage, egressImage) + + podFile := filepath.Join(os.TempDir(), fmt.Sprintf("test-pod-%s.yaml", podName)) // honor TMPDIR instead of hard-coding /tmp + err := 
os.WriteFile(podFile, []byte(podYAML), 0644) + Expect(err).NotTo(HaveOccurred()) + defer os.Remove(podFile) + + _, err = runKubectl("apply", "-f", podFile) + Expect(err).NotTo(HaveOccurred(), "Failed to create Pod") + + By("verifying the egress container terminates with an error") + Eventually(func(g Gomega) { + output, err := runKubectl("get", "pod", podName, "-n", testNamespace, + "-o", "jsonpath={.status.containerStatuses[?(@.name==\"egress\")].state.terminated.exitCode}") + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(output).NotTo(BeEmpty(), "egress container should have terminated") + g.Expect(output).NotTo(Equal("0"), "egress container should exit with non-zero code") + }, 2*time.Minute).Should(Succeed()) + + By("verifying egress logs mention iptables nat failure") + output, err := runKubectl("logs", podName, "-n", testNamespace, "-c", "egress") + Expect(err).NotTo(HaveOccurred()) + Expect(output).To(SatisfyAny( + ContainSubstring("Failed to initialize nft"), + ContainSubstring("can't initialize iptables table"), + MatchRegexp(`\bnat\b`), // whole word only; a bare "nat" substring also matches unrelated text such as "terminated" + )) + }) + }) + Context("Pool with gVisor RuntimeClass", func() { var poolName string var batchSandboxName string diff --git a/server/opensandbox_server/services/docker/networking.py b/server/opensandbox_server/services/docker/networking.py index 4823dfd7d..020c01a30 100644 --- a/server/opensandbox_server/services/docker/networking.py +++ b/server/opensandbox_server/services/docker/networking.py @@ -53,7 +53,7 @@ build_egress_auth_headers, merge_endpoint_headers, ) -from opensandbox_server.services.validators import ensure_egress_configured +from opensandbox_server.services.validators import ensure_egress_configured, ensure_egress_runtime_compatible logger = logging.getLogger(__name__) @@ -139,6 +139,7 @@ def _ensure_network_policy_support(self, request) -> None: # Common validation: egress.image must be configured ensure_egress_configured(request.network_policy, self.app_config.egress) + 
ensure_egress_runtime_compatible(request.network_policy, self.app_config.secure_runtime) def _ensure_secure_access_support(self, request) -> None: """Validate that secure access can be honored under the current Docker runtime.""" diff --git a/server/opensandbox_server/services/k8s/agent_sandbox_provider.py b/server/opensandbox_server/services/k8s/agent_sandbox_provider.py index 0247940ca..32b0bcb5e 100644 --- a/server/opensandbox_server/services/k8s/agent_sandbox_provider.py +++ b/server/opensandbox_server/services/k8s/agent_sandbox_provider.py @@ -27,6 +27,7 @@ from opensandbox_server.services.helpers import format_ingress_endpoint from opensandbox_server.api.schema import Endpoint, ImageSpec, NetworkPolicy, PlatformSpec, Volume from opensandbox_server.services.k8s.agent_sandbox_template import AgentSandboxTemplateManager +from opensandbox_server.services.validators import ensure_egress_runtime_compatible from opensandbox_server.services.k8s.client import K8sClient from opensandbox_server.services.k8s.egress_helper import apply_egress_to_spec from opensandbox_server.services.k8s.provider_common import ( @@ -196,8 +197,12 @@ def create_workload( sandbox["spec"].pop("shutdownTime", None) else: sandbox["spec"]["shutdownTime"] = expires_at.isoformat() + merged_pod_spec = sandbox.get("spec", {}).get("podTemplate", {}).get("spec", {}) + ensure_egress_runtime_compatible( + network_policy, + effective_runtime_class=merged_pod_spec.get("runtimeClassName"), + ) if platform is not None: - merged_pod_spec = sandbox.get("spec", {}).get("podTemplate", {}).get("spec", {}) WorkloadProvider.ensure_platform_compatible_with_affinity(merged_pod_spec, platform) created = self.k8s_client.create_custom_object( diff --git a/server/opensandbox_server/services/k8s/batchsandbox_provider.py b/server/opensandbox_server/services/k8s/batchsandbox_provider.py index 2fba857b8..06575db7b 100644 --- a/server/opensandbox_server/services/k8s/batchsandbox_provider.py +++ 
b/server/opensandbox_server/services/k8s/batchsandbox_provider.py @@ -38,6 +38,7 @@ from opensandbox_server.services.k8s.batchsandbox_template import BatchSandboxTemplateManager from opensandbox_server.services.k8s.client import K8sClient from opensandbox_server.services.k8s.egress_helper import apply_egress_to_spec +from opensandbox_server.services.validators import ensure_egress_runtime_compatible from opensandbox_server.services.k8s.provider_common import ( DEFAULT_ENTRYPOINT, _build_execd_init_container, @@ -260,8 +261,12 @@ def create_workload( else: batchsandbox["spec"]["expireTime"] = expires_at.isoformat() self._merge_pod_spec_extras(batchsandbox, extra_volumes, extra_mounts) + merged_pod_spec = batchsandbox.get("spec", {}).get("template", {}).get("spec", {}) + ensure_egress_runtime_compatible( + network_policy, + effective_runtime_class=merged_pod_spec.get("runtimeClassName"), + ) if platform is not None and not windows_profile: - merged_pod_spec = batchsandbox.get("spec", {}).get("template", {}).get("spec", {}) WorkloadProvider.ensure_platform_compatible_with_affinity(merged_pod_spec, platform) created = self.k8s_client.create_custom_object( diff --git a/server/opensandbox_server/services/k8s/kubernetes_service.py b/server/opensandbox_server/services/k8s/kubernetes_service.py index 37e0710ee..02e009c9e 100644 --- a/server/opensandbox_server/services/k8s/kubernetes_service.py +++ b/server/opensandbox_server/services/k8s/kubernetes_service.py @@ -78,6 +78,7 @@ from opensandbox_server.services.validators import ( ensure_entrypoint, ensure_egress_configured, + ensure_egress_runtime_compatible, ensure_future_expiration, ensure_metadata_labels, ensure_platform_valid, @@ -254,10 +255,12 @@ async def _wait_for_sandbox_ready( def _ensure_network_policy_support(self, request: CreateSandboxRequest) -> None: """ Validate that network policy can be honored under the current runtime config. 
- - This validates that egress.image is configured when network_policy is provided. + + This validates that egress.image is configured when network_policy is provided, + and that the secure runtime supports the iptables nat table needed by the sidecar. """ ensure_egress_configured(request.network_policy, self.app_config.egress) + ensure_egress_runtime_compatible(request.network_policy, self.app_config.secure_runtime) def _ensure_image_auth_support(self, request: CreateSandboxRequest) -> None: """ diff --git a/server/opensandbox_server/services/runtime_resolver.py b/server/opensandbox_server/services/runtime_resolver.py index 240ef6b3a..c4efbda2c 100644 --- a/server/opensandbox_server/services/runtime_resolver.py +++ b/server/opensandbox_server/services/runtime_resolver.py @@ -156,7 +156,7 @@ async def validate_secure_runtime_on_startup( return if config.runtime.type == "docker": - await _validate_docker_runtime(resolver, docker_client) + await _validate_docker_runtime(resolver, docker_client, config) elif config.runtime.type == "kubernetes": await _validate_k8s_runtime_class(resolver, k8s_client, config) else: @@ -169,6 +169,7 @@ async def validate_secure_runtime_on_startup( async def _validate_docker_runtime( resolver: SecureRuntimeResolver, docker_client: Optional["DockerClient"], + config: "AppConfig", ) -> None: """Validate that the Docker OCI runtime exists.""" runtime_name = resolver.get_docker_runtime() @@ -210,6 +211,8 @@ async def _validate_docker_runtime( logger.error("Failed to validate Docker runtime: %s", exc) raise + _warn_gvisor_egress_incompatibility(config) + async def _validate_k8s_runtime_class( resolver: SecureRuntimeResolver, @@ -249,6 +252,19 @@ async def _validate_k8s_runtime_class( logger.error("Failed to validate RuntimeClass: %s", exc) raise + _warn_gvisor_egress_incompatibility(config) + + +def _warn_gvisor_egress_incompatibility(config: "AppConfig") -> None: + """Log a warning when gVisor is configured alongside an egress sidecar 
image.""" + egress_image = config.egress.image if getattr(config, "egress", None) else None + if config.secure_runtime and config.secure_runtime.type == "gvisor" and egress_image: + logger.warning( + "gVisor runtime is configured with egress sidecar image. " + "The egress sidecar's iptables nat-based DNS redirect is incompatible with gVisor. " + "Sandboxes created with network_policy will be rejected at creation time." + ) + __all__ = [ "SecureRuntimeResolver", diff --git a/server/opensandbox_server/services/validators.py b/server/opensandbox_server/services/validators.py index 4b37b5704..a9f5bcfa8 100644 --- a/server/opensandbox_server/services/validators.py +++ b/server/opensandbox_server/services/validators.py @@ -31,7 +31,7 @@ if TYPE_CHECKING: from opensandbox_server.api.schema import NetworkPolicy, OSSFS, PlatformSpec, Volume - from opensandbox_server.config import EgressConfig + from opensandbox_server.config import EgressConfig, SecureRuntimeConfig def ensure_entrypoint(entrypoint: Sequence[str]) -> None: @@ -600,6 +600,50 @@ def ensure_egress_configured( ) +_GVISOR_NAT_INCOMPATIBLE_RUNTIMES = frozenset({"gvisor"}) + + +def ensure_egress_runtime_compatible( + network_policy: Optional["NetworkPolicy"], + secure_runtime: Optional["SecureRuntimeConfig"] = None, + effective_runtime_class: Optional[str] = None, +) -> None: + """ + Reject network_policy when the secure runtime lacks iptables nat table support. + + gVisor's netstack does not implement the iptables nat table, which the egress + sidecar requires for DNS redirect (REDIRECT target on port 53). 
+ + The runtime is taken from secure_runtime.type when set, otherwise from + effective_runtime_class; when neither is set the check is skipped. + + Raises: + HTTPException: 400 INVALID_PARAMETER when network_policy is set and the + resolved runtime is in _GVISOR_NAT_INCOMPATIBLE_RUNTIMES. + """ + if not network_policy: + return + runtime_type = None + if secure_runtime is not None and secure_runtime.type: + runtime_type = secure_runtime.type + elif effective_runtime_class: + runtime_type = effective_runtime_class + if not runtime_type: + return + if runtime_type in _GVISOR_NAT_INCOMPATIBLE_RUNTIMES: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail={ + "code": SandboxErrorCodes.INVALID_PARAMETER, + "message": ( + f"networkPolicy is not compatible with runtime '{runtime_type}': " + "gVisor does not support the iptables nat table required by the egress sidecar. " + "Use a compatible runtime (e.g. kata) or remove networkPolicy." + ), + }, + ) + + def ensure_volumes_valid( volumes: Optional[List["Volume"]], allowed_host_prefixes: Optional[List[str]] = None, diff --git a/server/tests/test_runtime_resolver.py b/server/tests/test_runtime_resolver.py index aa344e4f7..e599b37cf 100644 --- a/server/tests/test_runtime_resolver.py +++ b/server/tests/test_runtime_resolver.py @@ -12,23 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import unittest.mock from types import SimpleNamespace from unittest.mock import MagicMock import pytest from kubernetes.client.exceptions import ApiException -from opensandbox_server.config import AppConfig, RuntimeConfig, SecureRuntimeConfig +from opensandbox_server.config import AppConfig, EgressConfig, RuntimeConfig, SecureRuntimeConfig from opensandbox_server.services.runtime_resolver import ( SecureRuntimeResolver, validate_secure_runtime_on_startup, ) -def _config(runtime_type: str = "docker", secure_runtime=None): +def _config(runtime_type: str = "docker", secure_runtime=None, egress=None): return AppConfig( runtime=RuntimeConfig(type=runtime_type, execd_image="opensandbox/execd:test"), secure_runtime=secure_runtime, + egress=egress, ) @@ -171,3 +173,54 @@ async def test_validate_secure_runtime_skips_unknown_runtime_type() -> None: ) await validate_secure_runtime_on_startup(config) + + +@pytest.mark.asyncio +async def test_validate_startup_warns_gvisor_with_egress() -> None: + k8s_client = MagicMock() + config = _config( + runtime_type="kubernetes", + secure_runtime=SecureRuntimeConfig(type="gvisor", k8s_runtime_class="gvisor"), + egress=EgressConfig(image="opensandbox/egress:latest"), + ) + + with unittest.mock.patch( + "opensandbox_server.services.runtime_resolver.logger" + ) as mock_logger: + await validate_secure_runtime_on_startup(config, k8s_client=k8s_client) + + mock_logger.warning.assert_called_once() + assert "iptables nat" in mock_logger.warning.call_args[0][0] + + +@pytest.mark.asyncio +async def test_validate_startup_no_warn_gvisor_without_egress() -> None: + k8s_client = MagicMock() + config = _config( + runtime_type="kubernetes", + secure_runtime=SecureRuntimeConfig(type="gvisor", k8s_runtime_class="gvisor"), + ) + + with unittest.mock.patch( + "opensandbox_server.services.runtime_resolver.logger" + ) as mock_logger: + await validate_secure_runtime_on_startup(config, k8s_client=k8s_client) + + mock_logger.warning.assert_not_called() + + 
+@pytest.mark.asyncio +async def test_validate_startup_no_warn_kata_with_egress() -> None: + k8s_client = MagicMock() + config = _config( + runtime_type="kubernetes", + secure_runtime=SecureRuntimeConfig(type="kata", k8s_runtime_class="kata-qemu"), + egress=EgressConfig(image="opensandbox/egress:latest"), + ) + + with unittest.mock.patch( + "opensandbox_server.services.runtime_resolver.logger" + ) as mock_logger: + await validate_secure_runtime_on_startup(config, k8s_client=k8s_client) + + mock_logger.warning.assert_not_called() diff --git a/server/tests/test_validators.py b/server/tests/test_validators.py index 5522bd094..0f4715897 100644 --- a/server/tests/test_validators.py +++ b/server/tests/test_validators.py @@ -18,6 +18,7 @@ from opensandbox_server.api.schema import Host, OSSFS, PVC, Volume, PlatformSpec from opensandbox_server.services.constants import SandboxErrorCodes from opensandbox_server.services.validators import ( + ensure_egress_runtime_compatible, ensure_metadata_labels, ensure_platform_valid, ensure_timeout_within_limit, @@ -634,3 +635,58 @@ def test_invalid_pvc_name_rejected_by_pydantic(self): with pytest.raises(ValidationError) as exc_info: PVC(claim_name="Invalid_PVC") # Invalid: uppercase and underscore assert "claim_name" in str(exc_info.value) + + +class TestEgressRuntimeCompatibility: + + def _network_policy(self): + from opensandbox_server.api.schema import NetworkPolicy + return NetworkPolicy(default_action="deny", egress=[]) + + def _secure_runtime(self, type_: str): + from opensandbox_server.config import SecureRuntimeConfig + if type_ == "gvisor": + return SecureRuntimeConfig(type=type_, k8s_runtime_class="gvisor") + if type_ == "kata": + return SecureRuntimeConfig(type=type_, k8s_runtime_class="kata-qemu") + return SecureRuntimeConfig(type=type_) + + def test_rejects_gvisor_with_network_policy(self): + with pytest.raises(HTTPException) as exc_info: + ensure_egress_runtime_compatible(self._network_policy(), 
self._secure_runtime("gvisor")) + assert exc_info.value.status_code == 400 + assert "gVisor" in exc_info.value.detail["message"] + assert exc_info.value.detail["code"] == SandboxErrorCodes.INVALID_PARAMETER + + def test_allows_kata_with_network_policy(self): + ensure_egress_runtime_compatible(self._network_policy(), self._secure_runtime("kata")) + + def test_allows_no_secure_runtime(self): + ensure_egress_runtime_compatible(self._network_policy(), None) + + def test_allows_empty_secure_runtime(self): + ensure_egress_runtime_compatible(self._network_policy(), self._secure_runtime("")) + + def test_allows_gvisor_without_network_policy(self): + ensure_egress_runtime_compatible(None, self._secure_runtime("gvisor")) + + def test_rejects_template_gvisor_with_network_policy(self): + with pytest.raises(HTTPException) as exc_info: + ensure_egress_runtime_compatible( + self._network_policy(), None, effective_runtime_class="gvisor" + ) + assert exc_info.value.status_code == 400 + assert "gVisor" in exc_info.value.detail["message"] + + def test_allows_template_kata_with_network_policy(self): + ensure_egress_runtime_compatible( + self._network_policy(), None, effective_runtime_class="kata-qemu" + ) + + def test_secure_runtime_takes_precedence_over_template(self): + with pytest.raises(HTTPException): + ensure_egress_runtime_compatible( + self._network_policy(), + self._secure_runtime("gvisor"), + effective_runtime_class="kata-qemu", + )