Skip to content

Commit aac57d0

Browse files
committed
feat(bootstrap): use native nftables under Podman, drop legacy iptables modules
When running under Podman, the k3s cluster now uses:

- Native nftables kube-proxy mode (--kube-proxy-arg=proxy-mode=nftables)
- Host DNS resolution instead of iptables DNAT proxy (Podman DNS is routable)
- Skipped iptables backend probe (unnecessary with nftables kube-proxy)

This eliminates the need for legacy iptables kernel modules (ip_tables, iptable_nat, iptable_filter, iptable_mangle) on the host when using Podman. The Docker path is completely unchanged — all new behavior is gated on CONTAINER_RUNTIME=podman.

Container image: add nftables package (provides nft binary for kube-proxy).

RPM spec: modules-load.d now only loads br_netfilter (still required for bridged pod traffic regardless of iptables/nftables). Remove podman-docker recommends (no longer needed with native Podman socket detection and nftables networking).
1 parent 78067b1 commit aac57d0

4 files changed

Lines changed: 81 additions & 53 deletions

File tree

crates/openshell-bootstrap/src/docker.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -878,6 +878,11 @@ pub async fn ensure_container(
878878
env_vars.push("GPU_ENABLED=true".to_string());
879879
}
880880

881+
// Pass the container runtime to the entrypoint so it can select the
882+
// appropriate networking stack (nftables kube-proxy for Podman, iptables
883+
// DNS proxy for Docker, etc.).
884+
env_vars.push(format!("CONTAINER_RUNTIME={}", runtime.binary_name()));
885+
881886
let env = Some(env_vars);
882887

883888
// Set the health check explicitly on the container config so it works

deploy/docker/Dockerfile.images

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ RUN dnf install -y fedora-repos && \
233233
dnf install -y \
234234
ca-certificates \
235235
iptables \
236+
nftables \
236237
util-linux \
237238
bind-utils \
238239
&& dnf clean all

deploy/docker/cluster-entrypoint.sh

Lines changed: 62 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -34,22 +34,13 @@ yaml_quote() {
3434
printf "'%s'" "$(printf '%s' "$1" | sed "s/'/''/g")"
3535
}
3636

37-
# ---------------------------------------------------------------------------
38-
# Select iptables backend
39-
# ---------------------------------------------------------------------------
40-
# Some kernels (e.g. Jetson Linux 5.15-tegra) have the nf_tables subsystem
41-
# but lack the nft_compat bridge that allows flannel and kube-proxy to use
42-
# xt extension modules (xt_comment, xt_conntrack). Detect this by probing
43-
# whether xt_comment is usable via the current iptables backend. If the
44-
# probe fails, switch to iptables-legacy. Set USE_IPTABLES_LEGACY=1
45-
# externally to skip the probe and force the legacy backend.
4637
# ---------------------------------------------------------------------------
4738
# Check br_netfilter kernel module
4839
# ---------------------------------------------------------------------------
4940
# br_netfilter makes the kernel pass bridge (pod-to-pod) traffic through
50-
# iptables. Without it, kube-proxy's DNAT rules for ClusterIP services are
51-
# never applied to pod traffic, so pods cannot reach services such as
52-
# kube-dns (10.43.0.10), breaking all in-cluster DNS resolution.
41+
# netfilter (iptables or nftables). Without it, kube-proxy's DNAT rules for
42+
# ClusterIP services are never applied to pod traffic, so pods cannot reach
43+
# services such as kube-dns (10.43.0.10), breaking all in-cluster DNS.
5344
#
5445
# The module must be loaded on the HOST before the container starts —
5546
# containers cannot load kernel modules themselves. If it is missing, log a
@@ -65,25 +56,37 @@ if [ ! -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then
6556
echo " echo br_netfilter | sudo tee /etc/modules-load.d/br_netfilter.conf" >&2
6657
fi
6758

68-
if [ -z "${USE_IPTABLES_LEGACY:-}" ]; then
69-
if iptables -t filter -N _xt_probe 2>/dev/null; then
70-
_probe_rc=0
71-
iptables -t filter -A _xt_probe -m comment --comment "probe" -j ACCEPT \
72-
2>/dev/null || _probe_rc=$?
73-
iptables -t filter -D _xt_probe -m comment --comment "probe" -j ACCEPT \
74-
2>/dev/null || true
75-
iptables -t filter -X _xt_probe 2>/dev/null || true
76-
[ "$_probe_rc" -ne 0 ] && USE_IPTABLES_LEGACY=1
59+
# ---------------------------------------------------------------------------
60+
# Select iptables backend (Docker only)
61+
# ---------------------------------------------------------------------------
62+
# Under Podman with nftables kube-proxy mode, the iptables backend probe is
63+
# unnecessary — kube-proxy uses nft directly. Flannel still uses the iptables
64+
# binary but through the nft compat shim which doesn't need the xt probe.
65+
#
66+
# Under Docker (or unset runtime), probe whether xt_comment is usable. Some
67+
# kernels (e.g. Jetson Linux 5.15-tegra) have nf_tables but lack the
68+
# nft_compat bridge. If the probe fails, switch to iptables-legacy.
69+
if [ "${CONTAINER_RUNTIME:-}" != "podman" ]; then
70+
if [ -z "${USE_IPTABLES_LEGACY:-}" ]; then
71+
if iptables -t filter -N _xt_probe 2>/dev/null; then
72+
_probe_rc=0
73+
iptables -t filter -A _xt_probe -m comment --comment "probe" -j ACCEPT \
74+
2>/dev/null || _probe_rc=$?
75+
iptables -t filter -D _xt_probe -m comment --comment "probe" -j ACCEPT \
76+
2>/dev/null || true
77+
iptables -t filter -X _xt_probe 2>/dev/null || true
78+
[ "$_probe_rc" -ne 0 ] && USE_IPTABLES_LEGACY=1
79+
fi
7780
fi
78-
fi
7981

80-
if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then
81-
echo "iptables nf_tables xt extension bridge unavailable — switching to iptables-legacy"
82-
if update-alternatives --set iptables /usr/sbin/iptables-legacy 2>/dev/null &&
83-
update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy 2>/dev/null; then
84-
echo "Now using iptables-legacy mode"
85-
else
86-
echo "Warning: could not switch to iptables-legacy — cluster networking may fail"
82+
if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then
83+
echo "iptables nf_tables xt extension bridge unavailable — switching to iptables-legacy"
84+
if update-alternatives --set iptables /usr/sbin/iptables-legacy 2>/dev/null &&
85+
update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy 2>/dev/null; then
86+
echo "Now using iptables-legacy mode"
87+
else
88+
echo "Warning: could not switch to iptables-legacy — cluster networking may fail"
89+
fi
8790
fi
8891
fi
8992

@@ -174,13 +177,20 @@ setup_dns_proxy() {
174177
echo "Configured k3s DNS to use ${CONTAINER_IP} (proxied to Docker DNS)"
175178
}
176179

177-
if ! setup_dns_proxy; then
178-
echo "DNS proxy setup failed, falling back to public DNS servers"
179-
echo "Note: this may not work on Docker Desktop (Mac/Windows)"
180-
cat >"$RESOLV_CONF" <<EOF
180+
if [ "${CONTAINER_RUNTIME:-}" = "podman" ]; then
181+
# Podman DNS is directly routable (aardvark-dns or host DNS) — no proxy
182+
# needed. Copy the container's resolv.conf so k3s has a stable path.
183+
cp /etc/resolv.conf "$RESOLV_CONF"
184+
echo "Podman detected — using host DNS resolution (no proxy needed)"
185+
else
186+
if ! setup_dns_proxy; then
187+
echo "DNS proxy setup failed, falling back to public DNS servers"
188+
echo "Note: this may not work on Docker Desktop (Mac/Windows)"
189+
cat >"$RESOLV_CONF" <<EOF
181190
nameserver 8.8.8.8
182191
nameserver 8.8.4.4
183192
EOF
193+
fi
184194
fi
185195

186196
# ---------------------------------------------------------------------------
@@ -632,7 +642,9 @@ fi
632642
# On kernels where xt_comment is unavailable, kube-router's network policy
633643
# controller panics at startup. Disable it when the iptables-legacy probe
634644
# triggered; sandbox isolation is enforced by the NSSH1 HMAC handshake instead.
635-
if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then
645+
# Under Podman with nftables kube-proxy, the xt probe is skipped entirely so
646+
# USE_IPTABLES_LEGACY is never set — network policy stays enabled.
647+
if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ] && [ "${CONTAINER_RUNTIME:-}" != "podman" ]; then
636648
EXTRA_KUBELET_ARGS="$EXTRA_KUBELET_ARGS --disable-network-policy"
637649
fi
638650

@@ -659,8 +671,23 @@ if [ -n "${OPENSHELL_NODE_NAME:-}" ]; then
659671
echo "Using deterministic k3s node name: ${OPENSHELL_NODE_NAME}"
660672
fi
661673

674+
# ---------------------------------------------------------------------------
675+
# Select kube-proxy mode
676+
# ---------------------------------------------------------------------------
677+
# Under Podman, use native nftables kube-proxy mode so no legacy iptables
678+
# kernel modules (ip_tables, iptable_nat, etc.) are required on the host.
679+
# Docker retains the default iptables mode for maximum compatibility.
680+
EXTRA_KUBE_PROXY_ARGS=""
681+
if [ "${CONTAINER_RUNTIME:-}" = "podman" ]; then
682+
echo "Podman detected — using nftables kube-proxy mode"
683+
EXTRA_KUBE_PROXY_ARGS="--kube-proxy-arg=proxy-mode=nftables"
684+
fi
685+
662686
# Execute k3s with explicit resolv-conf passed as a kubelet arg.
663687
# k3s v1.35.2+ no longer accepts --resolv-conf as a top-level server flag;
664688
# it must be passed via --kubelet-arg instead.
665689
# shellcheck disable=SC2086
666-
exec /bin/k3s "$@" $NODE_NAME_ARG --kubelet-arg=resolv-conf="$RESOLV_CONF" $EXTRA_KUBELET_ARGS
690+
exec /bin/k3s "$@" $NODE_NAME_ARG \
691+
--kubelet-arg=resolv-conf="$RESOLV_CONF" \
692+
$EXTRA_KUBELET_ARGS \
693+
$EXTRA_KUBE_PROXY_ARGS

openshell.spec

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,6 @@ BuildRequires: python3-devel
3939
# Runtime: container runtime for gateway lifecycle (start/stop/destroy).
4040
# Podman is preferred; Docker is also supported via --container-runtime flag.
4141
Recommends: podman
42-
# When Podman is the container runtime, podman-docker provides the
43-
# /var/run/docker.sock symlink and `docker` CLI alias that third-party
44-
# libraries (e.g., bollard) expect.
45-
Recommends: podman-docker
4642

4743
%description
4844
OpenShell provides safe, sandboxed runtimes for autonomous AI agents.
@@ -98,19 +94,18 @@ cargo build --release --bin openshell
9894
# Install CLI binary
9995
install -Dpm 0755 target/release/%{name} %{buildroot}%{_bindir}/%{name}
10096

101-
# Install modules-load.d config for legacy iptables kernel modules.
102-
# k3s (used by the gateway cluster) bundles its own legacy iptables binary
103-
# for flannel CNI. Modern distros (Fedora 41+, RHEL 10+) only load nf_tables
104-
# by default, so these legacy modules must be explicitly loaded.
97+
# Install modules-load.d config for br_netfilter.
98+
# br_netfilter makes the kernel pass bridged (pod-to-pod) traffic through
99+
# netfilter hooks so kube-proxy DNAT rules (iptables or nftables) apply to
100+
# ClusterIP service traffic. Legacy iptables modules are not required —
101+
# kube-proxy uses native nftables under Podman, and the iptables binary on
102+
# modern distros (Fedora 41+, RHEL 10+) is iptables-nft which uses the
103+
# nf_tables kernel path.
105104
install -d %{buildroot}%{_modulesloaddir}
106105
cat > %{buildroot}%{_modulesloaddir}/%{name}.conf << 'EOF'
107-
# Load legacy iptables kernel modules required by k3s flannel CNI.
108-
# Modern kernels use nf_tables by default; these modules provide the
109-
# legacy iptables interface that k3s's bundled iptables-legacy needs.
110-
ip_tables
111-
iptable_nat
112-
iptable_filter
113-
iptable_mangle
106+
# Load br_netfilter for K3s bridge networking.
107+
# Required so kube-proxy DNAT rules (iptables or nftables) apply to
108+
# bridged pod-to-pod traffic for ClusterIP service resolution.
114109
br_netfilter
115110
EOF
116111

@@ -154,9 +149,9 @@ echo "rpm" > %{buildroot}%{python3_sitelib}/%{name}-%{version}.dist-info/INSTALL
154149
touch %{buildroot}%{python3_sitelib}/%{name}-%{version}.dist-info/RECORD
155150

156151
%post
157-
# Load kernel modules immediately so a reboot is not required after
158-
# initial installation. The modules-load.d config handles subsequent boots.
159-
modprobe -a ip_tables iptable_nat iptable_filter iptable_mangle br_netfilter > /dev/null 2>&1 || :
152+
# Load br_netfilter immediately so a reboot is not required after install.
153+
# The modules-load.d config handles subsequent boots.
154+
modprobe br_netfilter > /dev/null 2>&1 || :
160155
%sysctl_apply 99-%{name}.conf
161156

162157
%check

0 commit comments

Comments
 (0)