diff --git a/Dockerfile b/Dockerfile
index a864e8bc..2c21d98f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -34,7 +34,14 @@ RUN arch="$(uname -m | sed -e s/x86_64/amd64/ -e s/aarch64/arm64/)" && \
 RUN apt-get update && apt-get install -y --no-install-recommends \
   gettext-base \
   moreutils \
-  socat
+  socat ipset wget
 ADD Dockerfile.d/etc_udev_rules.d_90-flannel.rules /etc/udev/rules.d/90-flannel.rules
+ADD Dockerfile.d/etc_udev_rules.d_95-calico.rules /etc/udev/rules.d/95-calico.rules
 ADD Dockerfile.d/u7s-entrypoint.sh /
+# Calico
+ENV FELIX_IGNORELOOSERPF=true
+# Fetch the calicoctl binary that matches the build architecture (amd64 or arm64)
+RUN arch="$(uname -m | sed -e s/x86_64/amd64/ -e s/aarch64/arm64/)" && \
+  wget "https://github.com/projectcalico/calico/releases/download/v3.30.5/calicoctl-linux-${arch}" -O /usr/local/bin/calicoctl && \
+  chmod +x /usr/local/bin/calicoctl
 ENTRYPOINT ["/u7s-entrypoint.sh", "/usr/local/bin/entrypoint", "/sbin/init"]
diff --git a/Dockerfile.d/etc_udev_rules.d_95-calico.rules b/Dockerfile.d/etc_udev_rules.d_95-calico.rules
new file mode 100644
index 00000000..94beb184
--- /dev/null
+++ b/Dockerfile.d/etc_udev_rules.d_95-calico.rules
@@ -0,0 +1 @@
+SUBSYSTEM=="net", ACTION=="add|change|move", ENV{INTERFACE}=="vxlan.calico", RUN+="/usr/sbin/ethtool -K vxlan.calico tx-checksum-ip-generic off"
diff --git a/Makefile b/Makefile
index de258ddb..acfe0940 100644
--- a/Makefile
+++ b/Makefile
@@ -6,6 +6,7 @@ export PORT_ETCD ?= 2379
 export PORT_KUBELET ?= 10250
 export PORT_FLANNEL ?= 8472
 export PORT_KUBE_APISERVER ?= 6443
+export PORT_CALICO ?= 5473
 
 # HOSTNAME is the name of the physical host
 export HOSTNAME ?= $(shell hostname)
@@ -35,6 +36,7 @@ NODE_SHELL := $(COMPOSE) exec \
 	-e NODE_IP=$(NODE_IP) \
 	-e PORT_KUBE_APISERVER=$(PORT_KUBE_APISERVER) \
 	-e PORT_FLANNEL=$(PORT_FLANNEL) \
+	-e PORT_CALICO=$(PORT_CALICO) \
 	-e PORT_KUBELET=$(PORT_KUBELET) \
 	-e PORT_ETCD=$(PORT_ETCD) \
 	$(NODE_SERVICE_NAME)
@@ -160,3 +162,8 @@ install-flannel:
 	# We don't actually need it there, just on the physical node, so we use newer K8s and older flannel
 	$(NODE_SHELL) kubectl apply -f https://github.com/flannel-io/flannel/releases/download/v0.25.1/kube-flannel.yml
 	#$(NODE_SHELL) /usernetes/Makefile.d/install-flannel.sh
+
+.PHONY: install-calico
+install-calico:
+	# Calico daemonset changes and node-level address changes
+	$(NODE_SHELL) /usernetes/Makefile.d/calico/install-calico.sh
diff --git a/Makefile.d/calico/calico-ethtool.yaml b/Makefile.d/calico/calico-ethtool.yaml
new file mode 100644
index 00000000..8e9b98f7
--- /dev/null
+++ b/Makefile.d/calico/calico-ethtool.yaml
@@ -0,0 +1,48 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: calico-checksum-fix
+  namespace: kube-system
+  labels:
+    k8s-app: calico-checksum-fix
+spec:
+  selector:
+    matchLabels:
+      name: calico-checksum-fix
+  template:
+    metadata:
+      labels:
+        name: calico-checksum-fix
+    spec:
+      hostNetwork: true
+      hostPID: true
+      securityContext:
+        runAsUser: 0
+      initContainers:
+      - name: fix-checksum
+        image: ghcr.io/converged-computing/usernetes:alpine
+        # image: alpine:latest
+        command: ["/bin/sh", "-c"]
+        args:
+        - |
+          # nsenter -t 1 enters the init process's namespace (of the host)
+          # check if the interface exists before running ethtool
+          if [ -d /sys/class/net/vxlan.calico ]; then
+            echo "Applying ethtool fix to vxlan.calico..."
+            nsenter -t 1 -n -u -i -m -- ethtool -K vxlan.calico tx-checksum-ip-generic off
+          else
+            echo "vxlan.calico interface not found, skipping."
+          fi
+          # Insert the VXLAN accept rule only when it is missing, so reruns do not stack duplicates
+          iptables -C INPUT -p udp --dport 8472 -j ACCEPT 2>/dev/null || iptables -I INPUT -p udp --dport 8472 -j ACCEPT
+          sysctl -w net.ipv4.conf.all.rp_filter=1
+          sysctl -w net.ipv4.conf.default.rp_filter=1
+          sysctl -w net.ipv4.conf.eth0.rp_filter=1
+          sysctl -w net.ipv4.conf.vxlan/calico.rp_filter=1
+        securityContext:
+          privileged: true
+      containers:
+      - name: pause
+        # image: registry.k8s.io/pause:3.9
+        image: ghcr.io/converged-computing/usernetes:pause
+      terminationGracePeriodSeconds: 0
diff --git a/Makefile.d/calico/install-calico.sh b/Makefile.d/calico/install-calico.sh
new file mode 100755
index 00000000..fa299022
--- /dev/null
+++ b/Makefile.d/calico/install-calico.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+set -eu -o pipefail
+
+# Install Calico with the VXLAN backend, adjusted for rootless Usernetes nodes.
+# Invoked inside the node container by `make install-calico`.
+
+# Fail early with a clear message if a required tool is missing
+for tool in wget yq kubectl calicoctl; do
+  if ! command -v "${tool}" > /dev/null; then
+    echo "ERROR: ${tool} is required but was not found in PATH" >&2
+    exit 1
+  fi
+done
+
+# Install standard Calico
+CALICO_VERSION="v3.31"
+CALICO_FILE="calico.yaml"
+wget "https://raw.githubusercontent.com/projectcalico/calico/refs/heads/release-${CALICO_VERSION}/manifests/calico.yaml" -O "${CALICO_FILE}"
+
+# backend to vxlan
+yq eval-all -i '(select(.kind == "ConfigMap" and .metadata.name == "calico-config").data.calico_backend) = "vxlan"' "${CALICO_FILE}"
+
+# IPIP and VXLAN
+yq eval-all -i '(select(.kind == "DaemonSet" and .metadata.name == "calico-node").spec.template.spec.containers[0].env[] | select(.name == "CALICO_IPV4POOL_IPIP").value) = "Never"' "${CALICO_FILE}"
+yq eval-all -i '(select(.kind == "DaemonSet" and .metadata.name == "calico-node").spec.template.spec.containers[0].env[] | select(.name == "CALICO_IPV4POOL_VXLAN").value) = "CrossSubnet"' "${CALICO_FILE}"
+yq eval-all -i '(select(.kind == "DaemonSet" and .metadata.name == "calico-node").spec.template.spec.containers[0].env[] | select(.name == "CALICO_IPV6POOL_VXLAN").value) = "CrossSubnet"' "${CALICO_FILE}"
+
+# FELIX for rootless
+yq eval-all -i 'select(.kind == "DaemonSet" and .metadata.name == "calico-node").spec.template.spec.containers[0].env += {"name": "FELIX_IGNORELOOSERPF", "value": "true"}' "${CALICO_FILE}"
+yq eval-all -i 'select(.kind == "DaemonSet" and .metadata.name == "calico-node").spec.template.spec.containers[0].env += {"name": "FELIX_VXLANPORT", "value": "8472"}' "${CALICO_FILE}"
+yq eval-all -i 'select(.kind == "DaemonSet" and .metadata.name == "calico-node").spec.template.spec.containers[0].env += {"name": "FELIX_EXTERNALNODESCIDRLIST", "value": "10.100.0.0/16"}' "${CALICO_FILE}"
+
+# health probes (Remove bird-ready and bird-live)
+yq eval-all -i '(select(.kind == "DaemonSet" and .metadata.name == "calico-node").spec.template.spec.containers[0].livenessProbe.exec.command) = ["/bin/calico-node", "-felix-live"]' "${CALICO_FILE}"
+yq eval-all -i '(select(.kind == "DaemonSet" and .metadata.name == "calico-node").spec.template.spec.containers[0].readinessProbe.exec.command) = ["/bin/calico-node", "-felix-ready"]' "${CALICO_FILE}"
+
+# install components with our rootless version
+kubectl apply -f "${CALICO_FILE}"
+echo "Done. Final file is $CALICO_FILE"
+
+# Give a small break to settle - we need vxlan.calico to be created
+sleep 10
+
+# This must be removed or the address will be reset
+kubectl set env daemonset/calico-node IP- -n kube-system
+
+# Allow pods to recreate
+echo "Recreating calico pods..."
+sleep 10
+
+# https://youtu.be/noriIzBKYRk?si=mlOC27ntvSEDw_VM&t=299
+# These commands need to be done bringing up node
+# iptables -I INPUT -p udp --dport 8472 -j ACCEPT
+# sysctl -w net.ipv4.conf.all.rp_filter=2
+# sysctl -w net.ipv4.conf.default.rp_filter=2
+# sysctl -w net.ipv4.conf.eth0.rp_filter=2
+# sysctl -w "net.ipv4.conf.vxlan/calico.rp_filter=2"
+
+# This needs to be done after daemonset is patched
+# Note that the calico-node has a warning after this, but it won't work if we don't do it
+for node in $(kubectl get nodes -o name); do
+  host_ip="$(kubectl get "${node}" -o jsonpath='{.metadata.labels.usernetes/host-ip}')"
+  nodename="${node#*/}"
+  calicoctl --allow-version-mismatch patch node "${nodename}" --patch='{"spec": {"bgp":{"ipv4Address": "'"${host_ip}"'"}}}'
+done
+
+# applies ethtool -K vxlan.calico tx-checksum-ip-generic off
+# check with: bridge fdb show dev vxlan.calico should have node address NOT 10.x address
+kubectl apply --server-side -f /usernetes/Makefile.d/calico/calico-ethtool.yaml
+
+# These should be run after calico installed
+# 1. make sync-external-ip and make install-calico
+# the second has a daemonset to apply these commands
+# ethtool -K vxlan.calico tx-checksum-ip-generic off
diff --git a/Makefile.d/check-preflight.sh b/Makefile.d/check-preflight.sh
index 623e9f52..12ba05b5 100755
--- a/Makefile.d/check-preflight.sh
+++ b/Makefile.d/check-preflight.sh
@@ -16,7 +16,9 @@ script_dir="$(dirname "$0")"
 detect_engine="${script_dir}"/detect-container-engine.sh
 : "${CONTAINER_ENGINE:=$("${detect_engine}" CONTAINER_ENGINE)}"
 : "${CONTAINER_ENGINE_TYPE:=$("${detect_engine}" CONTAINER_ENGINE_TYPE)}"
-: "${QUICK:=0}"
+
+# Set to 1 since we will do calico by default
+: "${QUICK:=1}"
 : "${BUSYBOX_IMAGE:=docker.io/library/busybox:latest}"
 
 if [ -z "${CONTAINER_ENGINE}" ] || [ -z "${CONTAINER_ENGINE_TYPE}" ]; then
diff --git a/Makefile.d/sync-external-ip.sh b/Makefile.d/sync-external-ip.sh
index 2b4e8bec..056827df 100755
--- a/Makefile.d/sync-external-ip.sh
+++ b/Makefile.d/sync-external-ip.sh
@@ -16,4 +16,9 @@ for node in $(kubectl get nodes -o name); do
 	if echo "${taints}" | grep -q node.cloudprovider.kubernetes.io/uninitialized; then
 		kubectl taint nodes "${node}" node.cloudprovider.kubernetes.io/uninitialized-
 	fi
+	# Patch the Calico node address only when Calico is installed, so this script
+	# still works on Flannel clusters and when run before `make install-calico`
+	if kubectl get daemonset calico-node -n kube-system > /dev/null 2>&1; then
+		calicoctl --allow-version-mismatch patch node "${node#*/}" --patch='{"spec": {"bgp":{"ipv4Address": "'"${host_ip}"'"}}}'
+	fi
 done
diff --git a/README.md b/README.md
index db1eccce..30f8c515 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ but Usernetes (Gen 2) supports creating a cluster with multiple hosts.
 - CRI: containerd
 - OCI: runc
 - CNI: Flannel
+- CNI: Calico
 
 ## Requirements
 
@@ -72,7 +73,8 @@ EOF
 sudo systemctl restart systemd-modules-load.service
 ```
 
-- sysctl:
+- sysctl (should not be required for Calico, but this needs testing):
+
 ```
 sudo tee /etc/sysctl.d/99-usernetes.conf <<EOF >/dev/null
 net.ipv4.conf.default.rp_filter = 2
@@ -110,6 +112,8 @@ See `make help`.
 make up
 make kubeadm-init
 make install-flannel
+# or
+make install-calico
 
 # Enable kubectl
 make kubeconfig
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 1df41123..52773461 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -12,6 +12,8 @@ services:
         ipv4_address: ${NODE_IP}
     ports:
       # <host>:<container>
+      # Calico (default: 5473)
+      - ${PORT_CALICO}:${PORT_CALICO}
       # etcd (default: 2379)
       - ${PORT_ETCD}:${PORT_ETCD}
       # kube-apiserver (default: 6443)
diff --git a/service/README.md b/service/README.md
index 9bd636e2..402804c2 100644
--- a/service/README.md
+++ b/service/README.md
@@ -15,6 +15,8 @@ flux alloc --bg -N2 -q pbatch -t 8h
 
 ### Control Plane
 
+TODO: `export QUICK=1`
+
 ```bash
 ssh corona189
 # For the control plane - start
@@ -40,25 +42,8 @@ Back on the control plane (if everything looks good) we can go to the copied con
 
 ```bash
 . source_env.sh
-```
-```console
-[sochat1@corona190:service]$ kubectl get nodes
-NAME            STATUS     ROLES           AGE     VERSION
-u7s-corona190   NotReady   control-plane   3m20s   v1.30.0
-u7s-corona196   NotReady   <none>          1m3s    v1.30.0
-```
-
-Importantly, the ips need to be sync'd (and an annotation added for flannel) after nodes are up. They will all be `NotReady`.
-
-```bash
 make sync-external-ip
-make install-flannel
-```
-```console
-[sochat1@corona190:service]$ kubectl get nodes
-NAME            STATUS   ROLES           AGE    VERSION
-u7s-corona190   Ready    control-plane   5m     v1.30.0
-u7s-corona196   Ready    <none>          3m7s   v1.30.0
+make install-calico
 ```
 
 Install the Flux Operator...
@@ -125,4 +110,17 @@ https://raw.githubusercontent.com/ROCm/k8s-device-plugin/763445e18f3838fa72b22e3
 kubectl logs alexnet-tf-gpu-pod alexnet-tf-gpu-container
 ```
 
-Our final experiments will be done separately, and these notes likely cleaned up.
+
+### Debugging
+
+Calico: in Usernetes (u7s) the `dst` address shown below should be the same as the host address:
+
+```bash
+bridge fdb show dev vxlan.calico
+```
+```console
+# "this address"
+66:63:44:f3:b6:76 dst 192.168.128.222 self permanent
+```
+
+If you see the container interface address (10.0.x) instead, this is a bug. It could be that the calico-node daemonset still has the `IP` environment variable set to autodetect (which will clobber any changes you make), that you did not run all of the commands in the sync-external-ip script, or that the daemonset that runs ethtool was not applied.
diff --git a/service/usernetes-start-control-plane.sh b/service/usernetes-start-control-plane.sh
index 2be076d7..d2051131 100755
--- a/service/usernetes-start-control-plane.sh
+++ b/service/usernetes-start-control-plane.sh
@@ -5,7 +5,7 @@ set -euo pipefail
 # These are variables we likely will change
 # LC only supplies podman
 USERNETES_CONTAINER_TECH=${1:-"podman"}
-USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-06-26-2025
+USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-calico
 
 # We will copy join command here
 shared_join_command_dir="/usr/workspace/usernetes"
@@ -13,6 +13,16 @@ shared_join_command_dir="/usr/workspace/usernetes"
 # The user needs to run the setup script
 USERNAME=$(whoami)
 
+# Logging functions for consistency (like Akihiro!)
+log() {
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1"
+}
+
+error_exit() {
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2
+    exit 1
+}
+
 # This is way a lot for just deriving home, but I'm not convinced it will always
 # be defined in the environment
 if [[ -z "${HOME:-}" || ! -d "${HOME}" ]]; then
@@ -37,16 +47,6 @@ which podman-compose
 # We don't want to use /var because that is a memory based fs
 export TMPDIR="/tmp/${USERNAME}"
 
-# Logging functions for consistency (like Akihiro!)
-log() {
-    echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1"
-}
-
-error_exit() {
-    echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2
-    exit 1
-}
-
 install_kubectl() {
     if ! command -v kubectl > /dev/null; then
         log "Installing kubectl..."
@@ -60,7 +60,20 @@ install_kubectl() {
     command -v kubectl > /dev/null || error_exit "kubectl not found after installation attempt."
 }
 
-
+install_yq() {
+    if ! command -v yq > /dev/null; then
+        log "Installing yq..."
+        YQ_VERSION=v4.2.0
+        YQ_PLATFORM="linux_$(uname -m | sed -e s/x86_64/amd64/ -e s/aarch64/arm64/)"
+        wget "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_${YQ_PLATFORM}.tar.gz" -O - | tar xz
+        chmod +x "./yq_${YQ_PLATFORM}"
+        mv "./yq_${YQ_PLATFORM}" "${LOCAL_BIN_DIR}/yq"
+        log " yq installed to ${LOCAL_BIN_DIR}/yq"
+    else
+        log " yq found at $(command -v yq)"
+    fi
+    command -v yq > /dev/null || error_exit "yq not found after installation attempt."
+}
 
 # Pre-flight Checks & Setup
 log "🎬 Starting Usernetes Control Plane Setup"
diff --git a/service/usernetes-start-worker.sh b/service/usernetes-start-worker.sh
index 709585d4..bb5d7d52 100755
--- a/service/usernetes-start-worker.sh
+++ b/service/usernetes-start-worker.sh
@@ -2,10 +2,20 @@
 
 set -euo pipefail
 
+# Logging functions for consistency (like Akihiro!)
+log() {
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1"
+}
+
+error_exit() {
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2
+    exit 1
+}
+
 # These are variables we likely will change
 # LC only supplies podman
 USERNETES_CONTAINER_TECH=${1:-"podman"}
-USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-06-26-2025
+USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-calico
 
 # The join command needs to be here
 shared_join_command_dir="/usr/workspace/usernetes"
@@ -38,16 +48,6 @@ log " Updated PATH: ${PATH}"
 # We don't want to use /var because that is a memory based fs
 export TMPDIR="/tmp/${USERNAME}"
 
-# Logging functions for consistency (like Akihiro!)
-log() {
-    echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1"
-}
-
-error_exit() {
-    echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2
-    exit 1
-}
-
 install_kubectl() {
     if ! command -v kubectl > /dev/null; then
         log "Installing kubectl..."
@@ -61,8 +61,6 @@ install_kubectl() {
     command -v kubectl > /dev/null || error_exit "kubectl not found after installation attempt."
 }
 
-
-
 # Pre-flight Checks & Setup
 log "🎬 Starting Usernetes Control Plane Setup"
 log " Temporary directory: ${TMPDIR}"