diff --git a/Dockerfile.dra b/Dockerfile.dra new file mode 100644 index 00000000..2c5541d5 --- /dev/null +++ b/Dockerfile.dra @@ -0,0 +1,17 @@ +FROM ghcr.io/google/dranet:v0.4.0 AS builder + +# podman build --userns-uid-map=0:0:1 --userns-uid-map=1:1:1999 --userns-uid-map=65534:2000:2 -f $(pwd)/Dockerfile.dra -t usernetes_dra $(pwd) +# docker build -t ghcr.io/converged-computing/usernetes:dra . +# make up && make kubeadm-init && make dra && make kubeconfig && kubectl apply -f Makefile.d/dra/rbac.yaml && podman restart dranet-driver + +# We can likely use ubuntu:24.04, I am reproducing the node environment as much as possible. +FROM docker.io/kindest/node:v1.33.0@sha256:91e9ed777db80279c22d1d1068c091b899b2078506e4a0f797fbf6e397c0b0b2 +RUN apt-get update && apt-get install -y ca-certificates +COPY cspca.llnl.gov.cer.pem /usr/local/share/ca-certificates/ +COPY cspca.cer.pem /usr/local/share/ca-certificates/ +RUN update-ca-certificates +RUN apt-get update && \ + apt-get install -y --no-install-recommends kmod libibverbs-dev librdmacm-dev rdma-core libnl-3-dev ibverbs-utils libnl-route-3-dev + +COPY --from=builder /dranet /dranet +ENTRYPOINT ["/dranet"] diff --git a/Makefile b/Makefile index de258ddb..36433a59 100644 --- a/Makefile +++ b/Makefile @@ -84,6 +84,7 @@ up: check-preflight # Podman creates cni files in a shared location, this ensures unique names that do not clobbed one another sed -i "s/default_network/$(HOSTNAME)/g" docker-compose.yaml $(COMPOSE) up --build -d + $(NODE_SHELL) /bin/bash /usernetes/Makefile.d/dra/update-containerd.sh .PHONY: down down: @@ -111,6 +112,15 @@ ifeq ($(shell command -v kubectl 2> /dev/null),) @echo "make kubectl" endif +.PHONY: dra +dra: + $(NODE_SHELL) kubectl apply -f /usernetes/Makefile.d/dra/rbac.yaml + # These are needed if you want ibv_devinfo to work + #$(CONTAINER_ENGINE) exec dranet-driver mkdir -p /etc/libibverbs.d + #$(CONTAINER_ENGINE) exec dranet-driver echo "driver mlx4" >> /etc/libibverbs.d/mlx4.driver + 
#$(CONTAINER_ENGINE) exec dranet-driver echo "driver mlx5" >> /etc/libibverbs.d/mlx5.driver + + .PHONY: kubectl kubectl: $(COMPOSE) exec -T --workdir=/usr/bin $(NODE_SERVICE_NAME) tar c kubectl | tar xv diff --git a/Makefile.d/dra/rbac.yaml b/Makefile.d/dra/rbac.yaml new file mode 100644 index 00000000..2e0dc9a6 --- /dev/null +++ b/Makefile.d/dra/rbac.yaml @@ -0,0 +1,21 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: dranet-admin-role +rules: +- apiGroups: ["resource.k8s.io"] + resources: ["resourceslices", "resourceclaims", "resourceclaimtemplates"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: dranet-admin-binding +subjects: +- kind: User + name: kubernetes-admin + apiGroup: rbac.authorization.k8s.io +roleRef: + kind: ClusterRole + name: dranet-admin-role + apiGroup: rbac.authorization.k8s.io diff --git a/Makefile.d/dra/update-containerd.sh b/Makefile.d/dra/update-containerd.sh new file mode 100755 index 00000000..8fcbb180 --- /dev/null +++ b/Makefile.d/dra/update-containerd.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -o errexit +set -o pipefail +set -o nounset +set -x +if grep -q "io.containerd.nri.v1.nri" /etc/containerd/config.toml + then + echo "containerd config contains NRI reference already; taking no action" + else + echo "containerd config does not mention NRI, thus enabling it"; + printf '%s\n' "[plugins.\"io.containerd.nri.v1.nri\"]" " disable = false" " disable_connections = false" " plugin_config_path = \"/etc/nri/conf.d\"" " plugin_path = \"/opt/nri/plugins\"" " plugin_registration_timeout = \"5s\"" " plugin_request_timeout = \"5s\"" " socket_path = \"/var/run/nri/nri.sock\"" >> /etc/containerd/config.toml + echo "restarting containerd" + systemctl restart containerd +fi diff --git a/docker-compose.yaml b/docker-compose.yaml index 1df41123..347004ee 100644 --- a/docker-compose.yaml +++ 
b/docker-compose.yaml @@ -10,6 +10,8 @@ services: networks: default_network: ipv4_address: ${NODE_IP} + expose: + - "${PORT_KUBE_APISERVER}" ports: # : # etcd (default: 2379) @@ -27,8 +29,7 @@ services: - node-var:/var - node-opt:/opt - node-etc:/etc - - type: tmpfs - target: /run + - node-run:/run - type: tmpfs target: /tmp working_dir: /usernetes @@ -45,6 +46,29 @@ services: "nerdctl/bypass4netns": "${BYPASS4NETNS:-false}" "nerdctl/bypass4netns-ignore-bind": "true" "nerdctl/bypass4netns-ignore-subnets": "${BYPASS4NETNS_IGNORE_SUBNETS:-}" + + dranet: + image: usernetes_dra + pull_policy: never + container_name: dranet-driver + privileged: true + restart: always + # DraNet needs to see the host network to manage its devices + network_mode: "host" + volumes: + - /boot:/boot:ro + - /lib/modules:/lib/modules:ro + - node-var:/var + - node-run:/run:ro + - node-etc:/etc + extra_hosts: + # We need to be able to see the api server + - "${NODE_NAME}:${HOST_IP}" + command: + - --kubeconfig=/etc/kubernetes/admin.conf + - --hostname-override=${NODE_NAME} + - --filter=true + + networks: default_network: ipam: @@ -53,6 +77,7 @@ networks: # The node IP here is not accessible from other nodes. 
- subnet: ${NODE_SUBNET} volumes: + node-run: {} node-var: {} node-opt: {} node-etc: {} diff --git a/kubeadm-config.yaml b/kubeadm-config.yaml index 2ec040f3..67ef149d 100644 --- a/kubeadm-config.yaml +++ b/kubeadm-config.yaml @@ -15,6 +15,8 @@ apiServer: extraArgs: - name: etcd-servers value: https://127.0.0.1:${PORT_ETCD} + - name: runtime-config + value: "api/beta=true" - name: advertise-address value: ${HOST_IP} - name: secure-port @@ -42,6 +44,8 @@ apiVersion: kubelet.config.k8s.io/v1beta1 failSwapOn: false port: ${PORT_KUBELET} featureGates: + DRAResourceClaimDeviceStatus: true + DynamicResourceAllocation: true KubeletInUserNamespace: true --- apiVersion: kubeproxy.config.k8s.io/v1alpha1 @@ -54,3 +58,4 @@ conntrack: tcpEstablishedTimeout: 0s # Skip setting "net.netfilter.nf_conntrack_tcp_timeout_close" tcpCloseWaitTimeout: 0s + diff --git a/service/README.md b/service/README.md index eab1bfaa..2f83375d 100644 --- a/service/README.md +++ b/service/README.md @@ -13,8 +13,19 @@ Request a flux alloc for the control plane and a worker, for however many minute flux alloc --bg -N2 -q pbatch -t 8h ``` +You can use the WIP testing script to bring up control plane and worker nodes: + +```bash +bash start-usernetes.sh +``` + +The script will shell you into the control plane node after starting. + + ### Control Plane +If you can't start services via ssh: + ```bash # For the control plane - start ssh @@ -25,10 +36,31 @@ systemctl --user status usernetes-control-plane # check log in /usr/workspace/usernetes/control-plane.log ``` -Importantly, in the above you need a podman-compose that has the line to add a label for `PODMAN_SYSTEMD_UNIT` commented out. If when you are in the usernetes kubelet container (`make shell`) or a container and `ulimit -l` is not unlimited, Infiniband is unlikely to work. 
+If you can: + +```bash +ssh corona190 systemctl --user start usernetes-control-plane +ssh corona190 systemctl --user status usernetes-control-plane +``` + +When /tmp/$USER/usernetes/source_env.sh exists, you can start the worker node + +``` +ssh corona190 ls /tmp/sochat1/usernetes +``` +You can also look at the log file. Look for this line at the very end: + +``` +cat /usr/workspace/usernetes/control-plane.log +2025-07-13 22:29:29 - INFO - ๐Ÿš€ Service will now idle indefinitely. Process ID: 819551 +``` + +Then you can start the worker(s). And importantly, in the above you need a podman-compose that has the line to add a label for `PODMAN_SYSTEMD_UNIT` commented out. If when you are in the usernetes kubelet container (`make shell`) or a container and `ulimit -l` is not unlimited, Infiniband is unlikely to work. ### Worker +Without services enabled via ssh: + +```bash ssh corona190 rm -rf /usr/workspace/usernetes/worker.log @@ -37,6 +69,13 @@ systemctl --user status usernetes-worker # check log in /usr/workspace/usernetes/worker.log ``` +Or with: + +```bash +ssh corona192 systemctl --user start usernetes-worker +ssh corona192 systemctl --user status usernetes-worker +``` + Back on the control plane (if everything looks good) we can go to the copied control plane directory, source a file to get kubectl and the correct paths, and see our cluster. 
```bash diff --git a/service/start-usernetes.sh b/service/start-usernetes.sh new file mode 100644 index 00000000..0354d69e --- /dev/null +++ b/service/start-usernetes.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +jobid=${1} + +# If we aren't provided with an id assume the last submit +if [[ "${jobid}" == "" ]]; + then + jobid=$(flux job last) +fi + +echo "Jobid to start usernetes is ${jobid}" + +# This is the nodelist - can be a single node or multiple +nodelist=$(flux jobs ${jobid} --json | jq -r .nodelist) +nodelist=($(flux hostlist --expand $nodelist)) +control_plane_node=${nodelist[0]} +worker_nodes=${nodelist[@]:1} + +# This currently assumes one usernetes job running. +# We will want a way to have a custom logging file. +# We can likely remove this and have logs with service on node. +rm -rf /usr/workspace/usernetes/control-plane.log + +# Start the control plane +ssh $control_plane_node systemctl --user start usernetes-control-plane +echo "Log for control plane will be in /usr/workspace/usernetes/control-plane.log" + +# The control plane is ready when this file exists +while true + do + ssh $control_plane_node "test -f /tmp/$USER/usernetes/source_env.sh" + if [[ "$?" == "0" ]]; then + echo "Usernetes control plane is ready." + ssh $control_plane_node systemctl --user status usernetes-control-plane + break + else + sleep 3 + fi +done + +# Start worker nodes +for worker_node in ${worker_nodes[@]} + do + ssh $worker_node systemctl --user start usernetes-worker +done + +# Again wait for all workers to be ready + +for worker_node in ${worker_nodes[@]} + do + ssh $worker_node "test -f /tmp/$USER/usernetes/source_env.sh" + + # If any single worker isn't ready, keep going + if [[ "$?" != "0" ]]; then + sleep 3 + continue + fi + + # If we get here, all nodes are ready. + break +done + +# Show the nodes. ssh does not honor cd to different directory +ssh $control_plane_node /bin/bash -c "cd /tmp/$USER/usernetes/ && . 
/tmp/$USER/usernetes/source_env.sh && kubectl get nodes" + +# Install flannel and sync ips +ssh $control_plane_node /bin/bash -c "cd /tmp/$USER/usernetes/ && . /tmp/$USER/usernetes/source_env.sh && make -C /tmp/$USER/usernetes install-flannel" +ssh $control_plane_node /bin/bash -c "cd /tmp/$USER/usernetes/ && . /tmp/$USER/usernetes/source_env.sh && make -C /tmp/$USER/usernetes sync-external-ip" + +# Shell in. +echo "Shelling into Usernetes control plane. Change directory to /tmp/$USER/usernetes and source_env.sh to use kubectl" +ssh $control_plane_node diff --git a/service/usernetes-start-control-plane.sh b/service/usernetes-start-control-plane.sh index 2be076d7..4c13b643 100755 --- a/service/usernetes-start-control-plane.sh +++ b/service/usernetes-start-control-plane.sh @@ -5,7 +5,7 @@ set -euo pipefail # These are variables we likely will change # LC only supplies podman USERNETES_CONTAINER_TECH=${1:-"podman"} -USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-06-26-2025 +USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-dra # We will copy join command here shared_join_command_dir="/usr/workspace/usernetes" @@ -13,6 +13,16 @@ shared_join_command_dir="/usr/workspace/usernetes" # The user needs to run the setup script USERNAME=$(whoami) +# Logging functions for consistency (like Akihiro!) +log() { + echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1" +} + +error_exit() { + echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2 + exit 1 +} + # This is way a lot for just deriving home, but I'm not convinced it will always # be defined in the environment if [[ -z "${HOME:-}" || ! -d "${HOME}" ]]; then @@ -37,16 +47,6 @@ which podman-compose # We don't want to use /var because that is a memory based fs export TMPDIR="/tmp/${USERNAME}" -# Logging functions for consistency (like Akihiro!) -log() { - echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1" -} - -error_exit() { - echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2 - exit 1 -} - install_kubectl() { if ! 
command -v kubectl > /dev/null; then log "Installing kubectl..." @@ -126,11 +126,12 @@ unshare_cleanup # Usernetes Specific Setup log "๐Ÿ“‚ Copying Usernetes template from ${USERNETES_TEMPLATE_PATH}" cp -R "${USERNETES_TEMPLATE_PATH}" "${TMPDIR}/usernetes" -cd "${TMPDIR}/usernetes" # Now inside the copied template -sleep 3 # Allow filesystem operations to settle if needed +cd "${TMPDIR}/usernetes" +sleep 3 log "๐Ÿ‘ท Building Usernetes container image 'usernetes_node'" ${container_runtime_path} build --userns-uid-map=0:0:1 --userns-uid-map=1:1:1999 --userns-uid-map=65534:2000:2 -f $(pwd)/Dockerfile -t usernetes_node $(pwd) +${container_runtime_path} build --userns-uid-map=0:0:1 --userns-uid-map=1:1:1999 --userns-uid-map=65534:2000:2 -f $(pwd)/Dockerfile.dra -t usernetes_dra $(pwd) cleanup() { log "๐Ÿงน Cleaning up old networks or volumes (best effort)" @@ -150,12 +151,19 @@ if ! make up; then fi sleep 3 + log "๐Ÿ” Running kubeadm-init with 'make kubeadm-init'" if ! make kubeadm-init; then error_exit "Failed 'make kubeadm-init'." fi sleep 3 +log "๐Ÿ‘พ Setting up dynamic resource allocation" +if ! make dra; then + error_exit "Failed 'make dra'." +fi +sleep 3 + log "๐Ÿฅท Creating kubeconfig with 'make kubeconfig'" if ! make kubeconfig; then error_exit "Failed 'make kubeconfig'." 
@@ -171,6 +179,10 @@ chmod 600 "${KUBECONFIG}" # source <(kubectl completion bash) sleep 3 +log "๐Ÿฅท Install rbac for dranet-driver" +kubectl apply -f Makefile.d/dra/rbac.yaml +${container_runtime_path} restart dranet-driver + # Get control plane node name robustly log "๐Ÿ‘ Untainting control plane and labeling node" control_plane_node="" diff --git a/service/usernetes-start-worker.sh b/service/usernetes-start-worker.sh index 709585d4..40490463 100755 --- a/service/usernetes-start-worker.sh +++ b/service/usernetes-start-worker.sh @@ -5,7 +5,7 @@ set -euo pipefail # These are variables we likely will change # LC only supplies podman USERNETES_CONTAINER_TECH=${1:-"podman"} -USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-06-26-2025 +USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-dra # The join command needs to be here shared_join_command_dir="/usr/workspace/usernetes" @@ -17,6 +17,16 @@ fi # The user needs to run the setup script USERNAME=$(whoami) +# Logging functions for consistency (like Akihiro!) +log() { + echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1" +} + +error_exit() { + echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2 + exit 1 +} + # This is way a lot for just deriving home, but I'm not convinced it will always # be defined in the environment if [[ -z "${HOME:-}" || ! -d "${HOME}" ]]; then @@ -38,16 +48,6 @@ log " Updated PATH: ${PATH}" # We don't want to use /var because that is a memory based fs export TMPDIR="/tmp/${USERNAME}" -# Logging functions for consistency (like Akihiro!) -log() { - echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1" -} - -error_exit() { - echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2 - exit 1 -} - install_kubectl() { if ! command -v kubectl > /dev/null; then log "Installing kubectl..." 
@@ -135,6 +135,7 @@ sleep 3 log "๐Ÿ‘ท Building Usernetes container image 'usernetes_node'" ${container_runtime_path} build --userns-uid-map=0:0:1 --userns-uid-map=1:1:1999 --userns-uid-map=65534:2000:2 -f $(pwd)/Dockerfile -t usernetes_node $(pwd) +${container_runtime_path} build --userns-uid-map=0:0:1 --userns-uid-map=1:1:1999 --userns-uid-map=65534:2000:2 -f $(pwd)/Dockerfile.dra -t usernetes_dra $(pwd) cleanup() { log "๐Ÿงน Cleaning up old networks or volumes (best effort)"