Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions Dockerfile.dra
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Build the DraNet DRA driver image for Usernetes. The /dranet binary is
# taken from the upstream release image; the runtime layer reproduces the
# kind node environment so kernel/RDMA tooling matches the node container.
FROM ghcr.io/google/dranet:v0.4.0 AS builder

# podman build --userns-uid-map=0:0:1 --userns-uid-map=1:1:1999 --userns-uid-map=65534:2000:2 -f $(pwd)/Dockerfile.dra -t usernetes_dra $(pwd)
# docker build -t ghcr.io/converged-computing/usernetes:dra .
# make up && make kubeadm-init && make dra && make kubeconfig && kubectl apply -f Makefile.d/dra/rbac.yaml && podman restart dranet-driver

# We can likely use ubuntu:24.04, I am reproducing the node environment as much as possible.
FROM docker.io/kindest/node:v1.33.0@sha256:91e9ed777db80279c22d1d1068c091b899b2078506e4a0f797fbf6e397c0b0b2

# Site (LLNL) CA certificates must be trusted before apt can fetch over the
# intercepting proxy, so install/refresh them before the main package install.
COPY cspca.llnl.gov.cer.pem /usr/local/share/ca-certificates/
COPY cspca.cer.pem /usr/local/share/ca-certificates/

# Single apt layer: one `apt-get update`, CA refresh in between, and the
# package lists removed at the end to keep the image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends ca-certificates && \
    update-ca-certificates && \
    apt-get install -y --no-install-recommends kmod libibverbs-dev librdmacm-dev rdma-core libnl-3-dev ibverbs-utils libnl-route-3-dev && \
    rm -rf /var/lib/apt/lists/*

COPY --from=builder /dranet /dranet
ENTRYPOINT ["/dranet"]
10 changes: 10 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ up: check-preflight
# Podman creates cni files in a shared location, this ensures unique names that do not clobber one another
sed -i "s/default_network/$(HOSTNAME)/g" docker-compose.yaml
$(COMPOSE) up --build -d
$(NODE_SHELL) /bin/bash /usernetes/Makefile.d/dra/update-containerd.sh

.PHONY: down
down:
Expand Down Expand Up @@ -111,6 +112,15 @@ ifeq ($(shell command -v kubectl 2> /dev/null),)
@echo "make kubectl"
endif

# Apply RBAC so the DraNet driver (running with the admin kubeconfig) can
# manage resource.k8s.io objects inside the cluster.
.PHONY: dra
dra:
	$(NODE_SHELL) kubectl apply -f /usernetes/Makefile.d/dra/rbac.yaml
# These are needed if you want ibv_devinfo to work
#$(CONTAINER_ENGINE) exec dranet-driver mkdir -p /etc/libibverbs.d
#$(CONTAINER_ENGINE) exec dranet-driver echo "driver mlx4" >> /etc/libibverbs.d/mlx4.driver
#$(CONTAINER_ENGINE) exec dranet-driver echo "driver mlx5" >> /etc/libibverbs.d/mlx5.driver


.PHONY: kubectl
kubectl:
$(COMPOSE) exec -T --workdir=/usr/bin $(NODE_SERVICE_NAME) tar c kubectl | tar xv
Expand Down
21 changes: 21 additions & 0 deletions Makefile.d/dra/rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# RBAC for the DraNet DRA driver: the driver authenticates with the admin
# kubeconfig (user kubernetes-admin), so grant that user full control over
# the DRA (resource.k8s.io) objects it publishes and consumes.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: dranet-admin-role
rules:
  # ResourceSlices are published by the driver; claims/templates are
  # created by workloads and reconciled by the driver.
  - apiGroups: ["resource.k8s.io"]
    resources: ["resourceslices", "resourceclaims", "resourceclaimtemplates"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: dranet-admin-binding
subjects:
  # NOTE(review): binds the shared kubernetes-admin user rather than a
  # dedicated ServiceAccount — acceptable for this single-user setup.
  - kind: User
    name: kubernetes-admin
    apiGroup: rbac.authorization.k8s.io
roleRef:
  kind: ClusterRole
  name: dranet-admin-role
  apiGroup: rbac.authorization.k8s.io
14 changes: 14 additions & 0 deletions Makefile.d/dra/update-containerd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# Enable containerd's NRI plugin (needed by the DraNet DRA driver) if the
# config does not already mention it, then restart containerd so the new
# stanza takes effect. Idempotent: a second run is a no-op.
set -o errexit
set -o pipefail
set -o nounset
set -x

config=/etc/containerd/config.toml

if grep -q "io.containerd.nri.v1.nri" "${config}"; then
    echo "containerd config contains NRI reference already; taking no action"
else
    echo "containerd config does not mention NRI, thus enabling it"
    # Append the NRI plugin stanza verbatim (same bytes the old printf wrote).
    cat >> "${config}" <<'EOF'
[plugins."io.containerd.nri.v1.nri"]
 disable = false
 disable_connections = false
 plugin_config_path = "/etc/nri/conf.d"
 plugin_path = "/opt/nri/plugins"
 plugin_registration_timeout = "5s"
 plugin_request_timeout = "5s"
 socket_path = "/var/run/nri/nri.sock"
EOF
    echo "restarting containerd"
    systemctl restart containerd
fi
29 changes: 27 additions & 2 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ services:
networks:
default_network:
ipv4_address: ${NODE_IP}
expose:
- "${PORT_KUBE_APISERVER}"
ports:
# <host>:<container>
# etcd (default: 2379)
Expand All @@ -27,8 +29,7 @@ services:
- node-var:/var
- node-opt:/opt
- node-etc:/etc
- type: tmpfs
target: /run
- node-run:/run
- type: tmpfs
target: /tmp
working_dir: /usernetes
Expand All @@ -45,6 +46,29 @@ services:
"nerdctl/bypass4netns": "${BYPASS4NETNS:-false}"
"nerdctl/bypass4netns-ignore-bind": "true"
"nerdctl/bypass4netns-ignore-subnets": "${BYPASS4NETNS_IGNORE_SUBNETS:-}"

# DraNet DRA driver sidecar service. Runs privileged on the host network so
# it can enumerate and manage the host's RDMA/network devices, and shares
# the node's /etc and /run volumes to read the kubeadm admin kubeconfig.
dranet:
  # Built locally by `podman build ... -t usernetes_dra`; Compose has no
  # `imagePullPolicy` key (that is a Kubernetes field) and the image is
  # never pulled, so no pull policy is set here.
  image: usernetes_dra
  container_name: dranet-driver
  privileged: true
  restart: always
  # DraNet needs to see the host network to manage its devices
  network_mode: "host"
  volumes:
    - /boot:/boot:ro
    - /lib/modules:/lib/modules:ro
    - node-var:/var
    - node-run:/run:ro
    - node-etc:/etc
  extra_hosts:
    # We need to be able to see the api server
    - "${NODE_NAME}:${HOST_IP}"
  command:
    - --kubeconfig=/etc/kubernetes/admin.conf
    - --hostname-override=${NODE_NAME}
    - --filter=true

networks:
default_network:
ipam:
Expand All @@ -53,6 +77,7 @@ networks:
# The node IP here is not accessible from other nodes.
- subnet: ${NODE_SUBNET}
volumes:
node-run: {}
node-var: {}
node-opt: {}
node-etc: {}
5 changes: 5 additions & 0 deletions kubeadm-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ apiServer:
extraArgs:
- name: etcd-servers
value: https://127.0.0.1:${PORT_ETCD}
- name: runtime-config
value: "api/beta=true"
- name: advertise-address
value: ${HOST_IP}
- name: secure-port
Expand Down Expand Up @@ -42,6 +44,8 @@ apiVersion: kubelet.config.k8s.io/v1beta1
failSwapOn: false
port: ${PORT_KUBELET}
featureGates:
DRAResourceClaimDeviceStatus: true
DynamicResourceAllocation: true
KubeletInUserNamespace: true
---
apiVersion: kubeproxy.config.k8s.io/v1alpha1
Expand All @@ -54,3 +58,4 @@ conntrack:
tcpEstablishedTimeout: 0s
# Skip setting "net.netfilter.nf_conntrack_tcp_timeout_close"
tcpCloseWaitTimeout: 0s

41 changes: 40 additions & 1 deletion service/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,19 @@ Request a flux alloc for the control plane and a worker, for however many minute
flux alloc --bg -N2 -q pbatch -t 8h
```

You can use the WIP testing script to bring up control plane and worker nodes:

```bash
bash start-usernetes.sh
```

The script will shell you into the control plane node after starting.


### Control Plane

If you can't start services via ssh:

```bash
# For the control plane - start
ssh <allocated node>
Expand All @@ -25,10 +36,31 @@ systemctl --user status usernetes-control-plane
# check log in /usr/workspace/usernetes/control-plane.log
```

Importantly, in the above you need a podman-compose that has the line to add a label for `PODMAN_SYSTEMD_UNIT` commented out. If when you are in the usernetes kubelet container (`make shell`) or a container and `ulimit -l` is not unlimited, Infiniband is unlikely to work.
If you can:

```bash
ssh corona190 systemctl --user start usernetes-control-plane
ssh corona190 systemctl --user status usernetes-control-plane
```

When /tmp/$USER/usernetes/source_env.sh exists, you can start the worker node. Check with:

```
ssh corona190 ls /tmp/$USER/usernetes
```
You can also look at the log file. Look for this line at the very end:

```
cat /usr/workspace/usernetes/control-plane.log
2025-07-13 22:29:29 - INFO - 🚀 Service will now idle indefinitely. Process ID: 819551
```

Then you can start the worker(s). And importantly, in the above you need a podman-compose that has the line to add a label for `PODMAN_SYSTEMD_UNIT` commented out. If when you are in the usernetes kubelet container (`make shell`) or a container and `ulimit -l` is not unlimited, Infiniband is unlikely to work.

### Worker

Without services enabled via ssh:

```bash
ssh corona190
rm -rf /usr/workspace/usernetes/worker.log
Expand All @@ -37,6 +69,13 @@ systemctl --user status usernetes-worker
# check log in /usr/workspace/usernetes/worker.log
```

Or with:

```bash
ssh corona192 systemctl --user start usernetes-worker
ssh corona192 systemctl --user status usernetes-worker
```

Back on the control plane (if everything looks good) we can go to the copied control plane directory, source a file to get kubectl and the correct paths, and see our cluster.

```bash
Expand Down
72 changes: 72 additions & 0 deletions service/start-usernetes.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/bin/bash
# Bring up a Usernetes cluster on the nodes of a flux allocation: start the
# control plane on the first node, workers on the rest, install flannel,
# then shell the user into the control plane node.
#
# Usage: start-usernetes.sh [jobid]
#   jobid - flux job id of the allocation; defaults to the last submitted job.

jobid=${1:-}

# If we aren't provided with an id assume the last submit
if [[ -z "${jobid}" ]]; then
    jobid=$(flux job last)
fi

echo "Jobid to start usernetes is ${jobid}"

# This is the nodelist - can be a single node or multiple.
# Fix: query the requested jobid (the old code re-queried `flux job last`
# and silently ignored the argument).
nodelist=$(flux jobs "${jobid}" --json | jq -r .nodelist)
nodelist=($(flux hostlist --expand $nodelist))
control_plane_node=${nodelist[0]}
worker_nodes=${nodelist[@]:1}

# This currently assumes one usernetes job running.
# We will want a way to have a custom logging file.
# We can likely remove this and have logs with service on node.
rm -rf /usr/workspace/usernetes/control-plane.log

# Start the control plane
ssh $control_plane_node systemctl --user start usernetes-control-plane
echo "Log for control plane will be in /usr/workspace/usernetes/control-plane.log"

# The control plane is ready when this file exists
while true
do
    if ssh $control_plane_node "test -f /tmp/$USER/usernetes/source_env.sh"; then
        echo "Usernetes control plane is ready."
        ssh $control_plane_node systemctl --user status usernetes-control-plane
        break
    fi
    sleep 3
done

# Start worker nodes
for worker_node in ${worker_nodes[@]}
do
    ssh $worker_node systemctl --user start usernetes-worker
done

# Wait until EVERY worker is ready (the old loop checked each worker at
# most once and stopped at the first ready one).
for worker_node in ${worker_nodes[@]}
do
    until ssh $worker_node "test -f /tmp/$USER/usernetes/source_env.sh"
    do
        sleep 3
    done
    echo "Worker ${worker_node} is ready."
done

# Show the nodes. ssh does not honor cd to different directory
ssh $control_plane_node /bin/bash -c "cd /tmp/$USER/usernetes/ && . /tmp/$USER/usernetes/source_env.sh && kubectl get nodes"

# Install flannel and sync ips
ssh $control_plane_node /bin/bash -c "cd /tmp/$USER/usernetes/ && . /tmp/$USER/usernetes/source_env.sh && make -C /tmp/$USER/usernetes install-flannel"
ssh $control_plane_node /bin/bash -c "cd /tmp/$USER/usernetes/ && . /tmp/$USER/usernetes/source_env.sh && make -C /tmp/$USER/usernetes sync-external-ip"

# Shell in.
echo "Shelling into Usernetes control plane. Change directory to /tmp/$USER/usernetes and source_env.sh to use kubectl"
ssh $control_plane_node
38 changes: 25 additions & 13 deletions service/usernetes-start-control-plane.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,24 @@ set -euo pipefail
# These are variables we likely will change
# LC only supplies podman
USERNETES_CONTAINER_TECH=${1:-"podman"}
USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-06-26-2025
USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-dra

# We will copy join command here
shared_join_command_dir="/usr/workspace/usernetes"

# The user needs to run the setup script
USERNAME=$(whoami)

# Logging functions for consistency (like Akihiro!)
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1"
}

error_exit() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2
exit 1
}

# This is way a lot for just deriving home, but I'm not convinced it will always
# be defined in the environment
if [[ -z "${HOME:-}" || ! -d "${HOME}" ]]; then
Expand All @@ -37,16 +47,6 @@ which podman-compose
# We don't want to use /var because that is a memory based fs
export TMPDIR="/tmp/${USERNAME}"

# Logging functions for consistency (like Akihiro!)
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1"
}

error_exit() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2
exit 1
}

install_kubectl() {
if ! command -v kubectl > /dev/null; then
log "Installing kubectl..."
Expand Down Expand Up @@ -126,11 +126,12 @@ unshare_cleanup
# Usernetes Specific Setup
log "📂 Copying Usernetes template from ${USERNETES_TEMPLATE_PATH}"
cp -R "${USERNETES_TEMPLATE_PATH}" "${TMPDIR}/usernetes"
cd "${TMPDIR}/usernetes" # Now inside the copied template
sleep 3 # Allow filesystem operations to settle if needed
cd "${TMPDIR}/usernetes"
sleep 3

log "👷 Building Usernetes container image 'usernetes_node'"
${container_runtime_path} build --userns-uid-map=0:0:1 --userns-uid-map=1:1:1999 --userns-uid-map=65534:2000:2 -f $(pwd)/Dockerfile -t usernetes_node $(pwd)
${container_runtime_path} build --userns-uid-map=0:0:1 --userns-uid-map=1:1:1999 --userns-uid-map=65534:2000:2 -f $(pwd)/Dockerfile.dra -t usernetes_dra $(pwd)

cleanup() {
log "🧹 Cleaning up old networks or volumes (best effort)"
Expand All @@ -150,12 +151,19 @@ if ! make up; then
fi
sleep 3


log "🔐 Running kubeadm-init with 'make kubeadm-init'"
if ! make kubeadm-init; then
error_exit "Failed 'make kubeadm-init'."
fi
sleep 3

log "👾 Setting up dynamic resource allocation"
if ! make dra; then
error_exit "Failed 'make dra'."
fi
sleep 3

log "🥷 Creating kubeconfig with 'make kubeconfig'"
if ! make kubeconfig; then
error_exit "Failed 'make kubeconfig'."
Expand All @@ -171,6 +179,10 @@ chmod 600 "${KUBECONFIG}"
# source <(kubectl completion bash)
sleep 3

log "🥷 Install rbac for dranet-driver"
kubectl apply -f Makefile.d/dra/rbac.yaml
${container_runtime_path} restart dranet-driver

# Get control plane node name robustly
log "🍑 Untainting control plane and labeling node"
control_plane_node=""
Expand Down
Loading