From fa347b6cd232ac9ae9bf56615b530ae38fd9820a Mon Sep 17 00:00:00 2001 From: vsoch Date: Tue, 28 Apr 2026 11:18:02 -0700 Subject: [PATCH 1/4] feat: untangle specific cluster build needs from ci here Signed-off-by: vsoch --- .github/workflows/build-deploy.yaml | 60 ++++++++++++++++++++++++ Dockerfile | 37 +-------------- Dockerfile.d/Dockerfile.base | 36 ++++++++++++++ docker-compose.yaml | 4 +- hack/create-cluster-lima.sh | 3 ++ hack/test-smoke.sh | 9 ++++ service/usernetes-start-control-plane.sh | 20 ++++---- service/usernetes-start-worker.sh | 21 +++++---- 8 files changed, 134 insertions(+), 56 deletions(-) create mode 100644 .github/workflows/build-deploy.yaml create mode 100644 Dockerfile.d/Dockerfile.base diff --git a/.github/workflows/build-deploy.yaml b/.github/workflows/build-deploy.yaml new file mode 100644 index 00000000..10870354 --- /dev/null +++ b/.github/workflows/build-deploy.yaml @@ -0,0 +1,60 @@ +name: Docker Build and Deploy + +on: + push: + branches: + - develop + pull_request: {} + +env: + REGISTRY: ghcr.io + IMAGE_NAME: converged-computing/usernetes + +jobs: + build-and-push: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to the Container registry + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + file: Dockerfile.d/Dockerfile.base + tags: | + # Set node-base as the primary tag for the main branch + type=raw,value=node-base,enable=${{ github.ref == 'refs/heads/main' }} + # Add SHA tag for traceability + type=sha,format=short + # Tag PRs with the PR number + type=ref,event=pr + + - 
name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + file: Dockerfile.d/Dockerfile.base + context: . + # Only push if it's NOT a pull request + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + # Use GitHub Actions cache to speed up builds + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/Dockerfile b/Dockerfile index a864e8bc..5f25c810 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,40 +1,7 @@ -ARG BASE_IMAGE=docker.io/kindest/node:v1.33.0@sha256:91e9ed777db80279c22d1d1068c091b899b2078506e4a0f797fbf6e397c0b0b2 -ARG CNI_PLUGINS_VERSION=v1.7.1 -ARG HELM_VERSION=v3.17.3 -ARG FLANNEL_VERSION=v0.26.7 +ARG BASE_IMAGE=ghcr.io/converged-computing/usernetes:node-base +# Edit this image to add / adopt for your environment FROM ${BASE_IMAGE} -COPY Dockerfile.d/SHA256SUMS.d/ /tmp/SHA256SUMS.d -ARG CNI_PLUGINS_VERSION -ARG HELM_VERSION -ARG FLANNEL_VERSION # This are private on our cluster and need to be copied to here COPY cspca.llnl.gov.cer.pem /usr/local/share/ca-certificates/ COPY cspca.cer.pem /usr/local/share/ca-certificates/ RUN update-ca-certificates -RUN arch="$(uname -m | sed -e s/x86_64/amd64/ -e s/aarch64/arm64/)" && \ - fname="cni-plugins-linux-${arch}-${CNI_PLUGINS_VERSION}.tgz" && \ - curl --insecure -o "${fname}" -fSL "https://github.com/containernetworking/plugins/releases/download/${CNI_PLUGINS_VERSION}/${fname}" && \ - grep "${fname}" "/tmp/SHA256SUMS.d/cni-plugins-${CNI_PLUGINS_VERSION}" | sha256sum -c && \ - mkdir -p /opt/cni/bin && \ - tar xzf "${fname}" -C /opt/cni/bin && \ - rm -f "${fname}" && \ - fname="helm-${HELM_VERSION}-linux-${arch}.tar.gz" && \ - curl --insecure -o "${fname}" -fSL "https://get.helm.sh/${fname}" && \ - grep "${fname}" "/tmp/SHA256SUMS.d/helm-${HELM_VERSION}" | sha256sum -c && \ - tar xzf "${fname}" -C /usr/local/bin --strip-components=1 -- "linux-${arch}/helm" && \ - rm -f "${fname}" && \ - 
fname="flannel.tgz" && \ - curl --insecure -o "${fname}" -fSL "https://github.com/flannel-io/flannel/releases/download/${FLANNEL_VERSION}/${fname}" && \ - grep "${fname}" "/tmp/SHA256SUMS.d/flannel-${FLANNEL_VERSION}" | sha256sum -c && \ - tar xzf "${fname}" -C / && \ - rm -f "${fname}" -# gettext-base: for `envsubst` -# moreutils: for `sponge` -# socat: for `socat` (to silence "[WARNING FileExisting-socat]" from kubeadm) -RUN apt-get update && apt-get install -y --no-install-recommends \ - gettext-base \ - moreutils \ - socat -ADD Dockerfile.d/etc_udev_rules.d_90-flannel.rules /etc/udev/rules.d/90-flannel.rules -ADD Dockerfile.d/u7s-entrypoint.sh / -ENTRYPOINT ["/u7s-entrypoint.sh", "/usr/local/bin/entrypoint", "/sbin/init"] diff --git a/Dockerfile.d/Dockerfile.base b/Dockerfile.d/Dockerfile.base new file mode 100644 index 00000000..b411a36e --- /dev/null +++ b/Dockerfile.d/Dockerfile.base @@ -0,0 +1,36 @@ +ARG BASE_IMAGE=docker.io/kindest/node:v1.33.0@sha256:91e9ed777db80279c22d1d1068c091b899b2078506e4a0f797fbf6e397c0b0b2 +ARG CNI_PLUGINS_VERSION=v1.7.1 +ARG HELM_VERSION=v3.17.3 +ARG FLANNEL_VERSION=v0.26.7 +FROM ${BASE_IMAGE} +COPY Dockerfile.d/SHA256SUMS.d/ /tmp/SHA256SUMS.d +ARG CNI_PLUGINS_VERSION +ARG HELM_VERSION +ARG FLANNEL_VERSION +RUN arch="$(uname -m | sed -e s/x86_64/amd64/ -e s/aarch64/arm64/)" && \ + fname="cni-plugins-linux-${arch}-${CNI_PLUGINS_VERSION}.tgz" && \ + curl --insecure -o "${fname}" -fSL "https://github.com/containernetworking/plugins/releases/download/${CNI_PLUGINS_VERSION}/${fname}" && \ + grep "${fname}" "/tmp/SHA256SUMS.d/cni-plugins-${CNI_PLUGINS_VERSION}" | sha256sum -c && \ + mkdir -p /opt/cni/bin && \ + tar xzf "${fname}" -C /opt/cni/bin && \ + rm -f "${fname}" && \ + fname="helm-${HELM_VERSION}-linux-${arch}.tar.gz" && \ + curl --insecure -o "${fname}" -fSL "https://get.helm.sh/${fname}" && \ + grep "${fname}" "/tmp/SHA256SUMS.d/helm-${HELM_VERSION}" | sha256sum -c && \ + tar xzf "${fname}" -C /usr/local/bin 
--strip-components=1 -- "linux-${arch}/helm" && \ + rm -f "${fname}" && \ + fname="flannel.tgz" && \ + curl --insecure -o "${fname}" -fSL "https://github.com/flannel-io/flannel/releases/download/${FLANNEL_VERSION}/${fname}" && \ + grep "${fname}" "/tmp/SHA256SUMS.d/flannel-${FLANNEL_VERSION}" | sha256sum -c && \ + tar xzf "${fname}" -C / && \ + rm -f "${fname}" +# gettext-base: for `envsubst` +# moreutils: for `sponge` +# socat: for `socat` (to silence "[WARNING FileExisting-socat]" from kubeadm) +RUN apt-get update && apt-get install -y --no-install-recommends \ + gettext-base \ + moreutils \ + socat ipset wget +ADD Dockerfile.d/etc_udev_rules.d_90-flannel.rules /etc/udev/rules.d/90-flannel.rules +ADD Dockerfile.d/u7s-entrypoint.sh / +ENTRYPOINT ["/u7s-entrypoint.sh", "/usr/local/bin/entrypoint", "/sbin/init"] diff --git a/docker-compose.yaml b/docker-compose.yaml index 1df41123..a4ad40a8 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -3,7 +3,9 @@ --- services: node: - image: usernetes_node + build: + context: . + dockerfile: Dockerfile.d/Dockerfile.base hostname: ${NODE_NAME} privileged: true restart: always diff --git a/hack/create-cluster-lima.sh b/hack/create-cluster-lima.sh index ba3d230c..0fea2a5d 100755 --- a/hack/create-cluster-lima.sh +++ b/hack/create-cluster-lima.sh @@ -23,6 +23,9 @@ fi for host in host0 host1; do # Set --plain to minimize Limaism ${LIMACTL} start --plain --network lima:user-v2 --name="${host}" ${LIMACTL_CREATE_ARGS} "${LIMA_TEMPLATE}" + echo "LISTING ${host}" + ${LIMACTL} shell "${host}" ls / + ${LIMACTL} shell "${host}" ls /home ${LIMACTL} copy -r "$(pwd)" "${host}:${guest_home}/usernetes" ${LIMACTL} shell "${host}" sudo CONTAINER_ENGINE="${CONTAINER_ENGINE}" "${guest_home}/usernetes/init-host/init-host.root.sh" # Terminate the current session so that the cgroup delegation takes an effect. This command exits with status 255 as SSH terminates. 
diff --git a/hack/test-smoke.sh b/hack/test-smoke.sh index 2a8680cb..8bf13bf7 100755 --- a/hack/test-smoke.sh +++ b/hack/test-smoke.sh @@ -54,6 +54,15 @@ EOF INFO "Waiting for 3 replicas to be ready" kubectl rollout status --timeout=5m statefulset + INFO "GET PODS" + kubectl get pods + INFO "DESCRIBE PODS" + kubectl describe pods + for name in $(kubectl get pods -o json | jq -r .items[].metadata.name) + do + kubectl logs $name + done + INFO "Connecting to dnstest-{0,1,2}.dnstest.default.svc.cluster.local" kubectl run -i --rm --image=alpine --restart=Never dnstest-shell -- sh -exc 'for f in $(seq 0 2); do wget -O- http://dnstest-${f}.dnstest.default.svc.cluster.local; done' diff --git a/service/usernetes-start-control-plane.sh b/service/usernetes-start-control-plane.sh index 2be076d7..3eb884d5 100755 --- a/service/usernetes-start-control-plane.sh +++ b/service/usernetes-start-control-plane.sh @@ -10,6 +10,16 @@ USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-06-26-2025 # We will copy join command here shared_join_command_dir="/usr/workspace/usernetes" +# Logging functions for consistency (like Akihiro!) +log() { + echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1" +} + +error_exit() { + echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2 + exit 1 +} + # The user needs to run the setup script USERNAME=$(whoami) @@ -37,16 +47,6 @@ which podman-compose # We don't want to use /var because that is a memory based fs export TMPDIR="/tmp/${USERNAME}" -# Logging functions for consistency (like Akihiro!) -log() { - echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1" -} - -error_exit() { - echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2 - exit 1 -} - install_kubectl() { if ! command -v kubectl > /dev/null; then log "Installing kubectl..." 
diff --git a/service/usernetes-start-worker.sh b/service/usernetes-start-worker.sh index 709585d4..276984d0 100755 --- a/service/usernetes-start-worker.sh +++ b/service/usernetes-start-worker.sh @@ -7,6 +7,17 @@ set -euo pipefail USERNETES_CONTAINER_TECH=${1:-"podman"} USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-06-26-2025 +# Logging functions for consistency (like Akihiro!) +log() { + echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1" +} + +error_exit() { + echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2 + exit 1 +} + + # The join command needs to be here shared_join_command_dir="/usr/workspace/usernetes" if [ ! -f "${shared_join_command_dir}/join-command" ] @@ -38,16 +49,6 @@ log " Updated PATH: ${PATH}" # We don't want to use /var because that is a memory based fs export TMPDIR="/tmp/${USERNAME}" -# Logging functions for consistency (like Akihiro!) -log() { - echo "$(date '+%Y-%m-%d %H:%M:%S') - INFO - $1" -} - -error_exit() { - echo "$(date '+%Y-%m-%d %H:%M:%S') - ERROR - $1" >&2 - exit 1 -} - install_kubectl() { if ! command -v kubectl > /dev/null; then log "Installing kubectl..." 
From 65a875a3770258016d3f2be4bd70e4de088a998f Mon Sep 17 00:00:00 2001 From: Vanessa Sochat <814322+vsoch@users.noreply.github.com> Date: Tue, 28 Apr 2026 13:43:37 -0700 Subject: [PATCH 2/4] Change guest home directory to '/home/runner.guest' --- .github/workflows/main.yaml | 35 ++++++--------------- .github/workflows/reusable-single-node.yaml | 6 ---- Makefile | 6 ++-- hack/create-cluster-lima.sh | 5 +-- hack/test-smoke.sh | 24 +++++++++++++- 5 files changed, 40 insertions(+), 36 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 7efb874d..6075da33 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -2,15 +2,16 @@ name: Main on: [push, pull_request] jobs: - single-node: - name: "Single node" - strategy: - fail-fast: false - matrix: - container_engine: [docker, nerdctl, podman] - uses: ./.github/workflows/reusable-single-node.yaml - with: - container_engine: ${{ matrix.container_engine }} + # We would never use usernetes on a single node + #single-node: + # name: "Single node" + # strategy: + # fail-fast: false + # matrix: + # container_engine: [docker, nerdctl, podman] + # uses: ./.github/workflows/reusable-single-node.yaml + # with: + # container_engine: ${{ matrix.container_engine }} multi-node: name: "Multi node" @@ -30,19 +31,3 @@ jobs: with: lima_template: ${{ matrix.lima_template }} container_engine: ${{ matrix.container_engine }} - - # TODO: this test should create multiple instances of Usernetes on each of the hosts - multi-node-custom-ports: - name: "Multi node with custom service ports" - uses: ./.github/workflows/reusable-multi-node.yaml - with: - lima_template: "template://ubuntu-24.04" - container_engine: "docker" - # Defaults to 6443 - kube_apiserver_port: "8080" - # Defaults to 8472 - flannel_port: "9072" - # Defaults to 10250 - kubelet_port: "20250" - # Defaults to 2379 - etcd_port: "9090" diff --git a/.github/workflows/reusable-single-node.yaml 
b/.github/workflows/reusable-single-node.yaml index 28b70013..cd0a7f7d 100644 --- a/.github/workflows/reusable-single-node.yaml +++ b/.github/workflows/reusable-single-node.yaml @@ -83,9 +83,3 @@ jobs: - run: make kubeconfig - run: kubectl taint nodes --all node-role.kubernetes.io/control-plane- - run: ./hack/test-smoke.sh - - name: "Test data persistency after restarting the node" - run: | - make down - make up - sleep 30 - ./hack/test-smoke.sh diff --git a/Makefile b/Makefile index de258ddb..8ed4c7b6 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,8 @@ export PORT_KUBELET ?= 10250 export PORT_FLANNEL ?= 8472 export PORT_KUBE_APISERVER ?= 6443 +HERE := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) + # HOSTNAME is the name of the physical host export HOSTNAME ?= $(shell hostname) # HOST_IP is the IP address of the physical host. Accessible from other hosts. @@ -82,7 +84,7 @@ render: check-preflight .PHONY: up up: check-preflight # Podman creates cni files in a shared location, this ensures unique names that do not clobbed one another - sed -i "s/default_network/$(HOSTNAME)/g" docker-compose.yaml + sed -i "s/default_network/$(HOSTNAME)/g" $(HERE)/docker-compose.yaml $(COMPOSE) up --build -d .PHONY: down @@ -145,7 +147,7 @@ sync-external-ip: .PHONY: kubeadm-join kubeadm-join: # Our kernel is too old for usernetes, so we need this - sed -i "s/--token/--ignore-preflight-errors=all --token/g" join-command + sed -i "s/--token/--ignore-preflight-errors=all --token/g" $(HERE)/join-command $(NODE_SHELL) /bin/bash /usernetes/join-command @echo "# Run 'make sync-external-ip' on the control plane" diff --git a/hack/create-cluster-lima.sh b/hack/create-cluster-lima.sh index 0fea2a5d..7ccf1533 100755 --- a/hack/create-cluster-lima.sh +++ b/hack/create-cluster-lima.sh @@ -11,7 +11,7 @@ set -eux -o pipefail : "${PORT_FLANNEL:=8472}" : "${PORT_KUBELET:=10250}" -guest_home="/home/${USER}.linux" +guest_home="/home/runner.guest" if [ "$(id -u)" -le 1000 ]; then # In --plain mode, UID 
has to be >= 1000 to populate subuids @@ -49,7 +49,8 @@ done ${LIMACTL} shell host0 ${SERVICE_PORTS} CONTAINER_ENGINE="${CONTAINER_ENGINE}" make -C "${guest_home}/usernetes" kubeadm-init install-flannel kubeconfig join-command # Let host1 join the cluster -${LIMACTL} copy host0:~/usernetes/join-command host1:~/usernetes/join-command +${LIMACTL} copy host0:${guest_home}/usernetes/join-command ./join-command +${LIMACTL} copy ./join-command host1:${guest_home}/usernetes/join-command ${LIMACTL} shell host1 ${SERVICE_PORTS} CONTAINER_ENGINE="${CONTAINER_ENGINE}" make -C "${guest_home}/usernetes" kubeadm-join ${LIMACTL} shell host0 ${SERVICE_PORTS} CONTAINER_ENGINE="${CONTAINER_ENGINE}" make -C "${guest_home}/usernetes" sync-external-ip diff --git a/hack/test-smoke.sh b/hack/test-smoke.sh index 8bf13bf7..034bb6bd 100755 --- a/hack/test-smoke.sh +++ b/hack/test-smoke.sh @@ -56,16 +56,38 @@ EOF INFO "GET PODS" kubectl get pods + kubectl get pods -n kube-system INFO "DESCRIBE PODS" kubectl describe pods for name in $(kubectl get pods -o json | jq -r .items[].metadata.name) do kubectl logs $name + kubectl exec -it $name -- cat /etc/resolv.conf done + INFO "Patching CoreDNS to use 8.8.8.8" + kubectl get configmap coredns -n kube-system -o yaml | \ + sed 's/forward . \/etc\/resolv.conf/forward . 
8.8.8.8/' | \ + kubectl apply -f - + + INFO "Restarting CoreDNS" + kubectl delete pod -n kube-system -l k8s-app=kube-dns + kubectl rollout status deployment coredns -n kube-system + INFO "Connecting to dnstest-{0,1,2}.dnstest.default.svc.cluster.local" - kubectl run -i --rm --image=alpine --restart=Never dnstest-shell -- sh -exc 'for f in $(seq 0 2); do wget -O- http://dnstest-${f}.dnstest.default.svc.cluster.local; done' + kubectl run -i --rm --image=busybox:1.28 --restart=Never dnstest-shell -- sh -exc ' + echo "--- Resolv.conf ---" + cat /etc/resolv.conf + + echo "--- Testing External DNS (google.com) ---" + nslookup google.com || echo "External DNS Failed" + + echo "--- Testing Internal DNS (dnstest-0) ---" + nslookup dnstest-0.dnstest || echo "Internal DNS Failed" + for f in 0 1 2; do + wget -qO- http://dnstest-${f}.dnstest.default.svc.cluster.local + done' INFO "Deleting Service \"dnstest\"" kubectl delete service dnstest INFO "Deleting StatefulSet \"dnstest\"" From 29be7e936d44e2f08a46a2c4c5182749232c8c63 Mon Sep 17 00:00:00 2001 From: Vanessa Sochat <814322+vsoch@users.noreply.github.com> Date: Thu, 7 May 2026 11:21:14 -0700 Subject: [PATCH 3/4] test: build in makefile (#11) * test: build in makefile * test: add compose directory * build: podman does not allow pulling * test: nri plugin * fix: restore dockerfile base * bug: we should not overwrite storage.conf Signed-off-by: vsoch --- Dockerfile | 5 ++++- Makefile | 10 +++++++--- compose/prebuilt-node.yaml | 3 +++ service/usernetes-start-control-plane.sh | 19 ++++++++++++++----- service/usernetes-start-worker.sh | 10 ++++++++-- 5 files changed, 36 insertions(+), 11 deletions(-) create mode 100644 compose/prebuilt-node.yaml diff --git a/Dockerfile b/Dockerfile index 5f25c810..520c4cbd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,10 @@ -ARG BASE_IMAGE=ghcr.io/converged-computing/usernetes:node-base +# ARG BASE_IMAGE=ghcr.io/converged-computing/usernetes:node-base +ARG BASE_IMAGE=usernetes_base # Edit 
this image to add / adopt for your environment FROM ${BASE_IMAGE} # This are private on our cluster and need to be copied to here COPY cspca.llnl.gov.cer.pem /usr/local/share/ca-certificates/ COPY cspca.cer.pem /usr/local/share/ca-certificates/ +COPY PAN-cspca.llnl.gov.crt /usr/local/share/ca-certificates/ +COPY hpc-profile.json /var/lib/kubelet/seccomp/hpc-profile.json RUN update-ca-certificates diff --git a/Makefile b/Makefile index 8ed4c7b6..81203db7 100644 --- a/Makefile +++ b/Makefile @@ -24,9 +24,7 @@ export NODE_SUBNET ?= $(shell $(CURDIR)/Makefile.d/node-subnet.sh) export NODE_IP := $(subst .0/24,.100,$(NODE_SUBNET)) export CONTAINER_ENGINE ?= $(shell $(CURDIR)/Makefile.d/detect-container-engine.sh CONTAINER_ENGINE) - export CONTAINER_ENGINE_TYPE ?= $(shell $(CURDIR)/Makefile.d/detect-container-engine.sh CONTAINER_ENGINE_TYPE) - COMPOSE ?= $(shell $(CURDIR)/Makefile.d/detect-container-engine.sh COMPOSE) NODE_SERVICE_NAME := node @@ -85,7 +83,13 @@ render: check-preflight up: check-preflight # Podman creates cni files in a shared location, this ensures unique names that do not clobbed one another sed -i "s/default_network/$(HOSTNAME)/g" $(HERE)/docker-compose.yaml - $(COMPOSE) up --build -d + $(COMPOSE) up -d + +.PHONY: up-built +up-built: check-preflight + # Podman creates cni files in a shared location, this ensures unique names that do not clobbed one another + sed -i "s/default_network/$(HOSTNAME)/g" $(HERE)/docker-compose.yaml + $(COMPOSE) -f $(HERE)/docker-compose.yaml -f $(HERE)/compose/prebuilt-node.yaml up -d .PHONY: down down: diff --git a/compose/prebuilt-node.yaml b/compose/prebuilt-node.yaml new file mode 100644 index 00000000..27b475ac --- /dev/null +++ b/compose/prebuilt-node.yaml @@ -0,0 +1,3 @@ +services: + node: + image: usernetes_node diff --git a/service/usernetes-start-control-plane.sh b/service/usernetes-start-control-plane.sh index 3eb884d5..78825067 100755 --- a/service/usernetes-start-control-plane.sh +++ 
b/service/usernetes-start-control-plane.sh @@ -5,7 +5,7 @@ set -euo pipefail # These are variables we likely will change # LC only supplies podman USERNETES_CONTAINER_TECH=${1:-"podman"} -USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-06-26-2025 +USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-develop # We will copy join command here shared_join_command_dir="/usr/workspace/usernetes" @@ -101,9 +101,12 @@ mkdir -p "${XDG_RUNTIME_DIR}" setup_podman() { # These are likely to give issues. This resets podman with a vfs backend and then # cleans up tmp in the unshared context + if [[ -e "${HOME}/.config/containers/storage.conf" ]]; then + return + fi if [[ -x "/collab/usr/gapps/lcweg/containers/scripts/enable-podman.sh" ]]; then log " Running enable-podman.sh vfs" - if ! bash /collab/usr/gapps/lcweg/containers/scripts/enable-podman.sh vfs; then + if ! bash /collab/usr/gapps/lcweg/containers/scripts/enable-podman.sh overlay; then log " WARNING: enable-podman.sh script failed. Continuing, but podman might not be configured correctly." 
fi else @@ -126,8 +129,13 @@ unshare_cleanup # Usernetes Specific Setup log "๐Ÿ“‚ Copying Usernetes template from ${USERNETES_TEMPLATE_PATH}" cp -R "${USERNETES_TEMPLATE_PATH}" "${TMPDIR}/usernetes" -cd "${TMPDIR}/usernetes" # Now inside the copied template -sleep 3 # Allow filesystem operations to settle if needed + + # Now inside the copied template +cd "${TMPDIR}/usernetes" +sleep 3 + +log "๐Ÿ‘ท Building Usernetes container image 'usernetes_base'" +${container_runtime_path} build --userns-uid-map=0:0:1 --userns-uid-map=1:1:1999 --userns-uid-map=65534:2000:2 -f $(pwd)/Dockerfile.d/Dockerfile.base -t usernetes_base $(pwd) log "๐Ÿ‘ท Building Usernetes container image 'usernetes_node'" ${container_runtime_path} build --userns-uid-map=0:0:1 --userns-uid-map=1:1:1999 --userns-uid-map=65534:2000:2 -f $(pwd)/Dockerfile -t usernetes_node $(pwd) @@ -136,6 +144,7 @@ cleanup() { log "๐Ÿงน Cleaning up old networks or volumes (best effort)" make down-v || log " 'make down-v' failed, possibly because nothing was running. Continuing." + # Explicit cleanup, as 'make down-v' might not cover everything or could fail "${container_runtime_path}" network rm usernetes_default -f || log " Network 'usernetes_default' not found." "${container_runtime_path}" volume rm usernetes_node-var -f || log " Volume 'usernetes_node-var' not found." @@ -145,7 +154,7 @@ cleanup() { cleanup log " โฌ†๏ธ Bringing up the Usernetes node(s) with 'make up'" -if ! make up; then +if ! make up-built; then error_exit "Failed to bring up Usernetes with 'make up'." 
fi sleep 3 diff --git a/service/usernetes-start-worker.sh b/service/usernetes-start-worker.sh index 276984d0..eda0ae9f 100755 --- a/service/usernetes-start-worker.sh +++ b/service/usernetes-start-worker.sh @@ -5,7 +5,7 @@ set -euo pipefail # These are variables we likely will change # LC only supplies podman USERNETES_CONTAINER_TECH=${1:-"podman"} -USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-06-26-2025 +USERNETES_TEMPLATE_PATH=/usr/workspace/usernetes/usernetes-develop # Logging functions for consistency (like Akihiro!) log() { @@ -104,6 +104,9 @@ mkdir -p "${XDG_RUNTIME_DIR}" setup_podman() { # These are likely to give issues. This resets podman with a vfs backend and then # cleans up tmp in the unshared context + if [[ -e "${HOME}/.config/containers/storage.conf" ]]; then + return + fi if [[ -x "/collab/usr/gapps/lcweg/containers/scripts/enable-podman.sh" ]]; then log " Running enable-podman.sh vfs" if ! bash /collab/usr/gapps/lcweg/containers/scripts/enable-podman.sh vfs; then @@ -134,6 +137,9 @@ cp -R "${USERNETES_TEMPLATE_PATH}" "${TMPDIR}/usernetes" cd "${TMPDIR}/usernetes" sleep 3 +log "๐Ÿ‘ท Building Usernetes container image 'usernetes_base'" +${container_runtime_path} build --userns-uid-map=0:0:1 --userns-uid-map=1:1:1999 --userns-uid-map=65534:2000:2 -f $(pwd)/Dockerfile.d/Dockerfile.base -t usernetes_base $(pwd) + log "๐Ÿ‘ท Building Usernetes container image 'usernetes_node'" ${container_runtime_path} build --userns-uid-map=0:0:1 --userns-uid-map=1:1:1999 --userns-uid-map=65534:2000:2 -f $(pwd)/Dockerfile -t usernetes_node $(pwd) @@ -150,7 +156,7 @@ cleanup() { cleanup log " โฌ†๏ธ Bringing up the Usernetes node(s) with 'make up'" -if ! make up; then +if ! make up-built; then error_exit "Failed to bring up Usernetes with 'make up'." 
fi sleep 3 From aeaf88e5a3956f228c4f6edcbfd828b8a69e5a40 Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 7 May 2026 19:41:07 -0700 Subject: [PATCH 4/4] amd gpu: tested and organized We need to find the right base combination for pytorch. There is the dual challenge/complexity of matching old AMD gpus plus containers with builds that take particular patterns create whiteout or input/output errors in our setup. Signed-off-by: vsoch --- service/README.md | 62 +---------------------- service/gpus/README.md | 26 ++++++++++ service/gpus/pytorch-amd-interactive.yaml | 32 ++++++++++++ 3 files changed, 60 insertions(+), 60 deletions(-) create mode 100644 service/gpus/README.md create mode 100644 service/gpus/pytorch-amd-interactive.yaml diff --git a/service/README.md b/service/README.md index 9bd636e2..49b79182 100644 --- a/service/README.md +++ b/service/README.md @@ -61,68 +61,10 @@ u7s-corona190 Ready control-plane 5m v1.30.0 u7s-corona196 Ready 3m7s v1.30.0 ``` -Install the Flux Operator... +You can now install the Flux Operator and run experiments, or look at [using gpus](gpus). ```bash kubectl apply -f https://raw.githubusercontent.com/flux-framework/flux-operator/refs/heads/main/examples/dist/flux-operator.yaml ``` -Test away! Good luck. 
Other containers to try: - - -```bash -# testing bare metal - 53 seconds -flux run -N1 -n 48 /usr/workspace/usernetes/lammps/build/install/bin/lmp -v x 8 -v y 8 -v z 8 -in in.reaxc.hns -nocite - -# 2 nodes, 29 seconds -flux run -N2 -n 96 /usr/workspace/usernetes/lammps/build/install/bin/lmp -v x 8 -v y 8 -v z 8 -in in.reaxc.hns -nocite - -# mpirun with one node: 1:18s -/opt/toss/openmpi/4.1/gnu/bin/mpirun --allow-run-as-root --mca plm_rsh_agent "" -np 48 lmp -v x 8 -v y 8 -v z 8 -in in.reaxc.hns -nocite - -# OSU Latency (need to compare these two) -flux run -N2 -n2 osu_latency -flux run -N2 --env UCX_TLS=rc_x,sm,self --env OMPI_MCA_pml=ucx --env UCX_NET_DEVICES=mlx5_0:1 -n2 osu_latency - -# LAMMPS (many of these likely aren't required, we will learn with experiments) -export OMPI_MCA_opal_warn_on_missing_libcuda=0 -export OMPI_MCA_btl=^openib,self,vader -export OMPI_MCA_pml=ucx -export OMPI_MCA_osc=ucx -export UCX_TLS=all -flux run -N2 -opmi=pmi2 -n 96 lmp -v x 8 -v y 8 -v z 8 -in in.reaxc.hns -nocite - -export OMPI_MCA_pml=ucx -export UCX_MEMTYPE_CACHE=y -export UCX_LOG_LEVEL=DEBUG -export OMPI_MCA_btl="^openib,tcp" -flux run -N2 --env UCX_TLS=rc_x,sm,self --env OMPI_MCA_pml=ucx --env UCX_NET_DEVICES=mlx5_0:1 -n2 osu_latency - -# We also should test this - this helped on Azure -export UCX_IB_MLX5_DEVX=y - -export OMPI_MCA_opal_common_ucx_opal_mem_hooks=1 -export OMPI_MCA_btl_openib_allow_ib=true -export UCX_NET_DEVICES=mlx5_0:1 -export UCX_TLS=rc,sm,self -export OMPI_MCA_pml=ucx -export OMPI_MCA_osc=ucx -flux run -N2 -n96 lmp -v x 8 -v y 8 -v z 8 -in in.reaxc.hns -nocite -``` - -### GPUs - -You can install the [ROCm/k8s-device-plugin](https://github.com/ROCm/k8s-device-plugin) to expose GPU devices to your pods. 
- -```bash -# Install the driver plugin -kubectl create -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml - -# Create a test workflow that uses GPU (takes a bit to pull) -https://raw.githubusercontent.com/ROCm/k8s-device-plugin/763445e18f3838fa72b22e31a04ec25987334bff/example/pod/pytorch-non-privileged.yaml - -# Get logs (it takes a while to pull...) -kubectl logs alexnet-tf-gpu-pod alexnet-tf-gpu-container -``` - -Our final experiments will be done separately, and these notes likely cleaned up. +Test away! Good luck. diff --git a/service/gpus/README.md b/service/gpus/README.md new file mode 100644 index 00000000..30b17ecd --- /dev/null +++ b/service/gpus/README.md @@ -0,0 +1,26 @@ +# GPUs + +You can install the [ROCm/k8s-device-plugin](https://github.com/ROCm/k8s-device-plugin) to expose GPU devices to your pods. + +```bash +kubectl create -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml +``` + +# Create a test workflow that uses GPU (takes a bit to pull) + +```bash +# test rocminfo, or rocm-smi inside the pod +kubectl apply -f ./service/gpus/pytorch-amd-interactive.yaml +``` +When we can figure out the right container, this should work inside (latest segfaults, likely incompatible, and I have not been able to use older versions due to whiteout file issues). + +```python +import torch +if torch.cuda.is_available(): + print(f"GPU is available. 
Device count: {torch.cuda.device_count()}") + print(f"Device name: {torch.cuda.get_device_name(0)}") + x = torch.ones(3, 3, device='cuda') + y = torch.ones(3, 3, device='cuda') * 2 + z = x + y + print(f"Result of tensor addition on GPU: {z}") +``` diff --git a/service/gpus/pytorch-amd-interactive.yaml b/service/gpus/pytorch-amd-interactive.yaml new file mode 100644 index 00000000..6a464b39 --- /dev/null +++ b/service/gpus/pytorch-amd-interactive.yaml @@ -0,0 +1,32 @@ +apiVersion: v1 +kind: Pod +metadata: + name: pytorch-non-privileged-gpu-pod +spec: + restartPolicy: Never + hostIPC: true + volumes: + - name: dshm + emptyDir: + medium: Memory + containers: + - name: pytorch-gpu-container + volumeMounts: + - mountPath: /dev/shm + name: dshm + # Note, currently getting whiteout errors. The latest is incompatible with our old GPUs. + # This likely will work, but we need the right container. + image: rocm/pytorch:rocm5.4_ubuntu20.04_py3.8_pytorch_1.12.1 + # image: rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1 + # image: rocm/pytorch:latest + command: + - sleep + - infinity + securityContext: + privileged: false + allowPrivilegeEscalation: false + seccompProfile: + type: Unconfined + resources: + limits: + amd.com/gpu: 8