Pin the CUDA base image to 12.4.1 and replace the nerdctl-based pod lookup
with scripts/monitor.sh, which prints GPU memory per compute process next to
the model IDs found in that process's environment.

Review notes:
* Leading whitespace on context and removed lines was reconstructed; check it
  against the working tree before applying.
* The blob hash on the scripts/monitor.sh index line was not recomputed.

diff --git a/chart/templates/daemonset.yaml b/chart/templates/daemonset.yaml
index 0f4c30e..d5ac519 100644
--- a/chart/templates/daemonset.yaml
+++ b/chart/templates/daemonset.yaml
@@ -34,7 +34,7 @@ spec:
 {{/* privileged: true*/}}
 {{/* runAsNonRoot: false*/}}
 {{/* runAsUser: 0*/}}
-          image: "nvcr.io/nvidia/cuda:12.5.0-runtime-ubuntu22.04"
+          image: "nvcr.io/nvidia/cuda:12.4.1-runtime-ubuntu22.04"
           volumeMounts:
             - name: host
               mountPath: /host
@@ -44,4 +44,4 @@ spec:
       volumes:
         - name: host
           hostPath:
-            path: /
\ No newline at end of file
+            path: /
diff --git a/nvidia-monitor.Dockerfile b/nvidia-monitor.Dockerfile
index 8e8e7af..f50f6af 100644
--- a/nvidia-monitor.Dockerfile
+++ b/nvidia-monitor.Dockerfile
@@ -1,9 +1,7 @@
-FROM nvcr.io/nvidia/cuda:12.5.0-runtime-ubuntu22.04
+FROM nvcr.io/nvidia/cuda:12.4.1-runtime-ubuntu22.04
 
-# "https://github.com/containerd/nerdctl/releases/download/v1.7.5/nerdctl-full-1.7.5-linux-amd64.tar.gz" \
-RUN wget -qO nerdctl.tar.gz "https://github.com/containerd/nerdctl/releases/download/v1.7.6/nerdctl-1.7.6-linux-amd64.tar.gz" \
-    && tar Cxzvvf /usr/local nerdctl.tar.gz \
-    && rm nerdctl.tar.gz
+WORKDIR /root
+COPY scripts/monitor.sh monitor.sh
 
-# for i in $(nerdctl -a /host/run/containerd/containerd.sock -n k8s.io container ls --format "{{.ID}}"); do nerdctl -a /host/run/containerd/containerd.sock -n k8s.io inspect -f '{{.State.Pid}} {{index .Config.Labels "io.kubernetes.pod.name"}}' $i; done | grep gooey-gpu
+# ./monitor.sh
 # nvidia-smi
diff --git a/scripts/monitor.sh b/scripts/monitor.sh
new file mode 100755
index 0000000..3858234
--- /dev/null
+++ b/scripts/monitor.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+#
+# List every GPU compute process reported by nvidia-smi as a table of
+# PID, model IDs and GPU memory. The model IDs come from the
+# $ENV_VAR_FILTER variable in the process environment; when that is not
+# set, the celery worker name from the command line is shown instead.
+
+set -e
+
+ENV_VAR_FILTER="MODEL_IDS"
+
+# print3 PID LABEL MEM: print one fixed-width table row, dropping a
+# trailing comma from LABEL.
+function print3() {
+  printf "%-8s %-80s %12s\n" "$1" "${2%,}" "$3"
+}
+
+print3 "PID" "Model IDs" "GPU Mem"
+
+nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader,nounits \
+  | while IFS=',' read -r pid mem; do
+    pid="${pid//[[:space:]]/}"
+    mem="${mem//[[:space:],]/}"
+    # Skip rows without a numeric PID rather than handing them to ps.
+    [[ "$pid" =~ ^[0-9]+$ ]] || continue
+    # ps exits non-zero when the PID is gone or not visible from here;
+    # without `|| true`, `set -e` would end the loop at that row.
+    cmd_env=$(ps eww -p "$pid" -o command= || true)
+    val=$(printf '%s\n' "$cmd_env" | tr ' ' '\n' | grep "$ENV_VAR_FILTER=" | cut -d= -f2- | tr '\n' ',')
+    if [ -z "$val" ]; then
+      val=$(printf '%s\n' "$cmd_env" | sed -nE 's/.*celery@([^:]+):MainProcess.*/\1/p')
+    fi
+    # nvidia-smi reports used_memory in MiB; pass it with -v so a
+    # non-numeric value is never parsed as awk source.
+    if [[ "$mem" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
+      mem_gib="$(awk -v mib="$mem" 'BEGIN { printf "%.2f", mib / 1024 }')GiB"
+    else
+      mem_gib="N/A"
+    fi
+    print3 "$pid" "${val%,}" "$mem_gib"
+  done