Add DCGM exporter
3XX0 authored and guptaNswati committed Jun 1, 2018
1 parent 6c04a3d commit 48701fa
Showing 8 changed files with 338 additions and 0 deletions.
9 changes: 9 additions & 0 deletions exporters/prometheus-dcgm/Makefile
@@ -0,0 +1,9 @@
install:
	install -d -m 755 /run/prometheus
	install -m 775 dcgm-exporter/dcgm-exporter /usr/local/bin/
	install -m 644 prometheus-dcgm.service /etc/systemd/system/

uninstall:
	rm -rf /run/prometheus
	rm -f /usr/local/bin/dcgm-exporter
	rm -f /etc/systemd/system/prometheus-dcgm.service
79 changes: 79 additions & 0 deletions exporters/prometheus-dcgm/README.md
@@ -0,0 +1,79 @@
# NVIDIA DCGM exporter for Prometheus

Simple script to export metrics from [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) to [Prometheus](https://prometheus.io/).

## DCGM-supported GPUs

Make sure every GPU on the system is supported by DCGM; otherwise, the script will fail.
```sh
$ docker run -d --runtime=nvidia --rm --name=nvidia-dcgm-exporter nvidia/dcgm-exporter
# The GPU counts reported by dcgmi discovery and nvidia-smi should be the same.
$ docker exec nvidia-dcgm-exporter dcgmi discovery -i a -v | grep -c 'GPU ID:'
$ nvidia-smi -L | wc -l
```
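
To automate the comparison, a minimal sketch (assuming the `nvidia-dcgm-exporter` container above is left running and `nvidia-smi` is on the host's PATH):
```sh
dcgm_gpus=$(docker exec nvidia-dcgm-exporter dcgmi discovery -i a -v | grep -c 'GPU ID:')
smi_gpus=$(nvidia-smi -L | wc -l)
if [ "${dcgm_gpus}" -eq "${smi_gpus}" ]; then
    echo "OK: all ${smi_gpus} GPUs are visible to DCGM"
else
    echo "Mismatch: DCGM sees ${dcgm_gpus} GPUs, the driver sees ${smi_gpus}" >&2
fi
```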

## Bare-metal install
```sh
# Download and install DCGM first, then:

$ sudo make install
$ sudo systemctl start prometheus-dcgm
```
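
Once the unit is active, the exporter writes its metrics to `/run/prometheus/dcgm.prom` (the script's default output file); a quick sanity check:
```sh
$ systemctl status prometheus-dcgm --no-pager
$ head /run/prometheus/dcgm.prom
```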

## Container install
```sh
$ docker-compose up
```
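
The compose file also brings up Prometheus (published on port 9090) and Grafana (port 3000); a couple of hedged liveness checks once the stack is up (assuming a Prometheus 2.x image, which exposes `/-/healthy`):
```sh
$ curl -s localhost:9090/-/healthy
$ curl -sI localhost:3000
```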

## Deploy on a Kubernetes cluster
```sh
# First, set the default runtime to nvidia on your GPU node by editing /etc/docker/daemon.json:

{
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "path": "/usr/bin/nvidia-container-runtime",
            "runtimeArgs": []
        }
    }
}

$ sudo systemctl daemon-reload
$ sudo systemctl restart docker
$ sudo systemctl restart kubelet
$ kubectl create -f dcgm-exporter-daemonset.yaml
```
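
Once the DaemonSet is created, one exporter pod should be scheduled per GPU node; a hedged spot check (assuming the DaemonSet's pods carry `dcgm-exporter` in their name):
```sh
$ kubectl get pods -o wide | grep dcgm-exporter
```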

## node-exporter

Merge the GPU metrics directly into node-exporter's output via its textfile collector.
```sh
$ docker run -d --runtime=nvidia --rm --name=nvidia-dcgm-exporter nvidia/dcgm-exporter
$ docker run -d --rm --net="host" --pid="host" --volumes-from nvidia-dcgm-exporter:ro quay.io/prometheus/node-exporter --collector.textfile.directory="/run/prometheus"

$ curl localhost:9100/metrics

# Sample output

# HELP dcgm_gpu_temp GPU temperature (in C).
# TYPE dcgm_gpu_temp gauge
dcgm_gpu_temp{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 34
# HELP dcgm_gpu_utilization GPU utilization (in %).
# TYPE dcgm_gpu_utilization gauge
dcgm_gpu_utilization{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 0
# HELP dcgm_power_usage Power draw (in W).
# TYPE dcgm_power_usage gauge
dcgm_power_usage{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 31.737
# HELP dcgm_sm_clock SM clock frequency (in MHz).
# TYPE dcgm_sm_clock gauge
dcgm_sm_clock{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 135
# HELP dcgm_total_energy_consumption Total energy consumption since boot (in mJ).
# TYPE dcgm_total_energy_consumption counter
dcgm_total_energy_consumption{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 7.824041e+06
# HELP dcgm_xid_errors Value of the last XID error encountered.
# TYPE dcgm_xid_errors gauge
dcgm_xid_errors{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 0
```
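
If a Prometheus server is scraping this node-exporter (for instance the one from the container install above, assumed here to listen on localhost:9090), the same series can be queried through its HTTP API:
```sh
$ curl -s 'localhost:9090/api/v1/query?query=dcgm_gpu_temp'
```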
15 changes: 15 additions & 0 deletions exporters/prometheus-dcgm/dcgm-exporter/Dockerfile
@@ -0,0 +1,15 @@
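# Hedged build-and-run sketch, from this directory, after downloading
# datacenter-gpu-manager_1.4.3_amd64.deb into the build context:
#   docker build --build-arg DCGM_VERSION=1.4.3 -t nvidia/dcgm-exporter:1.4.3 .
#   docker run -d --runtime=nvidia --rm -v /run/prometheus:/run/prometheus nvidia/dcgm-exporter:1.4.3
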
FROM ubuntu:16.04

ARG DCGM_VERSION=1.4.3

COPY datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb /tmp
RUN dpkg -i /tmp/*.deb && rm -f /tmp/*

COPY dcgm-exporter /usr/local/bin

ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES utility

VOLUME /run/prometheus

ENTRYPOINT [ "dcgm-exporter", "-e" ]
158 changes: 158 additions & 0 deletions exporters/prometheus-dcgm/dcgm-exporter/dcgm-exporter
@@ -0,0 +1,158 @@
#! /bin/bash

set -u

HOST_ENGINE=no
COLLECT_INTERVAL_MS=1000
OUTPUT_FILE=/run/prometheus/dcgm.prom

usage() {
    echo "usage: $0 [-h] [-e] [-o output_file] [-d collect_interval_ms (>=100)]" >&2
}
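
# Hedged usage examples:
#   dcgm-exporter -e                         # start an embedded host engine, default output file
#   dcgm-exporter -o /tmp/dcgm.prom -d 5000  # existing host engine, custom output, 5s interval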

options=$(getopt -o heo:d: -- "$@")
if [ $? -ne 0 ]; then
    usage && exit 1
fi
eval set -- "${options}"

while true; do
    case "$1" in
        -h) usage && exit 1;;
        -e) HOST_ENGINE=yes; shift;;
        -o) OUTPUT_FILE=$2; shift 2;;
        -d) COLLECT_INTERVAL_MS=$2; shift 2;;
        *) break;;
    esac
done

if [ "${COLLECT_INTERVAL_MS}" -lt 100 ]; then
usage && exit 1
fi

mkdir -p "$(dirname "${OUTPUT_FILE}")"
trap 'echo "Caught signal, terminating..."' HUP INT QUIT PIPE TERM

if [ "${HOST_ENGINE}" = "yes" ]; then
echo "Starting NVIDIA host engine..."
nv-hostengine 2> /dev/null
fi

echo "Collecting metrics at ${OUTPUT_FILE} every ${COLLECT_INTERVAL_MS}ms..."

dcgmi dmon -d "${COLLECT_INTERVAL_MS}" -e \
"54,"\
"100,101,"\
"140,150,"\
"155,156,"\
"200,201,202,"\
"203,204,206,207,"\
"230,240,241,242,243,244,245,246,"\
"251,252,"\
"310,311,312,313,"\
"390,391,392,"\
"409,419,429,439" | \
awk -v "out=${OUTPUT_FILE}" -v "ngpus=$(nvidia-smi -L | wc -l)" '
function metric(name, type, help, value) {
    if (value !~ "N/A") {
        if (gpu == 0) {
            printf "# HELP dcgm_%s %s\n", name, help > out".swp"
            printf "# TYPE dcgm_%s %s\n", name, type > out".swp"
        }
        printf "dcgm_%s{gpu=\"%s\",uuid=\"%s\"} %s\n", name, gpu, uuid, value > out".swp"
    }
}
(NF && NR > 2 && !($1 ~ "^#" || $1 ~ "^Id")) {
    # Labels
    i = 1
    gpu = $(i++)  # field 0 (implicit)
    uuid = $(i++) # field 54
    # Clocks
    metric("sm_clock", "gauge", "SM clock frequency (in MHz).", $(i++)) # field 100
    metric("memory_clock", "gauge", "Memory clock frequency (in MHz).", $(i++)) # field 101
    # Temperature
    metric("memory_temp", "gauge", "Memory temperature (in C).", $(i++)) # field 140
    metric("gpu_temp", "gauge", "GPU temperature (in C).", $(i++)) # field 150
    # Power
    metric("power_usage", "gauge", "Power draw (in W).", $(i++)) # field 155
    metric("total_energy_consumption", "counter", "Total energy consumption since boot (in mJ).", $(i++)) # field 156
    # PCIe
    metric("pcie_tx_throughput", "counter", "Total number of bytes transmitted through PCIe TX (in KB).", $(i++)) # field 200
    metric("pcie_rx_throughput", "counter", "Total number of bytes received through PCIe RX (in KB).", $(i++)) # field 201
    metric("pcie_replay_counter", "counter", "Total number of PCIe retries.", $(i++)) # field 202
    # Utilization (the sample period varies depending on the product)
    metric("gpu_utilization", "gauge", "GPU utilization (in %).", $(i++)) # field 203
    metric("mem_copy_utilization", "gauge", "Memory utilization (in %).", $(i++)) # field 204
    metric("enc_utilization", "gauge", "Encoder utilization (in %).", $(i++)) # field 206
    metric("dec_utilization", "gauge", "Decoder utilization (in %).", $(i++)) # field 207
    # Errors and violations
    metric("xid_errors", "gauge", "Value of the last XID error encountered.", $(i++)) # field 230
    metric("power_violation", "counter", "Throttling duration due to power constraints (in us).", $(i++)) # field 240
    metric("thermal_violation", "counter", "Throttling duration due to thermal constraints (in us).", $(i++)) # field 241
    metric("sync_boost_violation", "counter", "Throttling duration due to sync-boost constraints (in us).", $(i++)) # field 242
    metric("board_limit_violation", "counter", "Throttling duration due to board limit constraints (in us).", $(i++)) # field 243 FIXME
    metric("low_util_violation", "counter", "Throttling duration due to low utilization (in us).", $(i++)) # field 244
    metric("reliability_violation", "counter", "Throttling duration due to reliability constraints (in us).", $(i++)) # field 245 FIXME
    metric("app_clock_violation", "counter", "Total throttling duration (in us).", $(i++)) # field 246
    # Memory usage
    metric("fb_free", "gauge", "Framebuffer memory free (in MiB).", $(i++)) # field 251
    metric("fb_used", "gauge", "Framebuffer memory used (in MiB).", $(i++)) # field 252
    # ECC
    metric("ecc_sbe_volatile_total", "counter", "Total number of single-bit volatile ECC errors.", $(i++)) # field 310
    metric("ecc_dbe_volatile_total", "counter", "Total number of double-bit volatile ECC errors.", $(i++)) # field 311
    metric("ecc_sbe_aggregate_total", "counter", "Total number of single-bit persistent ECC errors.", $(i++)) # field 312
    metric("ecc_dbe_aggregate_total", "counter", "Total number of double-bit persistent ECC errors.", $(i++)) # field 313
    # Retired pages
    metric("retired_pages_sbe", "counter", "Total number of retired pages due to single-bit errors.", $(i++)) # field 390
    metric("retired_pages_dbe", "counter", "Total number of retired pages due to double-bit errors.", $(i++)) # field 391
    metric("retired_pages_pending", "counter", "Total number of pages pending retirement.", $(i++)) # field 392
    # NVLink
    metric("nvlink_flit_crc_error_count_total", "counter", "Total number of NVLink flow-control CRC errors.", $(i++)) # field 409
    metric("nvlink_data_crc_error_count_total", "counter", "Total number of NVLink data CRC errors.", $(i++)) # field 419
    metric("nvlink_replay_error_count_total", "counter", "Total number of NVLink retries.", $(i++)) # field 429
    metric("nvlink_recovery_error_count_total", "counter", "Total number of NVLink recovery errors.", $(i++)) # field 439
    #metric("nvlink_bandwidth_total", "counter", "Total number of NVLink bandwidth counters for all lanes.", $(i++)) # field 449 TODO
    # Flush output file and move it for atomicity
    if (gpu == ngpus - 1) {
        close(out".swp")
        system("mv "out".swp "out)
    }
}' &

wait $!

if [ "${HOST_ENGINE}" = "yes" ]; then
echo "Stopping NVIDIA host engine..."
nv-hostengine --term

if [ -f /run/nvhostengine.pid ]; then
pid=$(< /run/nvhostengine.pid)

kill -SIGTERM "${pid}"
for i in $(seq 1 100); do
kill -0 "${pid}" 2> /dev/null || break
sleep 0.1
done
if [ $i -eq 100 ]; then
echo "Could not stop NVIDIA host engine" >&2
kill -9 "${pid}" 2> /dev/null
exit 1
fi
rm -f /run/nvhostengine.pid
fi
fi

echo "Done"
exit 0
50 changes: 50 additions & 0 deletions exporters/prometheus-dcgm/docker-compose.yml
@@ -0,0 +1,50 @@
version: '2.3'

services:
  prometheus:
    build: prometheus
    ports:
      - 9090:9090
    volumes:
      - prometheus_data:/prometheus
    networks:
      - default

  node_exporter:
    image: prom/node-exporter
    command: --collector.textfile.directory=/run/prometheus
    pid: "host"
    volumes:
      - prometheus_textfiles:/run/prometheus:ro
    networks:
      - default

  grafana:
    image: grafana/grafana
    volumes:
      - grafana_data:/var/lib/grafana
    ports:
      - 3000:3000
    networks:
      - default

  dcgm_exporter:
    image: nvidia/dcgm-exporter:1.4.3
    runtime: nvidia
    volumes:
      - prometheus_textfiles:/run/prometheus
    networks:
      - default
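
# prometheus_textfiles is tmpfs-backed: the exporter writes metrics to memory
# and node-exporter's textfile collector reads them back without touching disk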

volumes:
  prometheus_textfiles:
    driver_opts:
      type: tmpfs
      device: tmpfs
  prometheus_data:
    driver: local
  grafana_data:
    driver: local

networks:
  default:
    driver: bridge
12 changes: 12 additions & 0 deletions exporters/prometheus-dcgm/prometheus-dcgm.service
@@ -0,0 +1,12 @@
[Unit]
Description=Prometheus DCGM exporter
Wants=dcgm.service
After=dcgm.service

[Service]
Type=simple
Restart=always
ExecStart=/usr/local/bin/dcgm-exporter

[Install]
WantedBy=multi-user.target
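
# To start the exporter at boot as well (a sketch): sudo systemctl enable --now prometheus-dcgm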
3 changes: 3 additions & 0 deletions exporters/prometheus-dcgm/prometheus/Dockerfile
@@ -0,0 +1,3 @@
FROM prom/prometheus

COPY prometheus.yml /etc/prometheus/
12 changes: 12 additions & 0 deletions exporters/prometheus-dcgm/prometheus/prometheus.yml
@@ -0,0 +1,12 @@
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'node_exporter'
    scrape_interval: 1s
    static_configs:
      - targets: ['node_exporter:9100']
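
# The 'node_exporter:9100' target resolves via the Compose service name on the default network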
