Add DCGM exporter
3XX0 authored and guptaNswati committed Jun 1, 2018
1 parent 6c04a3d commit 48701fa
Showing 8 changed files with 338 additions and 0 deletions.
9 changes: 9 additions & 0 deletions exporters/prometheus-dcgm/Makefile
@@ -0,0 +1,9 @@
install:
	install -d -m 755 /run/prometheus
	install -m 775 dcgm-exporter/dcgm-exporter /usr/local/bin/
	install -m 644 prometheus-dcgm.service /etc/systemd/system/

uninstall:
	rm -rf /run/prometheus
	rm -f /usr/local/bin/dcgm-exporter
	rm -f /etc/systemd/system/prometheus-dcgm.service
79 changes: 79 additions & 0 deletions exporters/prometheus-dcgm/README.md
@@ -0,0 +1,79 @@
# NVIDIA DCGM exporter for Prometheus

Simple script to export metrics from [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) to [Prometheus](https://prometheus.io/).

## DCGM-supported GPUs

Make sure every GPU on the system is supported by DCGM; otherwise, the script will fail.
```sh
$ docker run -d --runtime=nvidia --rm --name=nvidia-dcgm-exporter nvidia/dcgm-exporter
# The GPU counts reported by dcgmi discovery and nvidia-smi should be the same.
$ docker exec nvidia-dcgm-exporter dcgmi discovery -i a -v | grep -c 'GPU ID:'
$ nvidia-smi -L | wc -l
```
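
To automate the comparison, a minimal sketch (assuming the `nvidia-dcgm-exporter` container above is left running and `nvidia-smi` is on the host's PATH):
```sh
dcgm_gpus=$(docker exec nvidia-dcgm-exporter dcgmi discovery -i a -v | grep -c 'GPU ID:')
smi_gpus=$(nvidia-smi -L | wc -l)
if [ "${dcgm_gpus}" -eq "${smi_gpus}" ]; then
    echo "OK: all ${smi_gpus} GPUs are visible to DCGM"
else
    echo "Mismatch: DCGM sees ${dcgm_gpus} GPUs, the driver sees ${smi_gpus}" >&2
fi
```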

## Bare-metal install
```sh
# Download and install DCGM first, then:

$ sudo make install
$ sudo systemctl start prometheus-dcgm
```
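
Once the unit is active, the exporter writes its metrics to `/run/prometheus/dcgm.prom` (the script's default output file); a quick sanity check:
```sh
$ systemctl status prometheus-dcgm --no-pager
$ head /run/prometheus/dcgm.prom
```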

## Container install
```sh
$ docker-compose up
```
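
The compose file also brings up Prometheus (published on port 9090) and Grafana (port 3000); a couple of hedged liveness checks once the stack is up (assuming a Prometheus 2.x image, which exposes `/-/healthy`):
```sh
$ curl -s localhost:9090/-/healthy
$ curl -sI localhost:3000
```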

## Deploy on a Kubernetes cluster
```sh
# First, set the default runtime to nvidia on your GPU node by editing /etc/docker/daemon.json:

{
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "path": "/usr/bin/nvidia-container-runtime",
            "runtimeArgs": []
        }
    }
}

$ sudo systemctl daemon-reload
$ sudo systemctl restart docker
$ sudo systemctl restart kubelet
$ kubectl create -f dcgm-exporter-daemonset.yaml
```
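
Once the DaemonSet is created, one exporter pod should be scheduled per GPU node; a hedged spot check (assuming the DaemonSet's pods carry `dcgm-exporter` in their name):
```sh
$ kubectl get pods -o wide | grep dcgm-exporter
```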

## node-exporter

Merge the GPU metrics directly into node-exporter's output via its textfile collector.
```sh
$ docker run -d --runtime=nvidia --rm --name=nvidia-dcgm-exporter nvidia/dcgm-exporter
$ docker run -d --rm --net="host" --pid="host" --volumes-from nvidia-dcgm-exporter:ro quay.io/prometheus/node-exporter --collector.textfile.directory="/run/prometheus"

$ curl localhost:9100/metrics

# Sample output

# HELP dcgm_gpu_temp GPU temperature (in C).
# TYPE dcgm_gpu_temp gauge
dcgm_gpu_temp{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 34
# HELP dcgm_gpu_utilization GPU utilization (in %).
# TYPE dcgm_gpu_utilization gauge
dcgm_gpu_utilization{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 0
# HELP dcgm_power_usage Power draw (in W).
# TYPE dcgm_power_usage gauge
dcgm_power_usage{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 31.737
# HELP dcgm_sm_clock SM clock frequency (in MHz).
# TYPE dcgm_sm_clock gauge
dcgm_sm_clock{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 135
# HELP dcgm_total_energy_consumption Total energy consumption since boot (in mJ).
# TYPE dcgm_total_energy_consumption counter
dcgm_total_energy_consumption{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 7.824041e+06
# HELP dcgm_xid_errors Value of the last XID error encountered.
# TYPE dcgm_xid_errors gauge
dcgm_xid_errors{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 0
```
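
If a Prometheus server is scraping this node-exporter (for instance the one from the container install above, assumed here to listen on localhost:9090), the same series can be queried through its HTTP API:
```sh
$ curl -s 'localhost:9090/api/v1/query?query=dcgm_gpu_temp'
```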
15 changes: 15 additions & 0 deletions exporters/prometheus-dcgm/dcgm-exporter/Dockerfile
@@ -0,0 +1,15 @@
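# Hedged build-and-run sketch, from this directory, after downloading
# datacenter-gpu-manager_1.4.3_amd64.deb into the build context:
#   docker build --build-arg DCGM_VERSION=1.4.3 -t nvidia/dcgm-exporter:1.4.3 .
#   docker run -d --runtime=nvidia --rm -v /run/prometheus:/run/prometheus nvidia/dcgm-exporter:1.4.3
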
FROM ubuntu:16.04

ARG DCGM_VERSION=1.4.3

COPY datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb /tmp
RUN dpkg -i /tmp/*.deb && rm -f /tmp/*

COPY dcgm-exporter /usr/local/bin

ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES utility

VOLUME /run/prometheus

ENTRYPOINT [ "dcgm-exporter", "-e" ]
158 changes: 158 additions & 0 deletions exporters/prometheus-dcgm/dcgm-exporter/dcgm-exporter
@@ -0,0 +1,158 @@
#! /bin/bash

set -u

HOST_ENGINE=no
COLLECT_INTERVAL_MS=1000
OUTPUT_FILE=/run/prometheus/dcgm.prom

usage() {
    echo "usage: $0 [-h] [-e] [-o output_file] [-d collect_interval_ms (>=100)]" >&2
}
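
# Hedged usage examples:
#   dcgm-exporter -e                         # start an embedded host engine, default output file
#   dcgm-exporter -o /tmp/dcgm.prom -d 5000  # existing host engine, custom output, 5s interval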

options=$(getopt -o heo:d: -- "$@")
if [ $? -ne 0 ]; then
    usage && exit 1
fi
eval set -- "${options}"

while true; do
    case "$1" in
        -h) usage && exit 1;;
        -e) HOST_ENGINE=yes; shift;;
        -o) OUTPUT_FILE=$2; shift 2;;
        -d) COLLECT_INTERVAL_MS=$2; shift 2;;
        *) break;;
    esac
done

if [ "${COLLECT_INTERVAL_MS}" -lt 100 ]; then
usage && exit 1
fi

mkdir -p "$(dirname "${OUTPUT_FILE}")"
trap 'echo "Caught signal, terminating..."' HUP INT QUIT PIPE TERM

if [ "${HOST_ENGINE}" = "yes" ]; then
echo "Starting NVIDIA host engine..."
nv-hostengine 2> /dev/null
fi

echo "Collecting metrics at ${OUTPUT_FILE} every ${COLLECT_INTERVAL_MS}ms..."

dcgmi dmon -d "${COLLECT_INTERVAL_MS}" -e \
"54,"\
"100,101,"\
"140,150,"\
"155,156,"\
"200,201,202,"\
"203,204,206,207,"\
"230,240,241,242,243,244,245,246,"\
"251,252,"\
"310,311,312,313,"\
"390,391,392,"\
"409,419,429,439" | \
awk -v "out=${OUTPUT_FILE}" -v "ngpus=$(nvidia-smi -L | wc -l)" '
function metric(name, type, help, value) {
    if (value !~ "N/A") {
        if (gpu == 0) {
            printf "# HELP dcgm_%s %s\n", name, help > out".swp"
            printf "# TYPE dcgm_%s %s\n", name, type > out".swp"
        }
        printf "dcgm_%s{gpu=\"%s\",uuid=\"%s\"} %s\n", name, gpu, uuid, value > out".swp"
    }
}
(NF && NR > 2 && !($1 ~ "^#" || $1 ~ "^Id")) {
    # Labels
    i = 1
    gpu = $(i++)  # field 0 (implicit)
    uuid = $(i++) # field 54
    # Clocks
    metric("sm_clock", "gauge", "SM clock frequency (in MHz).", $(i++)) # field 100
    metric("memory_clock", "gauge", "Memory clock frequency (in MHz).", $(i++)) # field 101
    # Temperature
    metric("memory_temp", "gauge", "Memory temperature (in C).", $(i++)) # field 140
    metric("gpu_temp", "gauge", "GPU temperature (in C).", $(i++)) # field 150
    # Power
    metric("power_usage", "gauge", "Power draw (in W).", $(i++)) # field 155
    metric("total_energy_consumption", "counter", "Total energy consumption since boot (in mJ).", $(i++)) # field 156
    # PCIe
    metric("pcie_tx_throughput", "counter", "Total number of bytes transmitted through PCIe TX (in KB).", $(i++)) # field 200
    metric("pcie_rx_throughput", "counter", "Total number of bytes received through PCIe RX (in KB).", $(i++)) # field 201
    metric("pcie_replay_counter", "counter", "Total number of PCIe retries.", $(i++)) # field 202
    # Utilization (the sample period varies depending on the product)
    metric("gpu_utilization", "gauge", "GPU utilization (in %).", $(i++)) # field 203
    metric("mem_copy_utilization", "gauge", "Memory utilization (in %).", $(i++)) # field 204
    metric("enc_utilization", "gauge", "Encoder utilization (in %).", $(i++)) # field 206
    metric("dec_utilization", "gauge", "Decoder utilization (in %).", $(i++)) # field 207
    # Errors and violations
    metric("xid_errors", "gauge", "Value of the last XID error encountered.", $(i++)) # field 230
    metric("power_violation", "counter", "Throttling duration due to power constraints (in us).", $(i++)) # field 240
    metric("thermal_violation", "counter", "Throttling duration due to thermal constraints (in us).", $(i++)) # field 241
    metric("sync_boost_violation", "counter", "Throttling duration due to sync-boost constraints (in us).", $(i++)) # field 242
    metric("board_limit_violation", "counter", "Throttling duration due to board limit constraints (in us).", $(i++)) # field 243 FIXME
    metric("low_util_violation", "counter", "Throttling duration due to low utilization (in us).", $(i++)) # field 244
    metric("reliability_violation", "counter", "Throttling duration due to reliability constraints (in us).", $(i++)) # field 245 FIXME
    metric("app_clock_violation", "counter", "Total throttling duration (in us).", $(i++)) # field 246
    # Memory usage
    metric("fb_free", "gauge", "Framebuffer memory free (in MiB).", $(i++)) # field 251
    metric("fb_used", "gauge", "Framebuffer memory used (in MiB).", $(i++)) # field 252
    # ECC
    metric("ecc_sbe_volatile_total", "counter", "Total number of single-bit volatile ECC errors.", $(i++)) # field 310
    metric("ecc_dbe_volatile_total", "counter", "Total number of double-bit volatile ECC errors.", $(i++)) # field 311
    metric("ecc_sbe_aggregate_total", "counter", "Total number of single-bit persistent ECC errors.", $(i++)) # field 312
    metric("ecc_dbe_aggregate_total", "counter", "Total number of double-bit persistent ECC errors.", $(i++)) # field 313
    # Retired pages
    metric("retired_pages_sbe", "counter", "Total number of retired pages due to single-bit errors.", $(i++)) # field 390
    metric("retired_pages_dbe", "counter", "Total number of retired pages due to double-bit errors.", $(i++)) # field 391
    metric("retired_pages_pending", "counter", "Total number of pages pending retirement.", $(i++)) # field 392
    # NVLink
    metric("nvlink_flit_crc_error_count_total", "counter", "Total number of NVLink flow-control CRC errors.", $(i++)) # field 409
    metric("nvlink_data_crc_error_count_total", "counter", "Total number of NVLink data CRC errors.", $(i++)) # field 419
    metric("nvlink_replay_error_count_total", "counter", "Total number of NVLink retries.", $(i++)) # field 429
    metric("nvlink_recovery_error_count_total", "counter", "Total number of NVLink recovery errors.", $(i++)) # field 439
    #metric("nvlink_bandwidth_total", "counter", "Total number of NVLink bandwidth counters for all lanes.", $(i++)) # field 449 TODO
    # Flush output file and move it for atomicity
    if (gpu == ngpus - 1) {
        close(out".swp")
        system("mv "out".swp "out)
    }
}' &

wait $!

if [ "${HOST_ENGINE}" = "yes" ]; then
echo "Stopping NVIDIA host engine..."
nv-hostengine --term

if [ -f /run/nvhostengine.pid ]; then
pid=$(< /run/nvhostengine.pid)

kill -SIGTERM "${pid}"
for i in $(seq 1 100); do
kill -0 "${pid}" 2> /dev/null || break
sleep 0.1
done
if [ $i -eq 100 ]; then
echo "Could not stop NVIDIA host engine" >&2
kill -9 "${pid}" 2> /dev/null
exit 1
fi
rm -f /run/nvhostengine.pid
fi
fi

echo "Done"
exit 0
50 changes: 50 additions & 0 deletions exporters/prometheus-dcgm/docker-compose.yml
@@ -0,0 +1,50 @@
version: '2.3'

services:
  prometheus:
    build: prometheus
    ports:
      - 9090:9090
    volumes:
      - prometheus_data:/prometheus
    networks:
      - default

  node_exporter:
    image: prom/node-exporter
    command: --collector.textfile.directory=/run/prometheus
    pid: "host"
    volumes:
      - prometheus_textfiles:/run/prometheus:ro
    networks:
      - default

  grafana:
    image: grafana/grafana
    volumes:
      - grafana_data:/var/lib/grafana
    ports:
      - 3000:3000
    networks:
      - default

  dcgm_exporter:
    image: nvidia/dcgm-exporter:1.4.3
    runtime: nvidia
    volumes:
      - prometheus_textfiles:/run/prometheus
    networks:
      - default
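
# prometheus_textfiles is tmpfs-backed: the exporter writes metrics to memory
# and node-exporter's textfile collector reads them back without touching disk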

volumes:
  prometheus_textfiles:
    driver_opts:
      type: tmpfs
      device: tmpfs
  prometheus_data:
    driver: local
  grafana_data:
    driver: local

networks:
  default:
    driver: bridge
12 changes: 12 additions & 0 deletions exporters/prometheus-dcgm/prometheus-dcgm.service
@@ -0,0 +1,12 @@
[Unit]
Description=Prometheus DCGM exporter
Wants=dcgm.service
After=dcgm.service

[Service]
Type=simple
Restart=always
ExecStart=/usr/local/bin/dcgm-exporter

[Install]
WantedBy=multi-user.target
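
# To start the exporter at boot as well (a sketch): sudo systemctl enable --now prometheus-dcgm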
3 changes: 3 additions & 0 deletions exporters/prometheus-dcgm/prometheus/Dockerfile
@@ -0,0 +1,3 @@
FROM prom/prometheus

COPY prometheus.yml /etc/prometheus/
12 changes: 12 additions & 0 deletions exporters/prometheus-dcgm/prometheus/prometheus.yml
@@ -0,0 +1,12 @@
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'node_exporter'
    scrape_interval: 1s
    static_configs:
      - targets: ['node_exporter:9100']
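
# The 'node_exporter:9100' target resolves via the Compose service name on the default network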
