-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6c04a3d
commit 48701fa
Showing
8 changed files
with
338 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Install / uninstall the DCGM exporter script and its systemd unit.
# Both targets write to system locations, so they are expected to run as root.
# Neither target produces a file with its own name, hence .PHONY.
.PHONY: install uninstall

install:
	install -d -m 755 /run/prometheus
	# 755 (not 775): the executable should not be group-writable.
	install -m 755 dcgm-exporter/dcgm-exporter /usr/local/bin/
	install -m 644 prometheus-dcgm.service /etc/systemd/system/

uninstall:
	rm -rf /run/prometheus
	rm -f /usr/local/bin/dcgm-exporter
	rm -f /etc/systemd/system/prometheus-dcgm.service
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
# NVIDIA DCGM exporter for Prometheus | ||
|
||
Simple script to export metrics from [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) to [Prometheus](https://prometheus.io/). | ||
|
||
### DCGM supported GPUs | ||
|
||
Make sure all GPUs on the system are DCGM supported, otherwise the script will fail. | ||
``` | ||
$ docker run --runtime=nvidia --rm --name=nvidia-dcgm-exporter nvidia/dcgm-exporter | ||
# The output of dcgmi discovery and nvidia-smi should be the same. | ||
$ docker exec nvidia-dcgm-exporter dcgmi discovery -i a -v | grep -c 'GPU ID:' | ||
$ nvidia-smi -L | wc -l | ||
``` | ||
|
||
## Bare-metal install | ||
```sh | ||
# Download and install DCGM, then | ||
|
||
$ sudo make install | ||
$ sudo systemctl start prometheus-dcgm | ||
``` | ||
|
||
## Container install | ||
```sh | ||
$ docker-compose up | ||
``` | ||
|
||
## Deploy on a Kubernetes cluster | ||
```sh | ||
# First, set the default runtime to nvidia on your GPU node by editing /etc/docker/daemon.json. | ||
|
||
{ | ||
"default-runtime": "nvidia", | ||
"runtimes": { | ||
"nvidia": { | ||
"path": "/usr/bin/nvidia-container-runtime", | ||
"runtimeArgs": [] | ||
} | ||
} | ||
} | ||
|
||
$ sudo systemctl daemon-reload | ||
$ sudo systemctl restart docker | ||
$ sudo systemctl restart kubelet | ||
$ kubectl create -f dcgm-exporter-daemonset.yaml | ||
``` | ||
|
||
## node-exporter | ||
|
||
Add GPU metrics directly to node-exporter. | ||
```sh | ||
$ docker run -d --runtime=nvidia --rm --name=nvidia-dcgm-exporter nvidia/dcgm-exporter | ||
$ docker run -d --rm --net="host" --pid="host" --volumes-from nvidia-dcgm-exporter:ro quay.io/prometheus/node-exporter --collector.textfile.directory="/run/prometheus" | ||
|
||
$ curl localhost:9100/metrics | ||
|
||
# Sample output | ||
|
||
# HELP dcgm_gpu_temp GPU temperature (in C). | ||
# TYPE dcgm_gpu_temp gauge | ||
dcgm_gpu_temp{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 34 | ||
# HELP dcgm_gpu_utilization GPU utilization (in %). | ||
# TYPE dcgm_gpu_utilization gauge | ||
dcgm_gpu_utilization{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 0 | ||
# HELP dcgm_power_usage Power draw (in W). | ||
# TYPE dcgm_power_usage gauge | ||
dcgm_power_usage{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 31.737 | ||
# HELP dcgm_sm_clock SM clock frequency (in MHz). | ||
# TYPE dcgm_sm_clock gauge | ||
dcgm_sm_clock{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 135 | ||
# HELP dcgm_total_energy_consumption Total energy consumption since boot (in mJ). | ||
# TYPE dcgm_total_energy_consumption counter | ||
dcgm_total_energy_consumption{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 7.824041e+06 | ||
# HELP dcgm_xid_errors Value of the last XID error encountered. | ||
# TYPE dcgm_xid_errors gauge | ||
dcgm_xid_errors{gpu="0",uuid="GPU-8f640a3c-7e9a-608d-02a3-f4372d72b323"} 0 | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
FROM ubuntu:16.04

# Version of the DCGM package; it selects which .deb is taken from the build
# context, so a matching file must be present there.
ARG DCGM_VERSION=1.4.3

# Install DCGM from the local package.
# NOTE: the rm only hides the .deb from the final filesystem; the file still
# occupies space in the COPY layer above.
COPY datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb /tmp/
RUN dpkg -i /tmp/*.deb && rm -f /tmp/*

COPY dcgm-exporter /usr/local/bin/

# NOTE(review): presumably consumed by the NVIDIA container runtime
# (the README runs this image with --runtime=nvidia) - confirm.
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=utility

# Directory where the exporter writes its metrics file by default.
VOLUME /run/prometheus

# -e makes the exporter script start the NVIDIA host engine itself.
ENTRYPOINT [ "dcgm-exporter", "-e" ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
#! /bin/bash
#
# Exports DCGM metrics to a Prometheus text file.
#
# Options:
#   -h  print usage and exit
#   -e  start (and later stop) the NVIDIA host engine from this script
#   -o  path of the metrics file to write
#   -d  collection interval in milliseconds (integer, >= 100)

set -u

# Defaults, overridable from the command line.
HOST_ENGINE=no
COLLECT_INTERVAL_MS=1000
OUTPUT_FILE=/run/prometheus/dcgm.prom

# Print the command-line synopsis on stderr.
usage() {
    echo "usage: $0 [-h] [-e] [-o output_file] [-d collect_interval_ms (>=100)]" >&2
}

# Let getopt normalize the arguments; it fails on unknown options.
if ! options=$(getopt -o heo:d: -- "$@"); then
    usage && exit 1
fi
eval set -- "${options}"

while true; do
    case "$1" in
        -h) usage && exit 1;;
        -e) HOST_ENGINE=yes; shift;;
        -o) OUTPUT_FILE=$2; shift 2;;
        -d) COLLECT_INTERVAL_MS=$2; shift 2;;
        *)  break;;
    esac
done

# Reject non-numeric intervals explicitly: "[ abc -lt 100 ]" fails with an
# error status rather than "true", which would let a bad value slip through.
if ! [[ "${COLLECT_INTERVAL_MS}" =~ ^[0-9]+$ ]] || [ "${COLLECT_INTERVAL_MS}" -lt 100 ]; then
    usage && exit 1
fi
|
||
# Make sure the directory that receives the metrics file exists. Quoted so
# that an output path containing spaces or glob characters is handled intact.
mkdir -p "$(dirname "${OUTPUT_FILE}")"

# Handle termination signals with a message instead of the default action, so
# the script keeps running and reaches its shutdown code.
trap 'echo "Caught signal, terminating..."' HUP INT QUIT PIPE TERM

if [ "${HOST_ENGINE}" = "yes" ]; then
    echo "Starting NVIDIA host engine..."
    nv-hostengine 2> /dev/null
fi
|
||
echo "Collecting metrics at ${OUTPUT_FILE} every ${COLLECT_INTERVAL_MS}ms..."

# Stream the selected DCGM fields and convert every row into Prometheus text
# format. The pipeline runs in the background so the script can wait on it.
#
# The field-id fragments below are joined into ONE argument by the trailing
# backslashes; they must stay at column 0, since leading whitespace would
# split the list into several arguments.
dcgmi dmon -d "${COLLECT_INTERVAL_MS}" -e \
"54,"\
"100,101,"\
"140,150,"\
"155,156,"\
"200,201,202,"\
"203,204,206,207,"\
"230,240,241,242,243,244,245,246,"\
"251,252,"\
"310,311,312,313,"\
"390,391,392,"\
"409,419,429,439" | \
awk -v "out=${OUTPUT_FILE}" -v "ngpus=$(nvidia-smi -L | wc -l)" '
    BEGIN {
        # Metrics are written to a swap file which is renamed over the real
        # output once the row of the last GPU has been processed.
        swp = out ".swp"
    }

    # Emit one sample of dcgm_<name> for the current gpu/uuid (globals set by
    # the main rule). Values containing "N/A" are skipped. The HELP/TYPE
    # header is only written while processing GPU 0.
    function metric(name, type, help, value) {
        if (value !~ "N/A") {
            if (gpu == 0) {
                printf "# HELP dcgm_%s %s\n", name, help > swp
                printf "# TYPE dcgm_%s %s\n", name, type > swp
            }
            printf "dcgm_%s{gpu=\"%s\",uuid=\"%s\"} %s\n", name, gpu, uuid, value > swp
        }
    }

    # Data rows only: skip empty lines, the first two lines, and header lines
    # starting with "#" or "Id".
    (NF && NR > 2 && !($1 ~ "^#" || $1 ~ "^Id")) {
        # Labels
        i = 1
        gpu = $(i++)  # field 0 (implicit)
        uuid = $(i++) # field 54
        # Clocks
        metric("sm_clock", "gauge", "SM clock frequency (in MHz).", $(i++)) # field 100
        metric("memory_clock", "gauge", "Memory clock frequency (in MHz).", $(i++)) # field 101
        # Temperature
        metric("memory_temp", "gauge", "Memory temperature (in C).", $(i++)) # field 140
        metric("gpu_temp", "gauge", "GPU temperature (in C).", $(i++)) # field 150
        # Power
        metric("power_usage", "gauge", "Power draw (in W).", $(i++)) # field 155
        metric("total_energy_consumption", "counter", "Total energy consumption since boot (in mJ).", $(i++)) # field 156
        # PCIe
        metric("pcie_tx_throughput", "counter", "Total number of bytes transmitted through PCIe TX (in KB)", $(i++)) # field 200
        metric("pcie_rx_throughput", "counter", "Total number of bytes received through PCIe RX (in KB)", $(i++)) # field 201
        metric("pcie_replay_counter", "counter", "Total number of PCIe retries.", $(i++)) # field 202
        # Utilization (the sample period varies depending on the product)
        metric("gpu_utilization", "gauge", "GPU utilization (in %).", $(i++)) # field 203
        metric("mem_copy_utilization", "gauge", "Memory utilization (in %).", $(i++)) # field 204
        metric("enc_utilization", "gauge", "Encoder utilization (in %).", $(i++)) # field 206
        metric("dec_utilization", "gauge", "Decoder utilization (in %).", $(i++)) # field 207
        # Errors and violations
        metric("xid_errors", "gauge", "Value of the last XID error encountered.", $(i++)) # field 230
        metric("power_violation", "counter", "Throttling duration due to power constraints (in us).", $(i++)) # field 240
        metric("thermal_violation", "counter", "Throttling duration due to thermal constraints (in us).", $(i++)) # field 241
        metric("sync_boost_violation", "counter", "Throttling duration due to sync-boost constraints (in us).", $(i++)) # field 242
        metric("board_limit_violation", "counter", "Throttling duration due to board limit constraints (in us).", $(i++)) # field 243 FIXME
        metric("low_util_violation", "counter", "Throttling duration due to low utilization (in us).", $(i++)) # field 244
        metric("reliability_violation", "counter", "Throttling duration due to reliability constraints (in us).", $(i++)) # field 245 FIXME
        metric("app_clock_violation", "counter", "Total throttling duration (in us).", $(i++)) # field 246
        # Memory usage
        metric("fb_free", "gauge", "Framebuffer memory free (in MiB).", $(i++)) # field 251
        metric("fb_used", "gauge", "Framebuffer memory used (in MiB).", $(i++)) # field 252
        # ECC
        metric("ecc_sbe_volatile_total", "counter", "Total number of single-bit volatile ECC errors.", $(i++)) # field 310
        metric("ecc_dbe_volatile_total", "counter", "Total number of double-bit volatile ECC errors.", $(i++)) # field 311
        metric("ecc_sbe_aggregate_total", "counter", "Total number of single-bit persistent ECC errors.", $(i++)) # field 312
        metric("ecc_dbe_aggregate_total", "counter", "Total number of double-bit persistent ECC errors.", $(i++)) # field 313
        # Retired pages
        metric("retired_pages_sbe", "counter", "Total number of retired pages due to single-bit errors.", $(i++)) # field 390
        metric("retired_pages_dbe", "counter", "Total number of retired pages due to double-bit errors.", $(i++)) # field 391
        metric("retired_pages_pending", "counter", "Total number of pages pending retirement.", $(i++)) # field 392
        # NVLink
        metric("nvlink_flit_crc_error_count_total", "counter", "Total number of NVLink flow-control CRC errors.", $(i++)) # field 409
        metric("nvlink_data_crc_error_count_total", "counter", "Total number of NVLink data CRC errors.", $(i++)) # field 419
        metric("nvlink_replay_error_count_total", "counter", "Total number of NVLink retries.", $(i++)) # field 429
        metric("nvlink_recovery_error_count_total", "counter", "Total number of NVLink recovery errors.", $(i++)) # field 439
        #metric("nvlink_bandwidth_total", "counter", "Total number of NVLink bandwidth counters for all lanes", $(i++)) # field 449 TODO
        # Flush output file and move it for atomicity.
        # \047 is a single quote: both paths are quoted for the shell so that
        # spaces or metacharacters in the output path are passed through intact
        # (a path that itself contains a single quote is still unsupported).
        if (gpu == ngpus - 1) {
            close(swp)
            system("mv \047" swp "\047 \047" out "\047")
        }
    }' &
|
||
# Block until the background collection pipeline ends (or a trapped signal
# interrupts the wait).
wait $!

if [ "${HOST_ENGINE}" = "yes" ]; then
    echo "Stopping NVIDIA host engine..."
    nv-hostengine --term

    if [ -f /run/nvhostengine.pid ]; then
        pid=$(< /run/nvhostengine.pid)

        # Ask the engine to stop, then poll up to 100 times, 0.1s apart.
        kill -SIGTERM "${pid}"
        for attempt in $(seq 1 100); do
            kill -0 "${pid}" 2> /dev/null || break
            sleep 0.1
        done
        # Decide on the actual process state rather than on the loop counter:
        # a process that exits during the very last poll is not a failure.
        if kill -0 "${pid}" 2> /dev/null; then
            echo "Could not stop NVIDIA host engine" >&2
            kill -9 "${pid}" 2> /dev/null
            exit 1
        fi
        rm -f /run/nvhostengine.pid
    fi
fi

echo "Done"
exit 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# Monitoring stack: Prometheus + node_exporter + Grafana + NVIDIA DCGM exporter.
# dcgm_exporter writes metrics into the shared prometheus_textfiles volume;
# node_exporter reads that volume (read-only) through its textfile collector.
version: '2.3'

services:
  prometheus:
    build: prometheus
    ports:
      - "9090:9090"
    volumes:
      # Moved here from node_exporter, which has no use for it.
      # NOTE(review): assumes /prometheus is the data directory of the
      # prom/prometheus image - confirm.
      - prometheus_data:/prometheus
    networks:
      - default

  node_exporter:
    image: prom/node-exporter
    command: --collector.textfile.directory=/run/prometheus
    pid: "host"
    volumes:
      - prometheus_textfiles:/run/prometheus:ro
    networks:
      - default

  grafana:
    image: grafana/grafana
    volumes:
      - grafana_data:/var/lib/grafana
    ports:
      - "3000:3000"
    networks:
      - default

  dcgm_exporter:
    image: nvidia/dcgm-exporter:1.4.3
    runtime: nvidia
    volumes:
      - prometheus_textfiles:/run/prometheus
    networks:
      - default

volumes:
  # tmpfs-backed volume used to hand the metrics file between containers.
  prometheus_textfiles:
    driver_opts:
      type: tmpfs
      device: tmpfs
  prometheus_data:
    driver: local
  grafana_data:
    driver: local

networks:
  default:
    driver: bridge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# systemd unit running the DCGM exporter script as a long-lived service.
[Unit]
Description=Prometheus DCGM exporter
# Start after dcgm.service and pull it in as a weak dependency.
After=dcgm.service
Wants=dcgm.service

[Service]
Type=simple
ExecStart=/usr/local/bin/dcgm-exporter
# Restart the exporter whenever it exits.
Restart=always

[Install]
WantedBy=multi-user.target
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Tag of the upstream image. Defaults to the previously implicit "latest";
# pass --build-arg PROMETHEUS_VERSION=<tag> to get a reproducible build.
ARG PROMETHEUS_VERSION=latest
FROM prom/prometheus:${PROMETHEUS_VERSION}

# Scrape configuration shipped with this image.
COPY prometheus.yml /etc/prometheus/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Prometheus scrape configuration.
global:
  # Interval used by jobs that do not set their own.
  scrape_interval: 15s

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: "prometheus"
    scrape_interval: 5s
    static_configs:
      - targets:
          - "localhost:9090"

  # node_exporter, scraped every second.
  - job_name: "node_exporter"
    scrape_interval: 1s
    static_configs:
      - targets:
          - "node_exporter:9100"