From 4d93accfaedf6acdd778ba818c464e21ac48eef1 Mon Sep 17 00:00:00 2001 From: Pramod Ramarao Date: Sat, 17 Oct 2020 15:46:09 -0700 Subject: [PATCH] Move exporter to stable release 2.1.0 Includes the following changes: 1. Update DCGM version to 2.0.13 (includes MIG metrics) 2. Added error handling for setcap in the entrypoint script - this fixes an issue where the setcap command fails in OpenShift, in turn causing a CrashLoopBackoff error for the dcgm-exporter pod 3. Renamed entrypoint script 4. Fixed typo in the DCGM download URL for UBI8 5. Updated Helm chart versioning and metadata 6. Updated documentation --- Makefile | 4 +- README.md | 62 ++++++++++++++++++---------- dcgm-exporter.yaml | 12 +++--- deployment/dcgm-exporter/Chart.yaml | 17 ++++++-- deployment/dcgm-exporter/values.yaml | 2 +- docker/Dockerfile.ubi8 | 10 ++--- docker/Dockerfile.ubuntu18.04 | 6 +-- docker/Dockerfile.ubuntu20.04 | 6 +-- docker/dcgm-exporter-entrypoint.sh | 21 ++++++++++ docker/docker-entrypoint.sh | 17 -------- service-monitor.yaml | 4 +- 11 files changed, 96 insertions(+), 65 deletions(-) create mode 100644 docker/dcgm-exporter-entrypoint.sh delete mode 100644 docker/docker-entrypoint.sh diff --git a/Makefile b/Makefile index 77016c5..9b2dc88 100644 --- a/Makefile +++ b/Makefile @@ -16,9 +16,9 @@ DOCKER ?= docker MKDIR ?= mkdir REGISTRY ?= nvidia -DCGM_VERSION := 2.0.10 +DCGM_VERSION := 2.0.13 GOLANG_VERSION := 1.14.2 -VERSION := 2.1.0-rc.2 +VERSION := 2.1.0 FULL_VERSION := $(DCGM_VERSION)-$(VERSION) .PHONY: all binary install check-format diff --git a/README.md b/README.md index 0a83eec..ed5ffd4 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,24 @@ # NVIDIA GPU Monitoring Tools +This repository contains Golang bindings and DCGM-Exporter for gathering GPU telemetry in Kubernetes. 
+ ## Bindings -This Github repository contains Golang bindings for the following two libraries: +Golang bindings are provided for the following two libraries: - [NVIDIA Management Library (NVML)](https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html#nvml-api-reference) is a C-based API for monitoring and managing NVIDIA GPU devices. - [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/dcgm) is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It's a low overhead tool suite that performs a variety of functions on each host system including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting. You will also find samples for both of these bindings in this repository. -## DCGM exporter - -This Github repository also contains the DCGM exporter software. It exposes GPU metrics exporter for [Prometheus](https://prometheus.io/) leveraging [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/dcgm). +## DCGM-Exporter -Find the installation and run instructions [here](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/exporters/prometheus-dcgm/README.md). +The repository also contains DCGM-Exporter. It exposes GPU metrics for [Prometheus](https://prometheus.io/) leveraging [NVIDIA DCGM](https://developer.nvidia.com/dcgm). ### Quickstart -To gather metrics on a GPU node, simply start the dcgm-exporter container: +To gather metrics on a GPU node, simply start the `dcgm-exporter` container: ``` -$ docker run -d --gpus all --rm -p 9400:9400 nvidia/dcgm-exporter:latest +$ docker run -d --gpus all --rm -p 9400:9400 nvidia/dcgm-exporter:2.0.13-2.1.0-ubuntu18.04 $ curl localhost:9400/metrics # HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge @@ -35,15 +35,34 @@ DCGM_FI_DEV_MEMORY_TEMP{gpu="0", UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52" ### Quickstart on Kubernetes -Note: Consider using the [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) rather than the DCGM exporter directly. +Note: Consider using the [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) rather than DCGM-Exporter directly. Ensure you have already setup your cluster with the [default runtime as NVIDIA](https://github.com/NVIDIA/nvidia-container-runtime#docker-engine-setup). -To gather metrics on your GPU nodes you can deploy the daemonset: + +The recommended way to install DCGM-Exporter is to use the Helm chart: +``` +$ helm repo add gpu-helm-charts \ + https://nvidia.github.io/gpu-monitoring-tools/helm-charts +``` +Update the repo: +``` +$ helm repo update +``` +And install the chart: ``` -$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.12/dcgm-exporter.yaml +$ helm install \ + --generate-name \ + gpu-helm-charts/dcgm-exporter +``` + +Once the `dcgm-exporter` pod is deployed, you can use port forwarding to obtain metrics quickly: + + +``` +$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/master/dcgm-exporter.yaml # Let's get the output of a random pod: -$ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter, app.kubernetes.io/version=2.0.0-rc.12" \ +$ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter" \ -o "jsonpath={ .items[0].metadata.name}") $ kubectl port-forward $NAME 8080:9400 & @@ -61,16 +80,15 @@ DCGM_FI_DEV_MEMORY_TEMP{gpu="0", UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52" ... ``` -To integrate `dcgm-exporter` with Prometheus and Grafana, see the full instructions in the [user guide](https://docs.nvidia.com/datacenter/cloud-native/kubernetes/dcgme2e.html#gpu-telemetry). 
+To integrate DCGM-Exporter with Prometheus and Grafana, see the full instructions in the [user guide](https://docs.nvidia.com/datacenter/cloud-native/kubernetes/dcgme2e.html#gpu-telemetry). `dcgm-exporter` is deployed as part of the GPU Operator. To get started with integrating with Prometheus, check the Operator [user guide](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html#gpu-telemetry). -### Building From source and Running on Bare Metal +### Building from Source -The dcgm-exporter is actually fairly straightforward to build and use. +`dcgm-exporter` is actually fairly straightforward to build and use. Ensure you have the following: - [Golang >= 1.14 installed](https://golang.org/) - [DCGM installed](https://developer.nvidia.com/dcgm) -- On DGX, the NVIDIA Fabric Manager up and running ``` $ git clone https://github.com/NVIDIA/gpu-monitoring-tools.git @@ -93,11 +111,11 @@ DCGM_FI_DEV_MEMORY_TEMP{gpu="0", UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52" ... ``` +### Changing Metrics -### Changing the Metrics - -With dcgm-exporter 2.0 you can configure which fields are collected by specifying a custom CSV file. -You will find the [default CSV file here](https://github.com/NVIDIA/gpu-monitoring-tools/blob/2.0.0-rc.12/etc/dcgm-exporter/default-counters.csv) and on your system or container at /etc/dcgm-exporter/default-counters.csv +With `dcgm-exporter` you can configure which fields are collected by specifying a custom CSV file. 
+You will find the default CSV file under `etc/dcgm-exporter/default-counters.csv` in the repository, which is copied to your system or container at +`/etc/dcgm-exporter/default-counters.csv` The format of this file is pretty straightforward: ``` @@ -117,13 +135,13 @@ $ dcgm-exporter -f /tmp/custom-collectors.csv Notes: - Always make sure your entries have 3 commas (',') -- The complete list of counters that can be collected can be found on the DCGM API reference website: https://docs.nvidia.com/datacenter/dcgm/1.7/dcgm-api/group__dcgmFieldIdentifiers.html +- The complete list of counters that can be collected can be found in the DCGM API reference manual: https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/group__dcgmFieldIdentifiers.html ### What about a Grafana Dashboard? -You can find the official NVIDIA dcgm-exporter dashboard here: https://grafana.com/grafana/dashboards/12239 +You can find the official NVIDIA DCGM-Exporter dashboard here: https://grafana.com/grafana/dashboards/12239 -You will also find the json file on this repo: https://github.com/NVIDIA/gpu-monitoring-tools/blob/2.0.0-rc.12/grafana/dcgm-exporter-dashboard.json +You will also find the `json` file in this repo under `grafana/dcgm-exporter-dashboard.json` Pull requests are accepted!
diff --git a/dcgm-exporter.yaml b/dcgm-exporter.yaml index 64ff25f..206a62e 100644 --- a/dcgm-exporter.yaml +++ b/dcgm-exporter.yaml @@ -18,23 +18,23 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.1.0-rc.2" + app.kubernetes.io/version: "2.1.0" spec: updateStrategy: type: RollingUpdate selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.1.0-rc.2" + app.kubernetes.io/version: "2.1.0" template: metadata: labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.1.0-rc.2" + app.kubernetes.io/version: "2.1.0" name: "dcgm-exporter" spec: containers: - - image: "nvidia/dcgm-exporter:2.0.10-2.1.0-rc.2-ubuntu18.04" + - image: "nvidia/dcgm-exporter:2.0.13-2.1.0-ubuntu18.04" env: - name: "DCGM_EXPORTER_LISTEN" value: ":9400" @@ -64,11 +64,11 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.1.0-rc.2" + app.kubernetes.io/version: "2.1.0" spec: selector: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.1.0-rc.2" + app.kubernetes.io/version: "2.1.0" ports: - name: "metrics" port: 9400 diff --git a/deployment/dcgm-exporter/Chart.yaml b/deployment/dcgm-exporter/Chart.yaml index 423d9cb..58458bb 100644 --- a/deployment/dcgm-exporter/Chart.yaml +++ b/deployment/dcgm-exporter/Chart.yaml @@ -1,8 +1,17 @@ -apiVersion: v1 +apiVersion: v2 name: dcgm-exporter description: A Helm chart for DCGM exporter +version: "2.1.0" +kubeVersion: ">= 1.13.0" +appVersion: "2.1.0" sources: - https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools -version: "1.1.0" -appVersion: "2.0.10" -kubeVersion: ">= 1.13.0" +home: https://github.com/nvidia/gpu-monitoring-tools/ +icon: https://assets.nvidiagrid.net/ngc/logos/DCGM.png +keywords: + - gpu + - cuda + - compute + - monitoring + - telemetry + - tesla diff --git a/deployment/dcgm-exporter/values.yaml 
b/deployment/dcgm-exporter/values.yaml
index 8b999c7..a898ed9 100644
--- a/deployment/dcgm-exporter/values.yaml
+++ b/deployment/dcgm-exporter/values.yaml
@@ -17,7 +17,7 @@ image:
   pullPolicy: IfNotPresent
   # Image tag defaults to AppVersion, but you can use the tag key
   # for the image tag, e.g:
-  tag: 2.0.10-2.1.0-rc.2-ubuntu18.04
+  tag: 2.0.13-2.1.0-ubuntu18.04
 
 # Comment the following line to stop profiling metrics from DCGM
 arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"]
diff --git a/docker/Dockerfile.ubi8 b/docker/Dockerfile.ubi8
index 9adf2da..5f42595 100644
--- a/docker/Dockerfile.ubi8
+++ b/docker/Dockerfile.ubi8
@@ -11,7 +11,7 @@ LABEL io.k8s.display-name="NVIDIA DCGM Exporter"
 
 ARG DCGM_VERSION
 RUN yum install -y wget libgomp && \
-    wget https://developer.download.nvidia.com/compute/redist/dcgm/${DCGM_VERSION}/RPMS/x86_64/datacenter-gpu-manager-${DCGM_VERSION}-1.x86_64.rpm && \
+    wget https://developer.download.nvidia.com/compute/redist/dcgm/${DCGM_VERSION}/RPMS/x86_64/datacenter-gpu-manager-${DCGM_VERSION}-1-x86_64.rpm && \
     rpm -i ./datacenter-gpu-manager-*.rpm && \
     rm ./datacenter-gpu-manager-*.rpm
 
@@ -29,13 +29,13 @@ LABEL name="NVIDIA DCGM Exporter"
 LABEL vendor="NVIDIA"
 LABEL version="${VERSION}"
 LABEL release="N/A"
-LABEL summary="Exports GPU Metrics in the prometheus format"
+LABEL summary="Exports GPU Metrics to Prometheus"
 LABEL description="See summary"
 
 COPY ./LICENSE ./licenses/LICENSE
 
 ENV NO_SETCAP=
-COPY docker/docker-entrypoint.sh /usr/local/dcgm/docker-entrypoint.sh
-RUN chmod +x /usr/local/dcgm/docker-entrypoint.sh
+COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh
+RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh
 
-ENTRYPOINT ["/usr/local/dcgm/docker-entrypoint.sh"]
+ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"]
diff --git a/docker/Dockerfile.ubuntu18.04 b/docker/Dockerfile.ubuntu18.04
index 4c61bd2..993715d 100644
--- a/docker/Dockerfile.ubuntu18.04
+++ b/docker/Dockerfile.ubuntu18.04
@@ -28,7 +28,7 @@ ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
 ENV NVIDIA_VISIBLE_DEVICES=all
 
 ENV NO_SETCAP=
-COPY docker/docker-entrypoint.sh /usr/local/dcgm/docker-entrypoint.sh
-RUN chmod +x /usr/local/dcgm/docker-entrypoint.sh
+COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh
+RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh
 
-ENTRYPOINT ["/usr/local/dcgm/docker-entrypoint.sh"]
+ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"]
diff --git a/docker/Dockerfile.ubuntu20.04 b/docker/Dockerfile.ubuntu20.04
index d30736d..e806cdb 100644
--- a/docker/Dockerfile.ubuntu20.04
+++ b/docker/Dockerfile.ubuntu20.04
@@ -28,7 +28,7 @@ ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
 ENV NVIDIA_VISIBLE_DEVICES=all
 
 ENV NO_SETCAP=
-COPY docker/docker-entrypoint.sh /usr/local/dcgm/docker-entrypoint.sh
-RUN chmod +x /usr/local/dcgm/docker-entrypoint.sh
+COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh
+RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh
 
-ENTRYPOINT ["/usr/local/dcgm/docker-entrypoint.sh"]
+ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"]
diff --git a/docker/dcgm-exporter-entrypoint.sh b/docker/dcgm-exporter-entrypoint.sh
new file mode 100644
index 0000000..42d7802
--- /dev/null
+++ b/docker/dcgm-exporter-entrypoint.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# We want to setcap only when the container is started with the right caps
+DCGM_EXPORTER=$(readlink -f "$(command -v dcgm-exporter)")
+if [ -z "${NO_SETCAP:-}" ]; then
+  # Test setcap directly: under `set -e` a bare failure would exit before `$?` is checked
+  if setcap 'cap_sys_admin=+ep' "$DCGM_EXPORTER"; then
+    if ! "$DCGM_EXPORTER" -v 1>/dev/null 2>/dev/null; then
+      >&2 echo "Warning #2: dcgm-exporter doesn't have sufficient privileges to expose profiling metrics. To get profiling metrics with dcgm-exporter, use --cap-add SYS_ADMIN"
+      setcap 'cap_sys_admin=-ep' "$DCGM_EXPORTER"
+    fi
+  else
+    >&2 echo "Warning #1: dcgm-exporter doesn't have sufficient privileges to expose profiling metrics. To get profiling metrics with dcgm-exporter, use --cap-add SYS_ADMIN"
+  fi
+
+fi
+
+# Pass the command line arguments to dcgm-exporter
+set -- "$DCGM_EXPORTER" "$@"
+exec "$@"
diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh
deleted file mode 100644
index d6c8ea6..0000000
--- a/docker/docker-entrypoint.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# We want to setcap only when the container is started with the right permissions
-DCGM_EXPORTER=$(readlink -f $(which dcgm-exporter))
-if [ -z "$NO_SETCAP" ]; then
-   setcap 'cap_sys_admin=+ep' $DCGM_EXPORTER
-
-   if ! $DCGM_EXPORTER -v 1>/dev/null 2>/dev/null; then
-      >&2 echo "dcgm-exporter doesn't have sufficient privileges to expose profiling metrics. To use dcgm-exporter for profiling metrics use --cap-add SYS_ADMIN"
-      setcap 'cap_sys_admin=-ep' $DCGM_EXPORTER
-   fi
-fi
-
-# Pass the command line arguments to dcgm-exporter
-set -- $DCGM_EXPORTER "$@"
-exec "$@"
diff --git a/service-monitor.yaml b/service-monitor.yaml
index 9477b98..9bb52bf 100644
--- a/service-monitor.yaml
+++ b/service-monitor.yaml
@@ -18,12 +18,12 @@ metadata:
   name: "dcgm-exporter"
   labels:
     app.kubernetes.io/name: "dcgm-exporter"
-    app.kubernetes.io/version: "2.1.0-rc.2"
+    app.kubernetes.io/version: "2.1.0"
 spec:
   selector:
     matchLabels:
       app.kubernetes.io/name: "dcgm-exporter"
-      app.kubernetes.io/version: "2.1.0-rc.2"
+      app.kubernetes.io/version: "2.1.0"
   endpoints:
   - port: "metrics"
     path: "/metrics"