diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3c3b4dc..98880b9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -4,8 +4,7 @@ services:
   - docker:dind
 
 variables:
   DCGM_VERSION: "1.7.1"
-  METRICS_EXPORTER_VERSION: "1.0.0-alpha.0"
 
 stages:
   - build
@@ -14,40 +13,23 @@ stages:
 dcgm:exporter:
   stage: build
   script:
-    - apk add make bash
+    - apk add make
     - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}"
-
-    - make -C dcgm-exporter REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" DCGM="${DCGM_VERSION}" build
-    - make -C dcgm-exporter REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" push
-
-gpu:metrics:exporter:
-  stage: build
-  script:
-    - apk add make bash
-    - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}"
-
-    - make -C pod-gpu-metrics-exporter REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" build
-    - make -C pod-gpu-metrics-exporter REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" push
+    - make REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" DCGM="${DCGM_VERSION}" build
+    - make REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" push
+    - make REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" push-short
+    - make REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" push-latest
 
-dcgm:exporter:
+dcgm:exporter:release:
   stage: release
   only:
     - tags
   script:
-    - apk add make bash
+    - apk add make
     - docker login -u "${REGISTRY_USER}" -p "${REGISTRY_TOKEN}"
-
-    - make -C dcgm-exporter REGISTRY="nvidia" TAG="${DCGM_VERSION}" DCGM="${DCGM_VERSION}" build
-    - make -C dcgm-exporter REGISTRY="nvidia" TAG="${DCGM_VERSION}" push
-
-gpu:metrics:exporter:
-  stage: release
-  only:
-    - tags
-  script:
-    - apk add make bash
-    - docker login -u "${REGISTRY_USER}" -p "${REGISTRY_TOKEN}"
-
-    - make -C pod-gpu-metrics-exporter REGISTRY="nvidia" TAG="${METRICS_EXPORTER_VERSION}" build
-    - make -C pod-gpu-metrics-exporter REGISTRY="nvidia" TAG="${METRICS_EXPORTER_VERSION}" push
-
+    - make REGISTRY="nvidia" DCGM="${DCGM_VERSION}" build
+    - make REGISTRY="nvidia" push
+    - make REGISTRY="nvidia" push-short
+    - make REGISTRY="nvidia" push-latest
diff --git a/Makefile b/Makefile
index 88c7b7b..f627e71 100644
--- a/Makefile
+++ b/Makefile
@@ -17,14 +17,29 @@ MKDIR    ?= mkdir
 REGISTRY ?= nvidia/toolkit
 
 GOLANG_VERSION := 1.14.2
-VERSION := 2.0.0
+VERSION := 2.0.0-rc.0
 
-.PHONY: all
+.PHONY: all binary install push push-short push-latest
 all: ubuntu18.04 ubi8
 
 binary:
 	go build -o dcgm-exporter github.com/NVIDIA/gpu-monitoring-tools/pkg
 
+install: binary
+	install -m 755 dcgm-exporter /usr/bin/dcgm-exporter
+
+push:
+	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(VERSION)-ubuntu18.04"
+	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(VERSION)-ubi8"
+
+push-short:
+	$(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:$(VERSION)"
+	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(VERSION)"
+
+push-latest:
+	$(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:latest"
+	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:latest"
+
 ubuntu18.04:
 	$(DOCKER) build --pull \
 		--build-arg GOLANG_VERSION="$(GOLANG_VERSION)" \
diff --git a/README.md b/README.md
index e46a424..f5ca850 100644
--- a/README.md
+++ b/README.md
@@ -1,34 +1,69 @@
 # NVIDIA GPU Monitoring Tools
 
-## NVML Go Bindings
+## Bindings
 
-[NVIDIA Management Library (NVML)](https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html#nvml-api-reference) is a C-based API for monitoring and managing NVIDIA GPU devices.
-NVML go bindings are taken from [nvidia-docker 1.0](https://github.com/NVIDIA/nvidia-docker/tree/1.0) with some improvements and additions. NVML headers are also added to the package to make it easy to use and build.
-
-### NVML Samples
-Three [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/nvml/README.md) are included to demonstrate how to use the NVML API.
-
-
-## DCGM Go Bindings
-
-[NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It's a low overhead tool suite that performs a variety of functions on each host system including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting.
-
-DCGM go bindings makes administering and monitoring containerized GPU applications easy.
-
-### DCGM Samples
-
-DCGM can be run in different modes, seven [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/README.md) and a [REST API](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/restApi/README.md) are included for showing how to use the DCGM API and run it in different modes.
+This GitHub repository contains Go bindings for the following two libraries:
+- [NVIDIA Management Library (NVML)](https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html#nvml-api-reference) is a C-based API for monitoring and managing NVIDIA GPU devices.
+- [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It's a low-overhead tool suite that performs a variety of functions on each host system, including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting.
+
+You will also find samples for both of these bindings in this repository.
 
 ## DCGM exporter
 
-GPU metrics exporter for [Prometheus](https://prometheus.io/) leveraging [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a simple shell script that starts nv-hostengine, reads GPU metrics every 1 second and converts it to a standard Prometheus format.
+This GitHub repository also contains the DCGM exporter. It exposes GPU metrics to [Prometheus](https://prometheus.io/) leveraging [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm).
 
 Find the installation and run instructions [here](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/exporters/prometheus-dcgm/README.md).
 
+### Quickstart
+
+To gather metrics on a GPU node, simply start the dcgm-exporter container:
+```
+$ docker run -d --gpus all --rm -p 8080:8080 nvidia/dcgm-exporter:latest
+$ curl localhost:8080/metrics
+# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
+# TYPE DCGM_FI_DEV_SM_CLOCK gauge
+# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
+# TYPE DCGM_FI_DEV_MEM_CLOCK gauge
+# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C).
+# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge
+...
+DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 139
+DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 405
+DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 9223372036854775794
+...
+```
+
+### Quickstart on Kubernetes
+
+Note: Consider using the [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) rather than the DCGM exporter directly.
+
+To gather metrics on your GPU nodes, you can deploy the daemonset:
+```
+$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.0/daemonset.yaml
+
+# Let's get the output of a random pod:
+$ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter, app.kubernetes.io/version=2.0.0-rc.0" \
+         -o "jsonpath={ .items[0].metadata.name}")
+
+$ kubectl proxy --port=8080 &
+$ curl http://localhost:8080/api/v1/namespaces/default/pods/$NAME:8080/proxy/metrics
+# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
+# TYPE DCGM_FI_DEV_SM_CLOCK gauge
+# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
+# TYPE DCGM_FI_DEV_MEM_CLOCK gauge
+# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C).
+# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge
+...
+DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 139
+DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 405
+DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 9223372036854775794
+...
+```
+
 ## Issues and Contributing
 
 [Checkout the Contributing document!](CONTRIBUTING.md)
 
 * Please let us know by [filing a new issue](https://github.com/NVIDIA/gpu-monitoring-tools/issues/new)
-* You can contribute by opening a [pull request](https://help.github.com/articles/using-pull-requests/)
+* You can contribute by opening a [pull request](https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools)
diff --git a/RELEASE.md b/RELEASE.md
new file mode 100644
index 0000000..06d9126
--- /dev/null
+++ b/RELEASE.md
@@ -0,0 +1,47 @@
+# Release
+
+This document describes the release process and the versioning strategy for the DCGM exporter.
+In the future, this document will also contain information about the Go bindings.
+
+## Versioning
+
+The DCGM exporter container version has three major components:
+- The DCGM version (e.g. 1.7.1)
+- The exporter version (e.g. 2.0.0)
+- The container platform (e.g. ubuntu18.04)
+
+The overall version of the DCGM container has two forms and a floating `latest` tag (see the example at the end of this document):
+- The long form: `${DCGM_VERSION}-${EXPORTER_VERSION}-${PLATFORM}`
+- The short form: `${DCGM_VERSION}`
+- The latest tag: `latest`
+
+The long form is a unique tag: once pushed, it will always refer to the same container.
+No updates will ever be made to that tag; it is immutable.
+
+The short form refers to the latest EXPORTER_VERSION, with the platform fixed to ubuntu18.04.
+The latest tag refers to the latest short form (i.e., the latest DCGM_VERSION and EXPORTER_VERSION).
+
+Note: We do not maintain multiple version branches.
+
+## Releases
+
+Releases of new versions are done on demand and do not follow DCGM's release cadence.
+It is, however, very likely that a new version of the exporter will be released whenever a new version of DCGM comes out.
+
+Every commit to the master branch generates an image in the GitLab registry.
+Tagging a version pushes an image to the nvidia/dcgm-exporter repository on Docker Hub.
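+
+## Example
+
+As an illustration of the versioning scheme above, assume DCGM 1.7.1, exporter 2.0.0-rc.0 and the ubuntu18.04 platform (hypothetical values). A single release is then reachable through the following tags:
+```
+nvidia/dcgm-exporter:1.7.1-2.0.0-rc.0-ubuntu18.04   # long form, never updated once pushed
+nvidia/dcgm-exporter:1.7.1                          # short form, tracks the latest exporter on ubuntu18.04
+nvidia/dcgm-exporter:latest                         # tracks the latest short form
+```
+
+A release is cut by pushing a git tag, which triggers the release stage in `.gitlab-ci.yml` (the tag name below is illustrative):
+```
+git tag 2.0.0-rc.0
+git push origin 2.0.0-rc.0
+```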