Update tagging strategy

Signed-off-by: Renaud Gaubert <[email protected]>
tkestack · Apr 21, 2020 · 95da92e · 95da92e
1 parent 9435829
commit 95da92e
Show file tree

Hide file tree

Showing 4 changed files with 111 additions and 50 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -4,8 +4,6 @@ services:
   - docker:dind
 
 variables:
-  DCGM_VERSION: "1.7.1"
-  METRICS_EXPORTER_VERSION: "1.0.0-alpha.0"
 
 stages:
   - build
@@ -14,40 +12,23 @@ stages:
 dcgm:exporter:
   stage: build
   script:
-    - apk add make bash
+    - apk add make
     - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}"
 
-    - make -C dcgm-exporter REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" DCGM="${DCGM_VERSION}" build
-    - make -C dcgm-exporter REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" push
-
-gpu:metrics:exporter:
-  stage: build
-  script:
-    - apk add make bash
-    - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}"
-
-    - make -C pod-gpu-metrics-exporter REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" build
-    - make -C pod-gpu-metrics-exporter REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" push
+    - make REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" DCGM="${DCGM_VERSION}" build
+    - make REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" push
+    - make REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" push-short
+    - make REGISTRY="${CI_REGISTRY_IMAGE}" TAG="${CI_COMMIT_SHA}" push-latest
 
 dcgm:exporter:
   stage: release
   only:
     - tags
   script:
-    - apk add make bash
+    - apk add make
     - docker login -u "${REGISTRY_USER}" -p "${REGISTRY_TOKEN}"
 
-    - make -C dcgm-exporter REGISTRY="nvidia" TAG="${DCGM_VERSION}" DCGM="${DCGM_VERSION}" build
-    - make -C dcgm-exporter REGISTRY="nvidia" TAG="${DCGM_VERSION}" push
-
-gpu:metrics:exporter:
-  stage: release
-  only:
-    - tags
-  script:
-    - apk add make bash
-    - docker login -u "${REGISTRY_USER}" -p "${REGISTRY_TOKEN}"
-
-    - make -C pod-gpu-metrics-exporter REGISTRY="nvidia" TAG="${METRICS_EXPORTER_VERSION}" build
-    - make -C pod-gpu-metrics-exporter REGISTRY="nvidia" TAG="${METRICS_EXPORTER_VERSION}" push
-
+    - make REGISTRY="nvidia" DCGM="${DCGM_VERSION}" build
+    - make REGISTRY="nvidia" push
+    - make REGISTRY="nvidia" push-short
+    - make REGISTRY="nvidia" push-latest
diff --git a/Makefile b/Makefile
@@ -17,14 +17,29 @@ MKDIR    ?= mkdir
 REGISTRY ?= nvidia/toolkit
 
 GOLANG_VERSION := 1.14.2
-VERSION        := 2.0.0
+VERSION        := 2.0.0-rc.0
 
-.PHONY: all
+.PHONY: all binary install push push-short push-latest # command-style targets: none of these names is an output file
 all: ubuntu18.04 ubi8 # Aggregate target: runs the ubuntu18.04 and ubi8 targets
 
 binary: # Compile the pkg package into ./dcgm-exporter with the local Go toolchain
 	go build -o dcgm-exporter github.com/NVIDIA/gpu-monitoring-tools/pkg
 
+install: binary # Build, then copy the exporter to /usr/bin with mode 755 (owner rwx, others r-x; never world-writable)
+	install -m 755 dcgm-exporter /usr/bin/dcgm-exporter
+
+push: # Push both per-platform image tags (ubuntu18.04 and ubi8) for the current VERSION
+	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(VERSION)-ubuntu18.04"
+	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(VERSION)-ubi8"
+
+push-short: # Re-tag the ubuntu18.04 image as the bare VERSION tag, then push that tag
+	$(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:$(VERSION)"
+	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(VERSION)"
+
+push-latest: # Re-tag the ubuntu18.04 image as "latest", then push that tag
+	$(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:latest"
+	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:latest"
+
 ubuntu18.04:
 	$(DOCKER) build --pull \
 		--build-arg GOLANG_VERSION="$(GOLANG_VERSION)" \

diff --git a/README.md b/README.md
@@ -1,34 +1,67 @@
 # NVIDIA GPU Monitoring Tools
 
-## NVML Go Bindings
+## Bindings
 
-[NVIDIA Management Library (NVML)](https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html#nvml-api-reference) is a C-based API for monitoring and managing NVIDIA GPU devices. 
-NVML go bindings are taken from [nvidia-docker 1.0](https://github.com/NVIDIA/nvidia-docker/tree/1.0) with some improvements and additions. NVML headers are also added to the package to make it easy to use and build.
-
-### NVML Samples
-Three [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/nvml/README.md) are included to demonstrate how to use the NVML API.
-
-
-## DCGM Go Bindings
-
-[NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It's a low overhead tool suite that performs a variety of functions on each host system including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting.
-
-DCGM go bindings makes administering and monitoring containerized GPU applications easy.
-
-### DCGM Samples
-
-DCGM can be run in different modes, seven [samples](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/README.md) and a [REST API](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/bindings/go/samples/dcgm/restApi/README.md) are included for showing how to use the DCGM API and run it in different modes.
+This GitHub repository contains Golang bindings for the following two libraries:
+- [NVIDIA Management Library (NVML)](https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html#nvml-api-reference) is a C-based API for monitoring and managing NVIDIA GPU devices.
+- [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It's a low overhead tool suite that performs a variety of functions on each host system including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting.
 
+You will also find samples for both of these bindings in this repository.
 
 ## DCGM exporter
 
-GPU metrics exporter for [Prometheus](https://prometheus.io/) leveraging [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) is a simple shell script that starts nv-hostengine, reads GPU metrics every 1 second and converts it to a standard Prometheus format.
+This GitHub repository also contains the DCGM exporter software. It exposes GPU metrics to [Prometheus](https://prometheus.io/), leveraging [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/data-center-gpu-manager-dcgm).
 
 Find the installation and run instructions [here](https://github.com/NVIDIA/gpu-monitoring-tools/blob/master/exporters/prometheus-dcgm/README.md).
 
+### Quickstart
+
+To gather metrics on a GPU node, simply start the dcgm-exporter container:
+```
+$ docker run -d --gpus all --rm -p 8080:8080 nvidia/dcgm-exporter:latest
+$ curl localhost:8080/metrics
+# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
+# TYPE DCGM_FI_DEV_SM_CLOCK gauge
+# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
+# TYPE DCGM_FI_DEV_MEM_CLOCK gauge
+# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C).
+# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge
+...
+DCGM_FI_DEV_SM_CLOCK{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 139
+DCGM_FI_DEV_MEM_CLOCK{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 405
+DCGM_FI_DEV_MEMORY_TEMP{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 9223372036854775794
+...
+```
+
+### Quickstart on Kubernetes
+
+Note: Consider using the [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) rather than the DCGM exporter directly.
+To gather metrics on your GPU nodes you can deploy the daemonset:
+```
+$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.0/daemonset.yaml
+
+# Let's get the output of a random pod:
+$ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter, app.kubernetes.io/version=2.0.0-rc.0" \
+                         -o "jsonpath={ .items[0].metadata.name}")
+
+$ kubectl proxy --port=8080
+$ curl http://localhost:8080/api/v1/namespaces/default/pods/$NAME:8080/proxy
+# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
+# TYPE DCGM_FI_DEV_SM_CLOCK gauge
+# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
+# TYPE DCGM_FI_DEV_MEM_CLOCK gauge
+# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C).
+# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge
+...
+DCGM_FI_DEV_SM_CLOCK{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 139
+DCGM_FI_DEV_MEM_CLOCK{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 405
+DCGM_FI_DEV_MEMORY_TEMP{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 9223372036854775794
+...
+```
+
 ## Issues and Contributing
 
 [Check out the Contributing document!](CONTRIBUTING.md)
 
 * Please let us know by [filing a new issue](https://github.com/NVIDIA/gpu-monitoring-tools/issues/new)
-* You can contribute by opening a [pull request](https://help.github.com/articles/using-pull-requests/)
+* You can contribute by opening a [pull request](https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools)
diff --git a/RELEASE.md b/RELEASE.md
@@ -0,0 +1,32 @@
+# Release
+
+This document describes the release process as well as the versioning strategy for the DCGM exporter.
+In the future this document will also contain information about the go bindings.
+
+## Versioning
+
+The DCGM container has three major components:
+- The DCGM Version (e.g: 1.17.3)
+- The Exporter Version (e.g: 2.0.0)
+- The platform of the container (e.g: ubuntu18.04)
+
+The overall version of the DCGM container takes one of the following forms:
+- The long form: `${DCGM_VERSION}-${EXPORTER_VERSION}-${PLATFORM}`
+- The short form: `${DCGM_VERSION}`
+- The latest tag: `latest`
+
+The long form is a unique tag that once pushed will always refer to the same container.
+This means that no updates will be made to that tag and it will always point to the same container.
+
+The short form refers to the latest EXPORTER_VERSION with the platform fixed to ubuntu18.04.
+The latest tag refers to the latest short form (i.e: latest DCGM_VERSION and EXPORTER_VERSION).
+
+Note: We do not maintain multiple version branches.
+
+## Releases
+
+Release of newer versions is done on demand and does not follow DCGM's release cadence.
+Though it is very likely that when a new version of DCGM comes out a new version of the exporter will be released.
+
+Every commit to the master branch generates an image on the GitLab registry.
+Tagging a version will push an image to the nvidia/dcgm-exporter repository on Docker Hub.