diff --git a/README.md b/README.md index c4efbdb..0c2647d 100644 --- a/README.md +++ b/README.md @@ -40,10 +40,10 @@ Note: Consider using the [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-ope Ensure you have already setup your cluster with the [default runtime as NVIDIA](https://github.com/NVIDIA/nvidia-container-runtime#docker-engine-setup). To gather metrics on your GPU nodes you can deploy the daemonset: ``` -$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.9/dcgm-exporter.yaml +$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.11/dcgm-exporter.yaml # Let's get the output of a random pod: -$ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter, app.kubernetes.io/version=2.0.0-rc.9" \ +$ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter, app.kubernetes.io/version=2.0.0-rc.11" \ -o "jsonpath={ .items[0].metadata.name}") $ kubectl port-forward $NAME 8080:9400 & @@ -68,7 +68,7 @@ $ helm repo add stable https://kubernetes-charts.storage.googleapis.com $ helm install stable/prometheus-operator --generate-name \ --set "prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false" $ kubectl create -f \ - https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.9/service-monitor.yaml + https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.11/service-monitor.yaml # Note might take ~1-2 minutes for prometheus to pickup the metrics and display them # You can also check in the WebUI the servce-discovery tab (in the Status category) @@ -133,7 +133,7 @@ DCGM_FI_DEV_MEMORY_TEMP{gpu="0", UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52" ### Changing the Metrics With dcgm-exporter 2.0 you can configure which fields are collected by specifying a custom CSV file. 
-You will find the [default CSV file here](https://github.com/NVIDIA/gpu-monitoring-tools/blob/2.0.0-rc.9/etc/dcgm-exporter/default-counters.csv) and on your system or container at /etc/dcgm-exporter/default-counters.csv +You will find the [default CSV file here](https://github.com/NVIDIA/gpu-monitoring-tools/blob/2.0.0-rc.11/etc/dcgm-exporter/default-counters.csv) and on your system or container at /etc/dcgm-exporter/default-counters.csv The format of this file is pretty straightforward: ``` @@ -159,7 +159,7 @@ Notes: You can find the official NVIDIA dcgm-exporter dashboard here: https://grafana.com/grafana/dashboards/12239 -You will also find the json file on this repo: https://github.com/NVIDIA/gpu-monitoring-tools/blob/2.0.0-rc.9/grafana/dcgm-exporter-dashboard.json +You will also find the JSON file in this repo: https://github.com/NVIDIA/gpu-monitoring-tools/blob/2.0.0-rc.11/grafana/dcgm-exporter-dashboard.json Pull requests are accepted! diff --git a/dcgm-exporter.yaml b/dcgm-exporter.yaml index 19a8bbf..cfb9b5b 100644 --- a/dcgm-exporter.yaml +++ b/dcgm-exporter.yaml @@ -18,19 +18,19 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.9" + app.kubernetes.io/version: "2.0.0-rc.11" spec: updateStrategy: type: RollingUpdate selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.9" + app.kubernetes.io/version: "2.0.0-rc.11" template: metadata: labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.9" + app.kubernetes.io/version: "2.0.0-rc.11" name: "dcgm-exporter" spec: containers: @@ -64,11 +64,11 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.9" + app.kubernetes.io/version: "2.0.0-rc.11" spec: selector: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.9" + app.kubernetes.io/version: "2.0.0-rc.11" ports: - name:
"metrics" port: 9400 diff --git a/service-monitor.yaml b/service-monitor.yaml index 908f61c..0e18f7a 100644 --- a/service-monitor.yaml +++ b/service-monitor.yaml @@ -18,12 +18,12 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.9" + app.kubernetes.io/version: "2.0.0-rc.11" spec: selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.9" + app.kubernetes.io/version: "2.0.0-rc.11" endpoints: - port: "metrics" path: "/metrics"