diff --git a/README.md b/README.md index b6098e7..0572a06 100644 --- a/README.md +++ b/README.md @@ -38,14 +38,69 @@ DCGM_FI_DEV_MEMORY_TEMP{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} Note: Consider using the [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) rather than the DCGM exporter directly. To gather metrics on your GPU nodes you can deploy the daemonset: ``` -$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.0/daemonset.yaml +$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.8/dcgm-exporter.yaml # Let's get the output of a random pod: -$ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter, app.kubernetes.io/version=2.0.0-rc.0" \ +$ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter, app.kubernetes.io/version=2.0.0-rc.8" \ -o "jsonpath={ .items[0].metadata.name}") -$ kubectl proxy --port=9400 -$ curl http://localhost:9400/api/v1/namespaces/default/pods/$NAME:9400/proxy +$ kubectl proxy --port=8080 +$ curl http://localhost:8080/api/v1/namespaces/default/pods/$NAME:9400/proxy/metrics +# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). +# TYPE DCGM_FI_DEV_SM_CLOCK gauge +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). +# TYPE DCGM_FI_DEV_MEM_CLOCK gauge +# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C). +# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge +... +DCGM_FI_DEV_SM_CLOCK{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 139 +DCGM_FI_DEV_MEM_CLOCK{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 405 +DCGM_FI_DEV_MEMORY_TEMP{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 9223372036854775794 +... 
+ +# If you are using the Prometheus operator +# Note on exporters here: https://github.com/coreos/prometheus-operator/blob/release-0.38/Documentation/user-guides/running-exporters.md +$ helm install stable/prometheus-operator --generate-name --set "prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false" +$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.8/service-monitor.yaml + +$ NAME=$(kubectl get svc -l app=prometheus-operator-prometheus -o jsonpath='{.items[0].metadata.name}') +$ curl "http://localhost:8080/api/v1/namespaces/default/services/$NAME:9090/proxy/api/v1/query?query=DCGM_FI_DEV_MEMORY_TEMP" +{ + status: "success", + data: { + resultType: "vector", + result: [ + { + metric: { + UUID: "GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52", + __name__: "DCGM_FI_DEV_MEMORY_TEMP", + ... + pod: "dcgm-exporter-fn7fm", + service: "dcgm-exporter" + }, + value: [ + 1588399049.227, + "9223372036854776000" + ] + }, + ... + ] + } +} +``` + + +### Building from Source and Running on Bare Metal + +The dcgm-exporter is actually fairly straightforward to build and use. Ensure you have Go >= 1.14 installed. +``` +$ git clone https://github.com/NVIDIA/gpu-monitoring-tools.git +$ cd gpu-monitoring-tools +$ make binary +$ sudo make install +... +$ dcgm-exporter & +$ curl localhost:8081/metrics # HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). # TYPE DCGM_FI_DEV_SM_CLOCK gauge # HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). 
diff --git a/daemonset.yaml b/daemonset.yaml deleted file mode 100644 index c111707..0000000 --- a/daemonset.yaml +++ /dev/null @@ -1,35 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: gpu-metrics-exporter - labels: - app.kubernetes.io/name: dcgm-exporter - app.kubernetes.io/version: "2.0.0-rc.0" -spec: - template: - metadata: - labels: - app.kubernetes.io/name: dcgm-exporter - app.kubernetes.io/version: "2.0.0-rc.0" - name: dcgm-exporter - spec: - containers: - - image: nvidia/dcgm-exporter:2.0.0-rc.0 - env: - - name: DCGM_EXPORTER_PORT - value: 9400 - name: dcgm-exporter - ports: - - name: gpu-metrics - containerPort: 9400 - securityContext: - runAsNonRoot: false - runAsUser: 0 - volumeMounts: - - name: pod-gpu-resources - readOnly: true - mountPath: /var/lib/kubelet/pod-resources - volumes: - - name: pod-gpu-resources - hostPath: - path: /var/lib/kubelet/pod-resources diff --git a/dcgm-exporter.yaml b/dcgm-exporter.yaml new file mode 100644 index 0000000..d9a5636 --- /dev/null +++ b/dcgm-exporter.yaml @@ -0,0 +1,72 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+apiVersion: apps/v1
+kind: DaemonSet  # schedules the exporter pod on every eligible node
+metadata:
+  name: "dcgm-exporter"
+  labels:
+    app.kubernetes.io/name: "dcgm-exporter"
+    app.kubernetes.io/version: "2.0.0-rc.8"
+spec:
+  updateStrategy:
+    type: RollingUpdate
+  selector:
+    matchLabels:
+      # Name only: the selector is immutable, so a version label here blocks upgrades.
+      app.kubernetes.io/name: "dcgm-exporter"
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: "dcgm-exporter"
+        app.kubernetes.io/version: "2.0.0-rc.8"
+      name: "dcgm-exporter"
+    spec:
+      containers:
+        - image: "nvidia/dcgm-exporter:1.7.2"  # NOTE(review): tag differs from the 2.0.0-rc.8 label; confirm
+          env:
+            - name: "DCGM_EXPORTER_PORT"
+              value: "9400"  # quoted: env values must be strings
+          name: "dcgm-exporter"
+          ports:
+            - name: "metrics"  # port name referenced by the Service below
+              containerPort: 9400  # same port as DCGM_EXPORTER_PORT above
+          securityContext:
+            runAsNonRoot: false
+            runAsUser: 0  # container runs as root
+          volumeMounts:
+            - name: "pod-gpu-resources"
+              readOnly: true
+              mountPath: "/var/lib/kubelet/pod-resources"
+      volumes:
+        - name: "pod-gpu-resources"
+          hostPath:  # host directory mounted read-only into the container
+            path: "/var/lib/kubelet/pod-resources"
+
+---
+
+kind: Service
+apiVersion: v1
+metadata:
+  name: "dcgm-exporter"
+  labels:
+    app.kubernetes.io/name: "dcgm-exporter"
+    app.kubernetes.io/version: "2.0.0-rc.8"
+spec:
+  selector:
+    # Name only, so pods of any version stay behind the Service during a rollout.
+    app.kubernetes.io/name: "dcgm-exporter"
+  ports:
+    - name: "metrics"
+      port: 9400  # no targetPort set, so traffic goes to pod port 9400
diff --git a/service-monitor.yaml b/service-monitor.yaml
new file mode 100644
index 0000000..5b37d9e
--- /dev/null
+++ b/service-monitor.yaml
@@ -0,0 +1,29 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # tells the Prometheus operator which Services to scrape
+metadata:
+  name: "dcgm-exporter"
+  labels:
+    app.kubernetes.io/name: "dcgm-exporter"
+    app.kubernetes.io/version: "2.0.0-rc.8"
+spec:
+  selector:
+    matchLabels:
+      # Name only: a version label here would skip Services labelled with another version.
+      app.kubernetes.io/name: "dcgm-exporter"
+  endpoints:
+    - port: "metrics"  # Service port name, not a number
+      path: "/metrics"