From e5cda0bb2c95c7132c10fdd2c719d6d1e689d8af Mon Sep 17 00:00:00 2001
From: Renaud Gaubert <rgaubert@nvidia.com>
Date: Fri, 1 May 2020 23:02:31 -0700
Subject: [PATCH] Fix Kubernetes information and add bare-metal information

Signed-off-by: Renaud Gaubert <rgaubert@nvidia.com>
---
 README.md            | 63 +++++++++++++++++++++++++++++++++++---
 daemonset.yaml       | 35 ---------------------
 dcgm-exporter.yaml   | 72 ++++++++++++++++++++++++++++++++++++++++++++
 service-monitor.yaml | 29 ++++++++++++++++++
 4 files changed, 160 insertions(+), 39 deletions(-)
 delete mode 100644 daemonset.yaml
 create mode 100644 dcgm-exporter.yaml
 create mode 100644 service-monitor.yaml

diff --git a/README.md b/README.md
index b6098e7..0572a06 100644
--- a/README.md
+++ b/README.md
@@ -38,14 +38,69 @@ DCGM_FI_DEV_MEMORY_TEMP{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"}
 Note: Consider using the [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) rather than the DCGM exporter directly.
 To gather metrics on your GPU nodes you can deploy the daemonset:
 ```
-$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.0/daemonset.yaml
+$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.8/dcgm-exporter.yaml
 
 # Let's get the output of a random pod:
-$ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter, app.kubernetes.io/version=2.0.0-rc.0" \
+$ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter, app.kubernetes.io/version=2.0.0-rc.8" \
                          -o "jsonpath={ .items[0].metadata.name}")
 
-$ kubectl proxy --port=9400
-$ curl http://localhost:9400/api/v1/namespaces/default/pods/$NAME:9400/proxy
+$ kubectl proxy --port=8080
+$ curl http://localhost:8080/api/v1/namespaces/default/pods/$NAME:9400/proxy/metrics
+# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
+# TYPE DCGM_FI_DEV_SM_CLOCK gauge
+# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
+# TYPE DCGM_FI_DEV_MEM_CLOCK gauge
+# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C).
+# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge
+...
+DCGM_FI_DEV_SM_CLOCK{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 139
+DCGM_FI_DEV_MEM_CLOCK{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 405
+DCGM_FI_DEV_MEMORY_TEMP{gpu="0" UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"} 9223372036854775794
+...
+
+# If you are using the Prometheus operator
+# Note on exporters here: https://github.com/coreos/prometheus-operator/blob/release-0.38/Documentation/user-guides/running-exporters.md
+$ helm install stable/prometheus-operator --generate-name --set "prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false"
+$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/gpu-monitoring-tools/2.0.0-rc.8/service-monitor.yaml
+
+$ NAME=$(kubectl get svc -l app=prometheus-operator-prometheus -o jsonpath='{.items[0].metadata.name}')
+$ curl "http://localhost:8080/api/v1/namespaces/default/services/$NAME:9090/proxy/api/v1/query?query=DCGM_FI_DEV_MEMORY_TEMP"
+{
+	status: "success",
+	data: {
+		resultType: "vector",
+		result: [
+			{
+				metric: {
+					UUID: "GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52",
+					__name__: "DCGM_FI_DEV_MEMORY_TEMP",
+					...
+					pod: "dcgm-exporter-fn7fm",
+					service: "dcgm-exporter"
+				},
+				value: [
+					1588399049.227,
+					"9223372036854776000"
+				]
+			},
+			...
+		]
+	}
+}
+```
+
+
+### Building from Source and Running on Bare Metal
+
+The dcgm-exporter is straightforward to build and use. Ensure you have Go >= 1.14 installed.
+```
+$ git clone https://github.com/NVIDIA/gpu-monitoring-tools.git
+$ cd gpu-monitoring-tools
+$ make binary
+$ sudo make install
+...
+$ dcgm-exporter &
+$ curl localhost:8081/metrics
 # HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
 # TYPE DCGM_FI_DEV_SM_CLOCK gauge
 # HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
diff --git a/daemonset.yaml b/daemonset.yaml
deleted file mode 100644
index c111707..0000000
--- a/daemonset.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-apiVersion: apps/v1
-kind: DaemonSet
-metadata:
-  name: gpu-metrics-exporter
-  labels:
-    app.kubernetes.io/name: dcgm-exporter
-    app.kubernetes.io/version: "2.0.0-rc.0"
-spec:
-  template:
-    metadata:
-      labels:
-        app.kubernetes.io/name: dcgm-exporter
-        app.kubernetes.io/version: "2.0.0-rc.0"
-      name: dcgm-exporter
-    spec:
-      containers:
-      - image: nvidia/dcgm-exporter:2.0.0-rc.0
-        env:
-          - name: DCGM_EXPORTER_PORT
-            value: 9400
-        name: dcgm-exporter
-        ports:
-        - name: gpu-metrics
-          containerPort: 9400
-        securityContext:
-          runAsNonRoot: false
-          runAsUser: 0
-        volumeMounts:
-        - name: pod-gpu-resources
-          readOnly: true
-          mountPath: /var/lib/kubelet/pod-resources
-      volumes:
-      - name: pod-gpu-resources
-        hostPath:
-          path: /var/lib/kubelet/pod-resources
diff --git a/dcgm-exporter.yaml b/dcgm-exporter.yaml
new file mode 100644
index 0000000..d9a5636
--- /dev/null
+++ b/dcgm-exporter.yaml
@@ -0,0 +1,72 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: DaemonSet  # schedules the exporter pod on each eligible node
+metadata:
+  name: "dcgm-exporter"
+  labels:
+    app.kubernetes.io/name: "dcgm-exporter"
+    app.kubernetes.io/version: "2.0.0-rc.8"
+spec:
+  updateStrategy:
+    type: RollingUpdate
+  selector:  # NOTE(review): immutable in apps/v1; the version label here blocks in-place upgrades
+    matchLabels:
+      app.kubernetes.io/name: "dcgm-exporter"
+      app.kubernetes.io/version: "2.0.0-rc.8"
+  template:
+    metadata:
+      labels:  # must satisfy spec.selector.matchLabels above
+        app.kubernetes.io/name: "dcgm-exporter"
+        app.kubernetes.io/version: "2.0.0-rc.8"
+      name: "dcgm-exporter"
+    spec:
+      containers:
+      - image: "nvidia/dcgm-exporter:2.0.0-rc.8"  # keep in sync with app.kubernetes.io/version
+        env:
+        - name: "DCGM_EXPORTER_PORT"  # presumably the exporter's listen port - TODO confirm
+          value: "9400"  # env values are strings; same number as containerPort below
+        name: "dcgm-exporter"
+        ports:
+        - name: "metrics"
+          containerPort: 9400
+        securityContext:
+          runAsNonRoot: false
+          runAsUser: 0  # runs as root; presumably needed to read the kubelet pod-resources dir
+        volumeMounts:
+        - name: "pod-gpu-resources"
+          readOnly: true
+          mountPath: "/var/lib/kubelet/pod-resources"  # same path as on the host
+      volumes:
+      - name: "pod-gpu-resources"
+        hostPath:
+          path: "/var/lib/kubelet/pod-resources"
+
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  labels:  # matched by the ServiceMonitor selector in service-monitor.yaml
+    app.kubernetes.io/name: 'dcgm-exporter'
+    app.kubernetes.io/version: '2.0.0-rc.8'
+  name: 'dcgm-exporter'
+spec:
+  ports:
+  - name: 'metrics'
+    port: 9400  # no targetPort set, so traffic goes to the same port number on the pod
+  selector:  # same label pair the DaemonSet puts on its pod template
+    app.kubernetes.io/name: 'dcgm-exporter'
+    app.kubernetes.io/version: '2.0.0-rc.8'
diff --git a/service-monitor.yaml b/service-monitor.yaml
new file mode 100644
index 0000000..5b37d9e
--- /dev/null
+++ b/service-monitor.yaml
@@ -0,0 +1,29 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  labels:
+    app.kubernetes.io/name: 'dcgm-exporter'
+    app.kubernetes.io/version: '2.0.0-rc.8'
+  name: 'dcgm-exporter'
+spec:
+  endpoints:
+  - path: '/metrics'  # HTTP path scraped on each selected Service
+    port: 'metrics'  # name of a Service port, not a port number
+  selector:
+    matchLabels:  # same label pair carried by the dcgm-exporter Service
+      app.kubernetes.io/name: 'dcgm-exporter'
+      app.kubernetes.io/version: '2.0.0-rc.8'