Skip to content

Commit b70474f

Browse files
committed
Add GPU metric only node-exporter
1 parent fe14af6 commit b70474f

File tree

2 files changed

+104
-1
lines changed

2 files changed

+104
-1
lines changed

exporters/prometheus-dcgm/README.md

+8-1
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,17 @@ $ kubectl label nodes <gpu-node-name> hardware-type=NVIDIAGPU
4545
# Check if the label is added
4646
$ kubectl get nodes --show-labels
4747

48+
# Deploy node-exporter collecting GPU metrics in addition to its default metrics
4849
$ kubectl create -f node-exporter-daemonset.yaml
4950

50-
# Check if node-exporter is collecting the metrics
51+
# Check if node-exporter is collecting the GPU metrics successfully
5152
$ curl -s localhost:9100/metrics | grep dcgm
53+
54+
# Deploy node-exporter collecting only GPU metrics
55+
$ kubectl create -f dcgm-exporter-daemonset.yaml
56+
57+
# Check GPU metrics
58+
$ curl -s localhost:9101/metrics
5259
```
5360

5461
### Helm Charts
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
---
# DaemonSet running node-exporter so that it exposes only the GPU metrics
# produced by dcgm-exporter.
# Every collector that node-exporter enables by default is switched off below;
# only the textfile collector stays active.
# Refer: https://github.com/prometheus/node_exporter/tree/release-0.16
#
# apps/v1 is used because extensions/v1beta1 DaemonSets are no longer served
# as of Kubernetes 1.16. apps/v1 requires an explicit spec.selector that
# matches the pod template labels.
# NOTE(review): apps/v1 defaults updateStrategy to RollingUpdate, whereas
# extensions/v1beta1 defaulted to OnDelete; set updateStrategy.type: OnDelete
# if the previous rollout behavior must be kept.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: dcgm-exporter
spec:
  selector:
    matchLabels:
      app: dcgm-exporter
  template:
    metadata:
      labels:
        app: dcgm-exporter
      name: dcgm-exporter
    spec:
      # Schedule only on nodes labelled as carrying NVIDIA GPUs.
      nodeSelector:
        hardware-type: NVIDIAGPU
      containers:
        - image: quay.io/prometheus/node-exporter:v0.16.0
          name: node-exporter
          args:
            # Listen on 9101 rather than node-exporter's usual 9100.
            - "--web.listen-address=0.0.0.0:9101"
            - "--path.procfs=/host/proc"
            - "--path.sysfs=/host/sys"
            # Directory scanned by the textfile collector; shared with the
            # nvidia-dcgm-exporter container through the collector-textfiles
            # volume.
            - "--collector.textfile.directory=/run/prometheus"
            # Disable every collector that is enabled by default.
            - "--no-collector.arp"
            - "--no-collector.bcache"
            - "--no-collector.bonding"
            - "--no-collector.conntrack"
            - "--no-collector.cpu"
            - "--no-collector.diskstats"
            - "--no-collector.edac"
            - "--no-collector.entropy"
            - "--no-collector.filefd"
            - "--no-collector.filesystem"
            - "--no-collector.hwmon"
            - "--no-collector.infiniband"
            - "--no-collector.ipvs"
            - "--no-collector.loadavg"
            - "--no-collector.mdadm"
            - "--no-collector.meminfo"
            - "--no-collector.netdev"
            - "--no-collector.netstat"
            - "--no-collector.nfs"
            - "--no-collector.nfsd"
            - "--no-collector.sockstat"
            - "--no-collector.stat"
            - "--no-collector.time"
            - "--no-collector.timex"
            - "--no-collector.uname"
            - "--no-collector.vmstat"
            - "--no-collector.wifi"
            - "--no-collector.xfs"
            - "--no-collector.zfs"
          ports:
            - name: metrics
              containerPort: 9101
              hostPort: 9101
          resources:
            requests:
              memory: 30Mi
              cpu: 100m
            limits:
              memory: 50Mi
              cpu: 200m
          volumeMounts:
            - name: proc
              readOnly: true
              mountPath: /host/proc
            - name: sys
              readOnly: true
              mountPath: /host/sys
            # node-exporter only reads the metric files, hence read-only.
            - name: collector-textfiles
              readOnly: true
              mountPath: /run/prometheus
        - image: nvidia/dcgm-exporter:1.4.6
          name: nvidia-dcgm-exporter
          # Runs as root (UID 0).
          securityContext:
            runAsNonRoot: false
            runAsUser: 0
          volumeMounts:
            # Mounted writable (no readOnly flag) in this container.
            - name: collector-textfiles
              mountPath: /run/prometheus

      hostNetwork: true
      hostPID: true

      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        # In-memory scratch volume shared by both containers.
        - name: collector-textfiles
          emptyDir:
            medium: Memory

0 commit comments

Comments
 (0)