From 0520bfa2a3ee895e3a3f4858936093f707706c7f Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Tue, 1 Sep 2020 17:27:26 +0000 Subject: [PATCH 1/8] Make proper use of the gpu variable The dashboard has a 'gpu' variable that's otherwise unused. This change switches the variable from single to multi-value, adds the 'all' option, and updates all expressions to match against the selected GPUs. --- grafana/dcgm-exporter-dashboard.json | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/grafana/dcgm-exporter-dashboard.json b/grafana/dcgm-exporter-dashboard.json index ad3d0b2..c4703f2 100644 --- a/grafana/dcgm-exporter-dashboard.json +++ b/grafana/dcgm-exporter-dashboard.json @@ -100,7 +100,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_GPU_TEMP", + "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=~\"${gpu}\"}", "instant": true, "interval": "", "legendFormat": "GPU {{gpu}}", @@ -198,7 +198,7 @@ "pluginVersion": "6.7.3", "targets": [ { - "expr": "avg(DCGM_FI_DEV_GPU_TEMP)", + "expr": "avg(DCGM_FI_DEV_GPU_TEMP{gpu=~\"${gpu}\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -253,7 +253,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_POWER_USAGE", + "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -370,7 +370,7 @@ "pluginVersion": "6.7.3", "targets": [ { - "expr": "sum(DCGM_FI_DEV_POWER_USAGE)", + "expr": "sum(DCGM_FI_DEV_POWER_USAGE{gpu=~\"${gpu}\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -426,7 +426,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_SM_CLOCK", + "expr": "DCGM_FI_DEV_SM_CLOCK{gpu=~\"${gpu}\"}", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -519,7 +519,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_MEM_CLOCK", + "expr": "DCGM_FI_DEV_MEM_CLOCK{gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -609,7 +609,7 @@ "steppedLine": 
false, "targets": [ { - "expr": "DCGM_FI_DEV_GPU_UTIL", + "expr": "DCGM_FI_DEV_GPU_UTIL{gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -699,7 +699,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_MEM_COPY_UTIL", + "expr": "DCGM_FI_DEV_MEM_COPY_UTIL{gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -788,7 +788,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_FB_USED", + "expr": "DCGM_FI_DEV_FB_USED{gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -878,7 +878,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_FB_USED", + "expr": "DCGM_FI_DEV_FB_USED{gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -938,10 +938,10 @@ "datasource": "${DS_PROMETHEUS}", "definition": "label_values(gpu)", "hide": 0, - "includeAll": false, + "includeAll": true, "index": -1, "label": null, - "multi": false, + "multi": true, "name": "gpu", "options": [], "query": "label_values(gpu)", @@ -982,4 +982,4 @@ "list": [] }, "version": 1 -} \ No newline at end of file +} From 0be571e79eccb7ed6507c0fb025c5d718b7a4c14 Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Fri, 4 Sep 2020 14:27:11 +0100 Subject: [PATCH 2/8] Add an instance drop-down When you monitor more than one machine, you need to be able to distinguish between them. 
--- grafana/dcgm-exporter-dashboard.json | 42 +++++++++++++++++++++------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/grafana/dcgm-exporter-dashboard.json b/grafana/dcgm-exporter-dashboard.json index c4703f2..73a4458 100644 --- a/grafana/dcgm-exporter-dashboard.json +++ b/grafana/dcgm-exporter-dashboard.json @@ -100,7 +100,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=~\"${gpu}\"}", + "expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "instant": true, "interval": "", "legendFormat": "GPU {{gpu}}", @@ -198,7 +198,7 @@ "pluginVersion": "6.7.3", "targets": [ { - "expr": "avg(DCGM_FI_DEV_GPU_TEMP{gpu=~\"${gpu}\"})", + "expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -253,7 +253,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=~\"${gpu}\"}", + "expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -370,7 +370,7 @@ "pluginVersion": "6.7.3", "targets": [ { - "expr": "sum(DCGM_FI_DEV_POWER_USAGE{gpu=~\"${gpu}\"})", + "expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -426,7 +426,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_SM_CLOCK{gpu=~\"${gpu}\"}", + "expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -519,7 +519,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_MEM_CLOCK{gpu=~\"${gpu}\"}", + "expr": "DCGM_FI_DEV_MEM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -609,7 +609,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_GPU_UTIL{gpu=~\"${gpu}\"}", + "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"${instance}\", 
gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -699,7 +699,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_MEM_COPY_UTIL{gpu=~\"${gpu}\"}", + "expr": "DCGM_FI_DEV_MEM_COPY_UTIL{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -788,7 +788,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_FB_USED{gpu=~\"${gpu}\"}", + "expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -878,7 +878,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_FB_USED{gpu=~\"${gpu}\"}", + "expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -932,6 +932,28 @@ "tags": [], "templating": { "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": true, + "name": "instance", + "options": [], + "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": {}, From 8b8ed66d4177702ef52aab58a72127ac9fa925c8 Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Fri, 4 Sep 2020 14:45:16 +0100 Subject: [PATCH 3/8] The temperature graph needs a time series Disable the 'instant' flag, that only produces the latest value. 
--- grafana/dcgm-exporter-dashboard.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grafana/dcgm-exporter-dashboard.json b/grafana/dcgm-exporter-dashboard.json index 73a4458..2e0c8d8 100644 --- a/grafana/dcgm-exporter-dashboard.json +++ b/grafana/dcgm-exporter-dashboard.json @@ -101,7 +101,7 @@ "targets": [ { "expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"}", - "instant": true, + "instant": false, "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" From 664f16500fbbd170e55b3ecca01aa7947dd1bb3a Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Fri, 4 Sep 2020 15:02:43 +0100 Subject: [PATCH 4/8] Reset the Y-axis limits These don't make sense for all but the two utilisation graphs, and end up hiding information. --- grafana/dcgm-exporter-dashboard.json | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/grafana/dcgm-exporter-dashboard.json b/grafana/dcgm-exporter-dashboard.json index 2e0c8d8..9587d7a 100644 --- a/grafana/dcgm-exporter-dashboard.json +++ b/grafana/dcgm-exporter-dashboard.json @@ -130,8 +130,8 @@ "format": "celsius", "label": null, "logBase": 1, - "max": "100", - "min": "0", + "max": null, + "min": null, "show": true }, { @@ -283,7 +283,7 @@ "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { @@ -458,8 +458,8 @@ "format": "hertz", "label": "", "logBase": 1, - "max": "100", - "min": "0", + "max": null, + "min": null, "show": true }, { @@ -548,8 +548,8 @@ "format": "hertz", "label": null, "logBase": 1, - "max": "100", - "min": "0", + "max": null, + "min": null, "show": true }, { @@ -817,8 +817,8 @@ "format": "decbytes", "label": null, "logBase": 1, - "max": "100", - "min": "0", + "max": null, + "min": null, "show": true }, { @@ -907,8 +907,8 @@ "format": "decbytes", "label": null, "logBase": 1, - "max": "100", - "min": "0", + "max": null, + "min": null, "show": true }, { From 0494408752abf31fb94c554d78e9dadc1a3cc9a3 Mon 
Sep 17 00:00:00 2001 From: Martijn Pieters Date: Fri, 4 Sep 2020 15:13:28 +0100 Subject: [PATCH 5/8] Remove value mappings from GPU Power Total gauge Someone probably forgot to remove these after experimenting; these mapped a power level of _1_ to `"value to text"`, and a half-configured range (no _from_ or _to_ set) that mapped power levels of _2_ to `"range to text"`. I'd be very surprised if these would ever actually show up; I don't expect a real-world GPU to ever report using exactly 1 or 2 watts. --- grafana/dcgm-exporter-dashboard.json | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/grafana/dcgm-exporter-dashboard.json b/grafana/dcgm-exporter-dashboard.json index 9587d7a..ba0f993 100644 --- a/grafana/dcgm-exporter-dashboard.json +++ b/grafana/dcgm-exporter-dashboard.json @@ -320,24 +320,7 @@ "color": { "mode": "thresholds" }, - "mappings": [ - { - "id": 0, - "op": "=", - "text": "value to text", - "type": 1, - "value": "1" - }, - { - "from": "", - "id": 1, - "operator": "", - "text": "range to text", - "to": "", - "type": 1, - "value": "2" - } - ], + "mappings": [], "max": 2400, "min": 0, "nullValueMode": "connected", From f3cfcaba6ac3c962690021d730de47454893eb82 Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Fri, 4 Sep 2020 15:18:49 +0100 Subject: [PATCH 6/8] Correct the 'free' graph The `DCGM_FI_DEV_FB_USED` metric is already being used in the graph to its left. 
--- grafana/dcgm-exporter-dashboard.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grafana/dcgm-exporter-dashboard.json b/grafana/dcgm-exporter-dashboard.json index ba0f993..2ae6a4c 100644 --- a/grafana/dcgm-exporter-dashboard.json +++ b/grafana/dcgm-exporter-dashboard.json @@ -861,7 +861,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}", + "expr": "DCGM_FI_DEV_FB_FREE{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" From 7b4deaedc089292d3a9dcea2f2a61ba1c60875ad Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Fri, 4 Sep 2020 15:23:06 +0100 Subject: [PATCH 7/8] Correct memory metric units DCGM_FI_DEV_FB_FREE / DCGM_FI_DEV_FB_USED report memory in MB, not bytes. --- grafana/dcgm-exporter-dashboard.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grafana/dcgm-exporter-dashboard.json b/grafana/dcgm-exporter-dashboard.json index 2ae6a4c..234f571 100644 --- a/grafana/dcgm-exporter-dashboard.json +++ b/grafana/dcgm-exporter-dashboard.json @@ -797,7 +797,7 @@ }, "yaxes": [ { - "format": "decbytes", + "format": "decmbytes", "label": null, "logBase": 1, "max": null, @@ -887,7 +887,7 @@ }, "yaxes": [ { - "format": "decbytes", + "format": "decmbytes", "label": null, "logBase": 1, "max": null, From 3439a3ca2b389b3062a9ed09f091762734c5b794 Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Fri, 4 Sep 2020 15:36:01 +0100 Subject: [PATCH 8/8] Adjust clock values The exporter reports these in MHz, but there is no MHz option in the Grafana standard units. Instead, we can multiply by 1 million to map MHz back to Hz, and Grafana will automatically use MHz when displaying these. 
--- grafana/dcgm-exporter-dashboard.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grafana/dcgm-exporter-dashboard.json b/grafana/dcgm-exporter-dashboard.json index 234f571..ed2be0a 100644 --- a/grafana/dcgm-exporter-dashboard.json +++ b/grafana/dcgm-exporter-dashboard.json @@ -409,7 +409,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"}", + "expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"} * 1000000", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -502,7 +502,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_MEM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"}", + "expr": "DCGM_FI_DEV_MEM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"} * 1000000", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A"