diff --git a/Makefile b/Makefile index df61577..77016c5 100644 --- a/Makefile +++ b/Makefile @@ -18,11 +18,11 @@ REGISTRY ?= nvidia DCGM_VERSION := 2.0.10 GOLANG_VERSION := 1.14.2 -VERSION := 2.1.0-rc.1 +VERSION := 2.1.0-rc.2 FULL_VERSION := $(DCGM_VERSION)-$(VERSION) .PHONY: all binary install check-format -all: ubuntu18.04 ubi8 +all: ubuntu18.04 ubuntu20.04 ubi8 binary: go build -o dcgm-exporter github.com/NVIDIA/gpu-monitoring-tools/pkg @@ -36,6 +36,7 @@ check-format: test $$(gofmt -l pkg bindings | tee /dev/stderr | wc -l) -eq 0 push: + $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04" $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi8" @@ -51,6 +52,13 @@ push-latest: $(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:latest" $(DOCKER) push "$(REGISTRY)/dcgm-exporter:latest" +ubuntu20.04: + $(DOCKER) build --pull \ + --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \ + --build-arg "DCGM_VERSION=$(DCGM_VERSION)" \ + --tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04" \ + --file docker/Dockerfile.ubuntu20.04 . + ubuntu18.04: $(DOCKER) build --pull \ --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \ diff --git a/dcgm-exporter.yaml b/dcgm-exporter.yaml index 3ce5389..64ff25f 100644 --- a/dcgm-exporter.yaml +++ b/dcgm-exporter.yaml @@ -18,23 +18,23 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.12" + app.kubernetes.io/version: "2.1.0-rc.2" spec: updateStrategy: type: RollingUpdate selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.12" + app.kubernetes.io/version: "2.1.0-rc.2" template: metadata: labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.12" + app.kubernetes.io/version: "2.1.0-rc.2" name: "dcgm-exporter" spec: containers: - - image: "nvidia/dcgm-exporter:1.7.2" + - image: "nvidia/dcgm-exporter:2.0.10-2.1.0-rc.2-ubuntu18.04" env: - name: "DCGM_EXPORTER_LISTEN" value: ":9400" @@ -64,11 +64,11 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.12" + app.kubernetes.io/version: "2.1.0-rc.2" spec: selector: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.12" + app.kubernetes.io/version: "2.1.0-rc.2" ports: - name: "metrics" port: 9400 diff --git a/deployment/dcgm-exporter/Chart.yaml b/deployment/dcgm-exporter/Chart.yaml index 3d4730d..423d9cb 100644 --- a/deployment/dcgm-exporter/Chart.yaml +++ b/deployment/dcgm-exporter/Chart.yaml @@ -3,6 +3,6 @@ name: dcgm-exporter description: A Helm chart for DCGM exporter sources: - https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools -version: "1.0.2" -appVersion: "1.7.2" +version: "1.1.0" +appVersion: "2.0.10" kubeVersion: ">= 1.13.0" diff --git a/deployment/dcgm-exporter/templates/daemonset.yaml b/deployment/dcgm-exporter/templates/daemonset.yaml index 0ad6900..cc89e4d 100644 --- a/deployment/dcgm-exporter/templates/daemonset.yaml +++ b/deployment/dcgm-exporter/templates/daemonset.yaml @@ -73,6 +73,10 @@ spec: image: "{{ .Values.image.repository }}:{{ .Chart.AppVersion }}" {{- end }} imagePullPolicy: "{{ .Values.image.pullPolicy }}" + args: + {{- range $.Values.arguments }} + - {{ . }} + {{- end }} env: - name: "DCGM_EXPORTER_KUBERNETES" value: "true" diff --git a/deployment/dcgm-exporter/values.yaml b/deployment/dcgm-exporter/values.yaml index 9e8abf3..8b999c7 100644 --- a/deployment/dcgm-exporter/values.yaml +++ b/deployment/dcgm-exporter/values.yaml @@ -17,7 +17,10 @@ image: pullPolicy: IfNotPresent # Image tag defaults to AppVersion, but you can use the tag key # for the image tag, e.g: - #tag: 1.7.2 + tag: 2.0.10-2.1.0-rc.2-ubuntu18.04 + +# Comment the following line to stop profiling metrics from DCGM +arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"] imagePullSecrets: [] nameOverride: "" @@ -38,9 +41,8 @@ podSecurityContext: {} securityContext: runAsNonRoot: false runAsUser: 0 - # capabilities: - # drop: - # - ALL + capabilities: + add: ["SYS_ADMIN"] # readOnlyRootFilesystem: true service: diff --git a/docker/Dockerfile.ubuntu20.04 b/docker/Dockerfile.ubuntu20.04 new file mode 100644 index 0000000..d30736d --- /dev/null +++ b/docker/Dockerfile.ubuntu20.04 @@ -0,0 +1,34 @@ +ARG GOLANG_VERSION +FROM golang:$GOLANG_VERSION AS builder +WORKDIR /go/src/github.com/NVIDIA/gpu-monitoring-tools + +COPY . . + +RUN make binary check-format + +FROM nvidia/cuda:11.0-base-ubuntu20.04 +LABEL io.k8s.display-name="NVIDIA DCGM Exporter" + +COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/dcgm-exporter /usr/bin/ +COPY etc/dcgm-exporter /etc/dcgm-exporter + +ARG DCGM_VERSION +RUN apt-get update && apt-get install -y --no-install-recommends \ + libcap2-bin \ + libgomp1 \ + wget && \ + rm -rf /var/lib/apt/lists/* && \ + wget --no-check-certificate https://developer.download.nvidia.com/compute/redist/dcgm/${DCGM_VERSION}/DEBS/datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb && \ + dpkg -i datacenter-gpu-manager_*.deb && \ + rm -f datacenter-gpu-manager_*.deb + +# Required for DCP metrics +ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 + +ENV NVIDIA_VISIBLE_DEVICES=all + +ENV NO_SETCAP= +COPY docker/docker-entrypoint.sh /usr/local/dcgm/docker-entrypoint.sh +RUN chmod +x /usr/local/dcgm/docker-entrypoint.sh + +ENTRYPOINT ["/usr/local/dcgm/docker-entrypoint.sh"] diff --git a/etc/dcgm-exporter/dcp-metrics-included.csv b/etc/dcgm-exporter/dcp-metrics-included.csv index 59c784b..e781599 100644 --- a/etc/dcgm-exporter/dcp-metrics-included.csv +++ b/etc/dcgm-exporter/dcp-metrics-included.csv @@ -54,7 +54,8 @@ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink f DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. -DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. +DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. # DCP metrics,, DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). @@ -62,3 +63,9 @@ DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %). DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %). DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %). +DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %). +DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %). +DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %). +DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. +DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload. + diff --git a/grafana/dcgm-exporter-dashboard.json b/grafana/dcgm-exporter-dashboard.json index ed2be0a..40f89d8 100644 --- a/grafana/dcgm-exporter-dashboard.json +++ b/grafana/dcgm-exporter-dashboard.json @@ -459,96 +459,6 @@ "alignLevel": null } }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "hiddenSeries": false, - "id": 4, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "DCGM_FI_DEV_MEM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"} * 1000000", - "interval": "", - "legendFormat": "GPU {{gpu}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPU Memory Clocks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "hertz", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, { "aliasColors": {}, "bars": false, @@ -639,96 +549,6 @@ "alignLevel": null } }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 24 - }, - "hiddenSeries": false, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "DCGM_FI_DEV_MEM_COPY_UTIL{instance=~\"${instance}\", gpu=~\"${gpu}\"}", - "interval": "", - "legendFormat": "GPU {{gpu}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPU Mem Cpy Utilization", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": null, - "logBase": 1, - "max": "100", - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, { "aliasColors": {}, "bars": false, @@ -829,15 +649,15 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 32 + "x": 0, + "y": 24 }, "hiddenSeries": false, - "id": 20, + "id": 4, "legend": { "alignAsTable": true, "avg": true, - "current": false, + "current": true, "max": true, "min": false, "rightSide": true, @@ -861,7 +681,7 @@ "steppedLine": false, "targets": [ { - "expr": "DCGM_FI_DEV_FB_FREE{instance=~\"${instance}\", gpu=~\"${gpu}\"}", + "expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -871,11 +691,11 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "GPU Framebuffer Mem Free", + "title": "Tensor Core Utilization", "tooltip": { "shared": true, "sort": 0, - "value_type": "individual" + "value_type": "cumulative" }, "type": "graph", "xaxis": { @@ -887,11 +707,11 @@ }, "yaxes": [ { - "format": "decmbytes", + "format": "percentunit", "label": null, "logBase": 1, - "max": null, - "min": null, + "max": "1", + "min": "0", "show": true }, { diff --git a/service-monitor.yaml b/service-monitor.yaml index 7e39764..9477b98 100644 --- a/service-monitor.yaml +++ b/service-monitor.yaml @@ -18,12 +18,12 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.12" + app.kubernetes.io/version: "2.1.0-rc.2" spec: selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.0.0-rc.12" + app.kubernetes.io/version: "2.1.0-rc.2" endpoints: - port: "metrics" path: "/metrics"