Skip to content

Commit

Permalink
Update dcgm-exporter to 2.1.0-rc.2
Browse files Browse the repository at this point in the history
Changelog:
1. Build Ubuntu 20.04 images
2. Update dcp-metrics-included.csv to include all profiling fields
3. Update Helm chart - chart now deploys DCP metrics by default and can be configured during Helm install to exclude them
4. Reconfigure the default Grafana dashboard

Fix indentation typo in dcgm-exporter.yaml
  • Loading branch information
dualvtable committed Oct 3, 2020
1 parent f7c6397 commit 488614d
Show file tree
Hide file tree
Showing 9 changed files with 82 additions and 207 deletions.
12 changes: 10 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ REGISTRY ?= nvidia

DCGM_VERSION := 2.0.10
GOLANG_VERSION := 1.14.2
VERSION := 2.1.0-rc.1
VERSION := 2.1.0-rc.2
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)

.PHONY: all binary install check-format
all: ubuntu18.04 ubi8
all: ubuntu18.04 ubuntu20.04 ubi8

binary:
go build -o dcgm-exporter github.com/NVIDIA/gpu-monitoring-tools/pkg
Expand All @@ -36,6 +36,7 @@ check-format:
test $$(gofmt -l pkg bindings | tee /dev/stderr | wc -l) -eq 0

push:
$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04"
$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04"
$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi8"

Expand All @@ -51,6 +52,13 @@ push-latest:
$(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:latest"
$(DOCKER) push "$(REGISTRY)/dcgm-exporter:latest"

# Build the Ubuntu 20.04 flavour of the dcgm-exporter image and tag it as
# $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04.
#
# The target names an image flavour, not a file on disk, so it is declared
# phony: otherwise a file or directory called "ubuntu20.04" in the source tree
# would make this rule look up to date and the image would never be rebuilt.
.PHONY: ubuntu20.04
ubuntu20.04:
	$(DOCKER) build --pull \
		--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
		--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
		--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04" \
		--file docker/Dockerfile.ubuntu20.04 .

ubuntu18.04:
$(DOCKER) build --pull \
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
Expand Down
12 changes: 6 additions & 6 deletions dcgm-exporter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,23 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.0.0-rc.12"
app.kubernetes.io/version: "2.1.0-rc.2"
spec:
updateStrategy:
type: RollingUpdate
selector:
matchLabels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.0.0-rc.12"
app.kubernetes.io/version: "2.1.0-rc.2"
template:
metadata:
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.0.0-rc.12"
app.kubernetes.io/version: "2.1.0-rc.2"
name: "dcgm-exporter"
spec:
containers:
- image: "nvidia/dcgm-exporter:1.7.2"
- image: "nvidia/dcgm-exporter:2.0.10-2.1.0-rc.2-ubuntu18.04"
env:
- name: "DCGM_EXPORTER_LISTEN"
value: ":9400"
Expand Down Expand Up @@ -64,11 +64,11 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.0.0-rc.12"
app.kubernetes.io/version: "2.1.0-rc.2"
spec:
selector:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.0.0-rc.12"
app.kubernetes.io/version: "2.1.0-rc.2"
ports:
- name: "metrics"
port: 9400
4 changes: 2 additions & 2 deletions deployment/dcgm-exporter/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ name: dcgm-exporter
description: A Helm chart for DCGM exporter
sources:
- https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools
version: "1.0.2"
appVersion: "1.7.2"
version: "1.1.0"
appVersion: "2.0.10"
kubeVersion: ">= 1.13.0"
4 changes: 4 additions & 0 deletions deployment/dcgm-exporter/templates/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ spec:
image: "{{ .Values.image.repository }}:{{ .Chart.AppVersion }}"
{{- end }}
imagePullPolicy: "{{ .Values.image.pullPolicy }}"
args:
{{- range $.Values.arguments }}
- {{ . }}
{{- end }}
env:
- name: "DCGM_EXPORTER_KUBERNETES"
value: "true"
Expand Down
10 changes: 6 additions & 4 deletions deployment/dcgm-exporter/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@ image:
pullPolicy: IfNotPresent
# Image tag defaults to AppVersion, but you can use the tag key
# for the image tag, e.g:
#tag: 1.7.2
tag: 2.0.10-2.1.0-rc.2-ubuntu18.04

# Comment out the following line to stop collecting profiling (DCP) metrics from DCGM
arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"]

imagePullSecrets: []
nameOverride: ""
Expand All @@ -38,9 +41,8 @@ podSecurityContext: {}
securityContext:
runAsNonRoot: false
runAsUser: 0
# capabilities:
# drop:
# - ALL
capabilities:
add: ["SYS_ADMIN"]
# readOnlyRootFilesystem: true

service:
Expand Down
34 changes: 34 additions & 0 deletions docker/Dockerfile.ubuntu20.04
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Multi-stage build for the Ubuntu 20.04 dcgm-exporter image.
#
# Build arguments:
#   GOLANG_VERSION  tag of the golang image used to compile the exporter
#   DCGM_VERSION    version of the datacenter-gpu-manager .deb to install

# --- Stage 1: compile the exporter binary and verify Go formatting ---
ARG GOLANG_VERSION
FROM golang:$GOLANG_VERSION AS builder
WORKDIR /go/src/github.com/NVIDIA/gpu-monitoring-tools

COPY . .

RUN make binary check-format

# --- Stage 2: runtime image ---
FROM nvidia/cuda:11.0-base-ubuntu20.04
LABEL io.k8s.display-name="NVIDIA DCGM Exporter"

COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/dcgm-exporter /usr/bin/
COPY etc/dcgm-exporter /etc/dcgm-exporter

# Install runtime dependencies, then download and install DCGM itself.
# ca-certificates is installed explicitly (it is not pulled in under
# --no-install-recommends) so wget can verify the TLS certificate of the
# download server; the package is installed as root, so certificate
# verification must not be disabled.
ARG DCGM_VERSION
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    libcap2-bin \
    libgomp1 \
    wget && \
    rm -rf /var/lib/apt/lists/* && \
    wget https://developer.download.nvidia.com/compute/redist/dcgm/${DCGM_VERSION}/DEBS/datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb && \
    dpkg -i datacenter-gpu-manager_*.deb && \
    rm -f datacenter-gpu-manager_*.deb

# Required for DCP metrics
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32

ENV NVIDIA_VISIBLE_DEVICES=all

# Empty by default. NOTE(review): presumably read by docker-entrypoint.sh to
# decide whether to skip setcap; that script is not visible here, so confirm.
ENV NO_SETCAP=
COPY docker/docker-entrypoint.sh /usr/local/dcgm/docker-entrypoint.sh
RUN chmod +x /usr/local/dcgm/docker-entrypoint.sh

ENTRYPOINT ["/usr/local/dcgm/docker-entrypoint.sh"]
9 changes: 8 additions & 1 deletion etc/dcgm-exporter/dcp-metrics-included.csv
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,18 @@ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink f
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes.
DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload.

# DCP metrics,,
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %).
DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %).
DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %).
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %).
DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %).
DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %).
DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %).
DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %).
DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload.
DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload.

Loading

0 comments on commit 488614d

Please sign in to comment.