Skip to content

Commit

Permalink
Merge branch 'master' into 'fix-chart-compatibility-check'
Browse files Browse the repository at this point in the history
# Conflicts:
#   deployment/dcgm-exporter/Chart.yaml
  • Loading branch information
decayofmind committed Nov 17, 2020
2 parents 49f16fc + 87592e0 commit acc56bd
Show file tree
Hide file tree
Showing 12 changed files with 84 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ REGISTRY ?= nvidia

DCGM_VERSION := 2.0.13
GOLANG_VERSION := 1.14.2
VERSION := 2.1.0
VERSION := 2.1.1
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)

.PHONY: all binary install check-format
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ The repository also contains DCGM-Exporter. It exposes GPU metrics exporter for

To gather metrics on a GPU node, simply start the `dcgm-exporter` container:
```
$ docker run -d --gpus all --rm -p 9400:9400 nvidia/dcgm-exporter:2.0.13-2.1.0-ubuntu18.04
$ docker run -d --gpus all --rm -p 9400:9400 nvidia/dcgm-exporter:2.0.13-2.1.1-ubuntu18.04
$ curl localhost:9400/metrics
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
Expand Down
5 changes: 5 additions & 0 deletions bindings/go/dcgm/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,8 @@ func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error)
func Introspect() (DcgmStatus, error) {
// Exported entry point: forwards to the unexported introspect and returns
// its DcgmStatus and error unchanged.
return introspect()
}

// GetSupportedMetricGroups returns all of the profiling metric groups for the
// GPU group identified by grpid. It forwards to getSupportedMetricGroups and
// returns that function's result and error unchanged.
func GetSupportedMetricGroups(grpid uint) ([]MetricGroup, error) {
return getSupportedMetricGroups(grpid)
}
47 changes: 47 additions & 0 deletions bindings/go/dcgm/profile.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package dcgm

/*
#include "dcgm_agent.h"
#include "dcgm_structs.h"
*/
import "C"
import (
"fmt"
"unsafe"
)

// MetricGroup describes one profiling metric group as filled in by
// getSupportedMetricGroups. All fields are unexported, so they are only
// readable from inside this package.
type MetricGroup struct {
	// major and minor hold the group's majorId and minorId.
	major, minor uint
	// fieldIds holds the field IDs listed for the group.
	fieldIds []uint
}

// getSupportedMetricGroups asks DCGM for the profiling metric groups of the
// GPU group grpid and converts the C result into a slice of MetricGroup.
//
// If the DCGM call reports a failure (errorString returns non-nil), the
// returned slice is nil and the error is prefixed with
// "Error getting supported metrics". When DCGM reports zero groups, the
// returned slice is also nil and the error is nil.
func getSupportedMetricGroups(grpid uint) (groups []MetricGroup, err error) {
	var info C.dcgmProfGetMetricGroups_t
	info.version = makeVersion2(unsafe.Sizeof(info))
	info.groupId = C.ulong(grpid)

	status := C.dcgmProfGetSupportedMetricGroups(handle.handle, &info)
	err = errorString(status)
	if err != nil {
		return groups, fmt.Errorf("Error getting supported metrics: %s", err)
	}

	// Only the first numMetricGroups entries of the array are converted.
	for i, total := uint(0), uint(info.numMetricGroups); i < total; i++ {
		src := &info.metricGroups[i]
		dst := MetricGroup{
			major: uint(src.majorId),
			minor: uint(src.minorId),
		}

		// Likewise, only the first numFieldIds field IDs are copied.
		for j, nFields := uint(0), uint(src.numFieldIds); j < nFields; j++ {
			dst.fieldIds = append(dst.fieldIds, uint(src.fieldIds[j]))
		}

		groups = append(groups, dst)
	}

	return groups, nil
}
12 changes: 6 additions & 6 deletions dcgm-exporter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,23 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.1.0"
app.kubernetes.io/version: "2.1.1"
spec:
updateStrategy:
type: RollingUpdate
selector:
matchLabels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.1.0"
app.kubernetes.io/version: "2.1.1"
template:
metadata:
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.1.0"
app.kubernetes.io/version: "2.1.1"
name: "dcgm-exporter"
spec:
containers:
- image: "nvidia/dcgm-exporter:2.0.13-2.1.0-ubuntu18.04"
- image: "nvidia/dcgm-exporter:2.0.13-2.1.1-ubuntu18.04"
env:
- name: "DCGM_EXPORTER_LISTEN"
value: ":9400"
Expand Down Expand Up @@ -64,11 +64,11 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.1.0"
app.kubernetes.io/version: "2.1.1"
spec:
selector:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.1.0"
app.kubernetes.io/version: "2.1.1"
ports:
- name: "metrics"
port: 9400
4 changes: 2 additions & 2 deletions deployment/dcgm-exporter/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
apiVersion: v2
name: dcgm-exporter
description: A Helm chart for DCGM exporter
version: "2.1.0"
version: "2.1.1"
kubeVersion: ">= 1.13.0-0"
appVersion: "2.1.0"
appVersion: "2.1.2"
sources:
- https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools
home: https://github.com/nvidia/gpu-monitoring-tools/
Expand Down
2 changes: 1 addition & 1 deletion deployment/dcgm-exporter/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ image:
pullPolicy: IfNotPresent
# Image tag defaults to AppVersion, but you can use the tag key
# for the image tag, e.g:
tag: 2.0.13-2.1.0-ubuntu18.04
tag: 2.0.13-2.1.1-ubuntu18.04

# Comment the following line to stop profiling metrics from DCGM
arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"]
Expand Down
9 changes: 9 additions & 0 deletions pkg/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,14 @@ restart:
}
logrus.Info("DCGM successfully initialized!")

_, err = dcgm.GetSupportedMetricGroups(0)
if err != nil {
config.CollectDCP = false
logrus.Info("Not collecting DCP metrics: ", err)
} else {
logrus.Info("Collecting DCP Metrics")
}

ch := make(chan string, 10)
pipeline, cleanup, err := NewMetricsPipeline(config)
defer cleanup()
Expand Down Expand Up @@ -153,5 +161,6 @@ func contextToConfig(c *cli.Context) *Config {
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
KubernetesGPUIdType: KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
CollectDCP: true,
}
}
11 changes: 8 additions & 3 deletions pkg/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ import (
"github.com/sirupsen/logrus"
)

func ExtractCounters(filename string) ([]Counter, error) {
func ExtractCounters(filename string, dcpAllowed bool) ([]Counter, error) {
records, err := ReadCSVFile(filename)
if err != nil {
fmt.Printf("Error: %v\n", err)
return nil, err
}

counters, err := extractCounters(records)
counters, err := extractCounters(records, dcpAllowed)
if err != nil {
return nil, err
}
Expand All @@ -55,7 +55,7 @@ func ReadCSVFile(filename string) ([][]string, error) {
return records, err
}

func extractCounters(records [][]string) ([]Counter, error) {
func extractCounters(records [][]string, dcpAllowed bool) ([]Counter, error) {
f := make([]Counter, 0, len(records))

for i, record := range records {
Expand All @@ -81,6 +81,11 @@ func extractCounters(records [][]string) ([]Counter, error) {
return nil, fmt.Errorf("Could not find DCGM field %s", record[0])
}

if !dcpAllowed && fieldID >= 1000 {
logrus.Warnf("Skipping line %d ('%s'): DCP metrics not enabled", i, record[0])
continue
}

if _, ok := promMetricType[record[1]]; !ok {
return nil, fmt.Errorf("Could not find Prometheus metry type %s", record[1])
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import (
)

func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) {
counters, err := ExtractCounters(c.CollectorsFile)
counters, err := ExtractCounters(c.CollectorsFile, c.CollectDCP)
if err != nil {
return nil, func() {}, err
}
Expand Down
1 change: 1 addition & 0 deletions pkg/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type Config struct {
CollectInterval int
Kubernetes bool
KubernetesGPUIdType KubernetesGPUIDType
CollectDCP bool
}

type Transform interface {
Expand Down
4 changes: 2 additions & 2 deletions service-monitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.1.0"
app.kubernetes.io/version: "2.1.1"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.1.0"
app.kubernetes.io/version: "2.1.1"
endpoints:
- port: "metrics"
path: "/metrics"

0 comments on commit acc56bd

Please sign in to comment.