Skip to content

Commit

Permalink
DCGM-1972 - enable dcgm-exporter check for DCP metric compatibility
Browse files Browse the repository at this point in the history
  • Loading branch information
glowkey committed Nov 16, 2020
1 parent d08ea3c commit 5bb0241
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 4 deletions.
5 changes: 5 additions & 0 deletions bindings/go/dcgm/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,8 @@ func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error)
func Introspect() (DcgmStatus, error) {
	// Exported entry point; all of the work is done by the package-private
	// introspect, whose results are passed through untouched.
	status, err := introspect()
	return status, err
}

// GetSupportedMetricGroups returns all of the profiling metric groups for the
// GPU group identified by grpid. It forwards to the package-private
// getSupportedMetricGroups and passes its results through unchanged.
func GetSupportedMetricGroups(grpid uint) ([]MetricGroup, error) {
	groups, err := getSupportedMetricGroups(grpid)
	return groups, err
}
47 changes: 47 additions & 0 deletions bindings/go/dcgm/profile.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package dcgm

/*
#include "dcgm_agent.h"
#include "dcgm_structs.h"
*/
import "C"
import (
"fmt"
"unsafe"
)

// MetricGroup describes a single profiling metric group.
type MetricGroup struct {
	// major and minor hold the two-part identifier of the group.
	major, minor uint
	// fieldIds lists the IDs of the fields that belong to the group.
	fieldIds []uint
}

// getSupportedMetricGroups calls dcgmProfGetSupportedMetricGroups for the GPU
// group grpid and converts the returned C structure into a slice of
// MetricGroup values.
//
// On a DCGM error the returned slice is nil and the error is prefixed with
// "Error getting supported metrics". When DCGM reports zero groups the
// returned slice is nil as well, with a nil error.
func getSupportedMetricGroups(grpid uint) (groups []MetricGroup, err error) {
	// The request struct carries its own version tag, derived from its size.
	var info C.dcgmProfGetMetricGroups_t
	info.version = makeVersion2(unsafe.Sizeof(info))
	info.groupId = C.ulong(grpid)

	if err = errorString(C.dcgmProfGetSupportedMetricGroups(handle.handle, &info)); err != nil {
		return groups, fmt.Errorf("Error getting supported metrics: %s", err)
	}

	// Copy each reported group, and each of its field IDs, out of the
	// C-owned layout into plain Go values.
	total := uint(info.numMetricGroups)
	for g := uint(0); g < total; g++ {
		src := &info.metricGroups[g]
		entry := MetricGroup{
			major: uint(src.majorId),
			minor: uint(src.minorId),
		}
		for f, n := uint(0), uint(src.numFieldIds); f < n; f++ {
			entry.fieldIds = append(entry.fieldIds, uint(src.fieldIds[f]))
		}
		groups = append(groups, entry)
	}

	return groups, nil
}
9 changes: 9 additions & 0 deletions pkg/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,14 @@ restart:
}
logrus.Info("DCGM successfully initialized!")

_, err = dcgm.GetSupportedMetricGroups(0)
if err != nil {
config.CollectDCP = false
logrus.Info("Not collecting DCP metrics: ", err)
} else {
logrus.Info("Collecting DCP Metrics")
}

ch := make(chan string, 10)
pipeline, cleanup, err := NewMetricsPipeline(config)
defer cleanup()
Expand Down Expand Up @@ -153,5 +161,6 @@ func contextToConfig(c *cli.Context) *Config {
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
KubernetesGPUIdType: KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
CollectDCP: true,
}
}
11 changes: 8 additions & 3 deletions pkg/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ import (
"github.com/sirupsen/logrus"
)

func ExtractCounters(filename string) ([]Counter, error) {
func ExtractCounters(filename string, dcpAllowed bool) ([]Counter, error) {
records, err := ReadCSVFile(filename)
if err != nil {
fmt.Printf("Error: %v\n", err)
return nil, err
}

counters, err := extractCounters(records)
counters, err := extractCounters(records, dcpAllowed)
if err != nil {
return nil, err
}
Expand All @@ -55,7 +55,7 @@ func ReadCSVFile(filename string) ([][]string, error) {
return records, err
}

func extractCounters(records [][]string) ([]Counter, error) {
func extractCounters(records [][]string, dcpAllowed bool) ([]Counter, error) {
f := make([]Counter, 0, len(records))

for i, record := range records {
Expand All @@ -81,6 +81,11 @@ func extractCounters(records [][]string) ([]Counter, error) {
return nil, fmt.Errorf("Could not find DCGM field %s", record[0])
}

if !dcpAllowed && fieldID >= 1000 {
logrus.Warnf("Skipping line %d ('%s'): DCP metrics not enabled", i, record[0])
continue
}

if _, ok := promMetricType[record[1]]; !ok {
return nil, fmt.Errorf("Could not find Prometheus metry type %s", record[1])
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import (
)

func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) {
counters, err := ExtractCounters(c.CollectorsFile)
counters, err := ExtractCounters(c.CollectorsFile, c.CollectDCP)
if err != nil {
return nil, func() {}, err
}
Expand Down
1 change: 1 addition & 0 deletions pkg/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type Config struct {
CollectInterval int
Kubernetes bool
KubernetesGPUIdType KubernetesGPUIDType
CollectDCP bool
}

type Transform interface {
Expand Down

0 comments on commit 5bb0241

Please sign in to comment.