diff --git a/Makefile b/Makefile index 9b2dc88..0bf1f8b 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ REGISTRY ?= nvidia DCGM_VERSION := 2.0.13 GOLANG_VERSION := 1.14.2 -VERSION := 2.1.0 +VERSION := 2.1.1 FULL_VERSION := $(DCGM_VERSION)-$(VERSION) .PHONY: all binary install check-format diff --git a/README.md b/README.md index ed5ffd4..c0b6dc4 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ The repository also contains DCGM-Exporter. It exposes GPU metrics exporter for To gather metrics on a GPU node, simply start the `dcgm-exporter` container: ``` -$ docker run -d --gpus all --rm -p 9400:9400 nvidia/dcgm-exporter:2.0.13-2.1.0-ubuntu18.04 +$ docker run -d --gpus all --rm -p 9400:9400 nvidia/dcgm-exporter:2.0.13-2.1.1-ubuntu18.04 $ curl localhost:9400/metrics # HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). # TYPE DCGM_FI_DEV_SM_CLOCK gauge diff --git a/bindings/go/dcgm/api.go b/bindings/go/dcgm/api.go index a25d911..05a446d 100644 --- a/bindings/go/dcgm/api.go +++ b/bindings/go/dcgm/api.go @@ -101,3 +101,8 @@ func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error) func Introspect() (DcgmStatus, error) { return introspect() } + +// GetSupportedMetricGroups returns all of the profiling metric groups for a given GPU group. 
+func GetSupportedMetricGroups(grpid uint) ([]MetricGroup, error) { + return getSupportedMetricGroups(grpid) +} diff --git a/bindings/go/dcgm/profile.go b/bindings/go/dcgm/profile.go new file mode 100644 index 0000000..25ca752 --- /dev/null +++ b/bindings/go/dcgm/profile.go @@ -0,0 +1,47 @@ +package dcgm + +/* +#include "dcgm_agent.h" +#include "dcgm_structs.h" +*/ +import "C" +import ( + "fmt" + "unsafe" +) + +type MetricGroup struct { + major uint + minor uint + fieldIds []uint +} + +func getSupportedMetricGroups(grpid uint) (groups []MetricGroup, err error) { + + var groupInfo C.dcgmProfGetMetricGroups_t + groupInfo.version = makeVersion2(unsafe.Sizeof(groupInfo)) + groupInfo.groupId = C.ulong(grpid) + + result := C.dcgmProfGetSupportedMetricGroups(handle.handle, &groupInfo) + + if err = errorString(result); err != nil { + return groups, fmt.Errorf("Error getting supported metrics: %s", err) + } + + var count = uint(groupInfo.numMetricGroups) + + for i := uint(0); i < count; i++ { + var group MetricGroup + group.major = uint(groupInfo.metricGroups[i].majorId) + group.minor = uint(groupInfo.metricGroups[i].minorId) + + var fieldCount = uint(groupInfo.metricGroups[i].numFieldIds) + + for j := uint(0); j < fieldCount; j++ { + group.fieldIds = append(group.fieldIds, uint(groupInfo.metricGroups[i].fieldIds[j])) + } + groups = append(groups, group) + } + + return groups, nil +} diff --git a/dcgm-exporter.yaml b/dcgm-exporter.yaml index 206a62e..8dc9f1c 100644 --- a/dcgm-exporter.yaml +++ b/dcgm-exporter.yaml @@ -18,23 +18,23 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/version: "2.1.1" spec: updateStrategy: type: RollingUpdate selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/version: "2.1.1" template: metadata: labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.1.0" + 
app.kubernetes.io/version: "2.1.1" name: "dcgm-exporter" spec: containers: - - image: "nvidia/dcgm-exporter:2.0.13-2.1.0-ubuntu18.04" + - image: "nvidia/dcgm-exporter:2.0.13-2.1.1-ubuntu18.04" env: - name: "DCGM_EXPORTER_LISTEN" value: ":9400" @@ -64,11 +64,11 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/version: "2.1.1" spec: selector: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/version: "2.1.1" ports: - name: "metrics" port: 9400 diff --git a/deployment/dcgm-exporter/Chart.yaml b/deployment/dcgm-exporter/Chart.yaml index d6d2409..79e50b2 100644 --- a/deployment/dcgm-exporter/Chart.yaml +++ b/deployment/dcgm-exporter/Chart.yaml @@ -1,9 +1,9 @@ apiVersion: v2 name: dcgm-exporter description: A Helm chart for DCGM exporter -version: "2.1.0" +version: "2.1.1" kubeVersion: ">= 1.13.0-0" -appVersion: "2.1.0" +appVersion: "2.1.1" sources: - https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools home: https://github.com/nvidia/gpu-monitoring-tools/ diff --git a/deployment/dcgm-exporter/values.yaml b/deployment/dcgm-exporter/values.yaml index a898ed9..fd23750 100644 --- a/deployment/dcgm-exporter/values.yaml +++ b/deployment/dcgm-exporter/values.yaml @@ -17,7 +17,7 @@ image: pullPolicy: IfNotPresent # Image tag defaults to AppVersion, but you can use the tag key # for the image tag, e.g: - tag: 2.0.13-2.1.0-ubuntu18.04 + tag: 2.0.13-2.1.1-ubuntu18.04 # Comment the following line to stop profiling metrics from DCGM arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"] diff --git a/pkg/main.go b/pkg/main.go index 525e744..a796406 100644 --- a/pkg/main.go +++ b/pkg/main.go @@ -103,6 +103,14 @@ restart: } logrus.Info("DCGM successfully initialized!") + _, err = dcgm.GetSupportedMetricGroups(0) + if err != nil { + config.CollectDCP = false + logrus.Info("Not collecting DCP metrics: ", err) + } else { + 
logrus.Info("Collecting DCP Metrics") + } + ch := make(chan string, 10) pipeline, cleanup, err := NewMetricsPipeline(config) defer cleanup() @@ -153,5 +161,6 @@ func contextToConfig(c *cli.Context) *Config { CollectInterval: c.Int(CLICollectInterval), Kubernetes: c.Bool(CLIKubernetes), KubernetesGPUIdType: KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)), + CollectDCP: true, } } diff --git a/pkg/parser.go b/pkg/parser.go index 6f251d9..d0f2f31 100644 --- a/pkg/parser.go +++ b/pkg/parser.go @@ -26,14 +26,14 @@ import ( "github.com/sirupsen/logrus" ) -func ExtractCounters(filename string) ([]Counter, error) { +func ExtractCounters(filename string, dcpAllowed bool) ([]Counter, error) { records, err := ReadCSVFile(filename) if err != nil { fmt.Printf("Error: %v\n", err) return nil, err } - counters, err := extractCounters(records) + counters, err := extractCounters(records, dcpAllowed) if err != nil { return nil, err } @@ -55,7 +55,7 @@ func ReadCSVFile(filename string) ([][]string, error) { return records, err } -func extractCounters(records [][]string) ([]Counter, error) { +func extractCounters(records [][]string, dcpAllowed bool) ([]Counter, error) { f := make([]Counter, 0, len(records)) for i, record := range records { @@ -81,6 +81,11 @@ func extractCounters(records [][]string) ([]Counter, error) { return nil, fmt.Errorf("Could not find DCGM field %s", record[0]) } + if !dcpAllowed && fieldID >= 1000 { + logrus.Warnf("Skipping line %d ('%s'): DCP metrics not enabled", i, record[0]) + continue + } + if _, ok := promMetricType[record[1]]; !ok { return nil, fmt.Errorf("Could not find Prometheus metry type %s", record[1]) } diff --git a/pkg/pipeline.go b/pkg/pipeline.go index 415d00f..a9a6788 100644 --- a/pkg/pipeline.go +++ b/pkg/pipeline.go @@ -27,7 +27,7 @@ import ( ) func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) { - counters, err := ExtractCounters(c.CollectorsFile) + counters, err := ExtractCounters(c.CollectorsFile, c.CollectDCP) if 
err != nil { return nil, func() {}, err } diff --git a/pkg/types.go b/pkg/types.go index b8a11ec..c849f0b 100644 --- a/pkg/types.go +++ b/pkg/types.go @@ -50,6 +50,7 @@ type Config struct { CollectInterval int Kubernetes bool KubernetesGPUIdType KubernetesGPUIDType + CollectDCP bool } type Transform interface { diff --git a/service-monitor.yaml b/service-monitor.yaml index 9bb52bf..0e881ac 100644 --- a/service-monitor.yaml +++ b/service-monitor.yaml @@ -18,12 +18,12 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/version: "2.1.1" spec: selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "2.1.0" + app.kubernetes.io/version: "2.1.1" endpoints: - port: "metrics" path: "/metrics"