Skip to content

Commit

Permalink
Merge branch 'support-setting-kubernetes-resource-identifier' into 'master'
Browse files Browse the repository at this point in the history

Support different identifiers for metrics

See merge request nvidia/container-toolkit/gpu-monitoring-tools!30
  • Loading branch information
nvjmayo committed Sep 4, 2020
2 parents 1bbc5cd + a571380 commit 9732363
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 20 deletions.
5 changes: 3 additions & 2 deletions pkg/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,9 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device) []Metric
Name: c[i].FieldName,
Value: v,

GPU: fmt.Sprintf("%d", d.GPU),
GPUUUID: d.UUID,
GPU: fmt.Sprintf("%d", d.GPU),
GPUUUID: d.UUID,
GPUDevice: fmt.Sprintf("nvidia%d", d.GPU),

Attributes: map[string]string{},
}
Expand Down
10 changes: 7 additions & 3 deletions pkg/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,13 @@ func (p *PodMapper) Process(metrics [][]Metric) error {
// and not the copy, we need to use the indexes
for i, device := range metrics {
for j, val := range device {
metrics[i][j].Attributes[podAttribute] = deviceToPod[val.GPUUUID].Name
metrics[i][j].Attributes[namespaceAttribute] = deviceToPod[val.GPUUUID].Namespace
metrics[i][j].Attributes[containerAttribute] = deviceToPod[val.GPUUUID].Container
GPUID, err := val.getIDOfType(p.Config.KubernetesGPUIdType)
if err != nil {
return err
}
metrics[i][j].Attributes[podAttribute] = deviceToPod[GPUID].Name
metrics[i][j].Attributes[namespaceAttribute] = deviceToPod[GPUID].Namespace
metrics[i][j].Attributes[containerAttribute] = deviceToPod[GPUID].Container
}
}

Expand Down
25 changes: 17 additions & 8 deletions pkg/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package main

import (
"fmt"
"os"
"sync"
"syscall"
Expand All @@ -30,10 +31,11 @@ import (
// Build metadata and the names of the command-line flags.
// NOTE(review): four of the flag-name entries below appear twice, verbatim.
// This looks like the old and new sides of a rendered diff shown together;
// confirm against the real file before treating this listing as compilable.
var (
// BuildVersion holds a placeholder string; judging by that string it is
// presumably overwritten at build time — TODO confirm how.
BuildVersion = "Filled by the build system"

// Flag names. contextToConfig reads the parsed values back using these.
CLIFieldsFile = "collectors"
CLIAddress = "address"
CLICollectInterval = "collect-interval"
CLIKubernetes = "kubernetes"
CLIFieldsFile = "collectors"
CLIAddress = "address"
CLICollectInterval = "collect-interval"
CLIKubernetes = "kubernetes"
// CLIKubernetesGPUIDType names the string flag that chooses which kind of
// GPU ID is used to map kubernetes resources to pods.
CLIKubernetesGPUIDType = "kubernetes-gpu-id-type"
)

func main() {
Expand Down Expand Up @@ -71,6 +73,12 @@ func main() {
Usage: "Enable kubernetes mapping metrics to kubernetes pods",
EnvVars: []string{"DCGM_EXPORTER_KUBERNETES"},
},
&cli.StringFlag{
Name: CLIKubernetesGPUIDType,
Value: string(GPUUID),
Usage: fmt.Sprintf("Choose Type of GPU ID to use to map kubernetes resources to pods. Possible values: '%s', '%s'", GPUUID, DeviceName),
EnvVars: []string{"DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE"},
},
}

c.Action = func(c *cli.Context) error {
Expand Down Expand Up @@ -140,9 +148,10 @@ restart:

// contextToConfig builds a new Config from the parsed command-line context,
// looking up each value by its flag name.
// NOTE(review): four keys appear twice in the literal below, verbatim. This
// looks like the old and new sides of a rendered diff shown together; confirm
// against the real file.
func contextToConfig(c *cli.Context) *Config {
return &Config{
CollectorsFile: c.String(CLIFieldsFile),
Address: c.String(CLIAddress),
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
CollectorsFile: c.String(CLIFieldsFile),
Address: c.String(CLIAddress),
CollectInterval: c.Int(CLICollectInterval),
Kubernetes: c.Bool(CLIKubernetes),
// The raw flag string is converted without validation here; an unsupported
// value only surfaces later as an error from Metric.getIDOfType.
KubernetesGPUIdType: KubernetesGPUIDType(c.String(CLIKubernetesGPUIDType)),
}
}
2 changes: 1 addition & 1 deletion pkg/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ func FormatCounters(c []Counter) (string, error) {

var metricsFormat = `
{{ range $dev := . }}{{ range $val := $dev }}
{{ $val.Name }}{gpu="{{ $val.GPU }}", UUID="{{ $val.GPUUUID }}"
{{ $val.Name }}{gpu="{{ $val.GPU }}", UUID="{{ $val.GPUUUID }}", device="{{ $val.GPUDevice }}"
{{- range $k, $v := $val.Attributes -}}
,{{ $k }}="{{ $v }}"
Expand Down
32 changes: 26 additions & 6 deletions pkg/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package main

import (
"fmt"
"net/http"
"sync"
"text/template"
Expand All @@ -36,11 +37,19 @@ var (
containerAttribute = "container"
)

// KubernetesGPUIDType names the kind of GPU identifier used as the lookup key
// when a metric is matched to a kubernetes pod.
type KubernetesGPUIDType string

// Identifier kinds understood by Metric.getIDOfType.
const (
	// GPUUID selects the metric's GPUUUID field.
	GPUUID = KubernetesGPUIDType("uid")
	// DeviceName selects the metric's GPUDevice field.
	DeviceName = KubernetesGPUIDType("device-name")
)

// Config carries the exporter settings; contextToConfig fills it from the
// parsed command-line flags.
// NOTE(review): four fields appear twice below, verbatim. This looks like the
// old and new sides of a rendered diff shown together; confirm against the
// real file before treating this listing as compilable.
type Config struct {
CollectorsFile string
Address string
CollectInterval int
Kubernetes bool
CollectorsFile string
Address string
CollectInterval int
Kubernetes bool
// KubernetesGPUIdType chooses which GPU identifier PodMapper.Process uses
// as the key when it looks up the pod for a metric.
KubernetesGPUIdType KubernetesGPUIDType
}

type Transform interface {
Expand Down Expand Up @@ -75,12 +84,23 @@ type Metric struct {
Name string
Value string

GPU string
GPUUUID string
GPU string
GPUUUID string
GPUDevice string

Attributes map[string]string
}

// getIDOfType returns this metric's GPU identifier in the form selected by
// idType: the GPUUUID field for GPUUID, the GPUDevice field for DeviceName.
//
// Any other idType yields an empty string and a non-nil error. The offending
// value is quoted in the message so that an empty or whitespace-only setting
// (the value comes from a command-line flag or environment variable) is still
// visible, and the accepted values are listed alongside it.
func (m Metric) getIDOfType(idType KubernetesGPUIDType) (string, error) {
	switch idType {
	case GPUUID:
		return m.GPUUUID, nil
	case DeviceName:
		return m.GPUDevice, nil
	default:
		return "", fmt.Errorf("unsupported KubernetesGPUIDType for MetricID %q (supported: %q, %q)",
			string(idType), string(GPUUID), string(DeviceName))
	}
}

var promMetricType = map[string]bool{
"gauge": true,
"counter": true,
Expand Down

0 comments on commit 9732363

Please sign in to comment.