diff --git a/bindings/go/dcgm/const.go b/bindings/go/dcgm/const.go index 854c188..b086808 100644 --- a/bindings/go/dcgm/const.go +++ b/bindings/go/dcgm/const.go @@ -14,11 +14,27 @@ type FieldValue_v1 struct { } const ( - DCGM_FT_BINARY = uint('b') - DCGM_FT_DOUBLE = uint('d') - DCGM_FT_INT64 = uint('i') - DCGM_FT_STRING = uint('s') - DCGM_FT_TIMESTAMP = uint('t') + DCGM_FT_BINARY = uint('b') + DCGM_FT_DOUBLE = uint('d') + DCGM_FT_INT64 = uint('i') + DCGM_FT_STRING = uint('s') + DCGM_FT_TIMESTAMP = uint('t') + DCGM_FT_INT32_BLANK = int64(2147483632) + DCGM_FT_INT32_NOT_FOUND = int64(DCGM_FT_INT32_BLANK + 1) + DCGM_FT_INT32_NOT_SUPPORTED = int64(DCGM_FT_INT32_BLANK + 2) + DCGM_FT_INT32_NOT_PERMISSIONED = int64(DCGM_FT_INT32_BLANK + 3) + DCGM_FT_INT64_BLANK = int64(9223372036854775792) + DCGM_FT_INT64_NOT_FOUND = int64(DCGM_FT_INT64_BLANK + 1) + DCGM_FT_INT64_NOT_SUPPORTED = int64(DCGM_FT_INT64_BLANK + 2) + DCGM_FT_INT64_NOT_PERMISSIONED = int64(DCGM_FT_INT64_BLANK + 3) + DCGM_FT_FP64_BLANK = 140737488355328.0 + DCGM_FT_FP64_NOT_FOUND = float64(DCGM_FT_FP64_BLANK + 1.0) + DCGM_FT_FP64_NOT_SUPPORTED = float64(DCGM_FT_FP64_BLANK + 2.0) + DCGM_FT_FP64_NOT_PERMISSIONED = float64(DCGM_FT_FP64_BLANK + 3.0) + DCGM_FT_STR_BLANK = "<<<NULL>>>" + DCGM_FT_STR_NOT_FOUND = "<<<NOT_FOUND>>>" + DCGM_FT_STR_NOT_SUPPORTED = "<<<NOT_SUPPORTED>>>" + DCGM_FT_STR_NOT_PERMISSIONED = "<<<NOT_PERM>>>" DCGM_FI_UNKNOWN = 0 DCGM_FI_DRIVER_VERSION = 1 diff --git a/pkg/gpu_collector.go b/pkg/gpu_collector.go index 73dc00d..2ab6daa 100644 --- a/pkg/gpu_collector.go +++ b/pkg/gpu_collector.go @@ -69,18 +69,24 @@ func (c *DCGMCollector) GetMetrics() ([][]Metric, error) { } func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device) []Metric { - metrics := make([]Metric, len(values)) + var metrics []Metric for i, val := range values { - metrics[i] = Metric{ + v := ToString(val) + // Filter out counters with no value + if v == SkipDCGMValue { + continue + } + m := Metric{ Name: c[i].FieldName, - Value: ToString(val), + Value: v, 
GPU: fmt.Sprintf("%d", d.GPU), GPUUUID: d.UUID, Attributes: map[string]string{}, } + metrics = append(metrics, m) } return metrics @@ -88,6 +94,34 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device) []Metric } func ToString(value dcgm.FieldValue_v1) string { + switch v := value.Int64(); v { + case dcgm.DCGM_FT_INT32_BLANK: + return SkipDCGMValue + case dcgm.DCGM_FT_INT32_NOT_FOUND: + return SkipDCGMValue + case dcgm.DCGM_FT_INT32_NOT_SUPPORTED: + return SkipDCGMValue + case dcgm.DCGM_FT_INT32_NOT_PERMISSIONED: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_BLANK: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_NOT_FOUND: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_NOT_SUPPORTED: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_NOT_PERMISSIONED: + return SkipDCGMValue + } + switch v := value.Float64(); v { + case dcgm.DCGM_FT_FP64_BLANK: + return SkipDCGMValue + case dcgm.DCGM_FT_FP64_NOT_FOUND: + return SkipDCGMValue + case dcgm.DCGM_FT_FP64_NOT_SUPPORTED: + return SkipDCGMValue + case dcgm.DCGM_FT_FP64_NOT_PERMISSIONED: + return SkipDCGMValue + } switch v := value.FieldType; v { case dcgm.DCGM_FT_STRING: return value.String() diff --git a/pkg/types.go b/pkg/types.go index 24ac55a..825aee4 100644 --- a/pkg/types.go +++ b/pkg/types.go @@ -25,6 +25,7 @@ import ( ) var ( + SkipDCGMValue = "SKIPPING DCGM VALUE" FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING" nvidiaResourceName = "nvidia.com/gpu"