diff --git a/Makefile b/Makefile index c8d27aa..ec99e65 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ GOLANG_VERSION := 1.14.2 VERSION := 2.0.0-rc.7 FULL_VERSION := $(DCGM_VERSION)-$(VERSION) -.PHONY: all binary install +.PHONY: all binary install check-format all: ubuntu18.04 ubi8 binary: @@ -32,6 +32,9 @@ install: binary install -m 557 -D ./etc/dcgm-exporter/default-counters.csv /etc/dcgm-exporter/default-counters.csv install -m 557 -D ./etc/dcgm-exporter/dcp-metrics-included.csv /etc/dcgm-exporter/dcp-metrics-included.csv +check-format: + test $$(gofmt -l pkg bindings | tee /dev/stderr | wc -l) -eq 0 + push: $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi8" diff --git a/bindings/go/dcgm/device_info.go b/bindings/go/dcgm/device_info.go index c9c7349..8c61a55 100644 --- a/bindings/go/dcgm/device_info.go +++ b/bindings/go/dcgm/device_info.go @@ -13,8 +13,8 @@ import ( type PCIInfo struct { BusID string - BAR1 uint // MB - FBTotal uint // MB + BAR1 uint // MB + FBTotal uint // MB Bandwidth int64 // MB/s } diff --git a/bindings/go/dcgm/device_status.go b/bindings/go/dcgm/device_status.go index abec47b..4d37de1 100644 --- a/bindings/go/dcgm/device_status.go +++ b/bindings/go/dcgm/device_status.go @@ -61,7 +61,7 @@ type PCIStatusInfo struct { type DeviceStatus struct { Power float64 // W - Temperature int64 // °C + Temperature int64 // °C Utilization UtilizationInfo Memory MemoryInfo Clocks ClockInfo @@ -111,7 +111,6 @@ func latestValuesForDevice(gpuId uint) (status DeviceStatus, err error) { deviceFields[pstate] = C.DCGM_FI_DEV_PSTATE deviceFields[fanSpeed] = C.DCGM_FI_DEV_FAN_SPEED - fieldsName := fmt.Sprintf("devStatusFields%d", rand.Uint64()) fieldsId, err := FieldGroupCreate(fieldsName, deviceFields) if err != nil { diff --git a/bindings/go/dcgm/fields.go b/bindings/go/dcgm/fields.go index 7d39df5..bbffff7 100644 --- a/bindings/go/dcgm/fields.go +++ b/bindings/go/dcgm/fields.go @@ -101,12 +101,12 @@ func toFieldValue(cfields []C.dcgmFieldValue_v1) []FieldValue_v1 { fields := make([]FieldValue_v1, len(cfields)) for i, f := range cfields { fields[i] = FieldValue_v1{ - Version: uint(f.version), - FieldId: uint(f.fieldId), + Version: uint(f.version), + FieldId: uint(f.fieldId), FieldType: uint(f.fieldType), - Status: int(f.status), - Ts: int64(f.ts), - Value: f.value, + Status: int(f.status), + Ts: int64(f.ts), + Value: f.value, } } @@ -125,6 +125,6 @@ func (fv FieldValue_v1) String() string { return *(*string)(unsafe.Pointer(&fv.Value[0])) } -func (fv FieldValue_v1) Blob() [4096]byte{ +func (fv FieldValue_v1) Blob() [4096]byte { return fv.Value } diff --git a/docker/Dockerfile.ubi8 b/docker/Dockerfile.ubi8 index 0083fd9..37e9c0d 100644 --- a/docker/Dockerfile.ubi8 +++ b/docker/Dockerfile.ubi8 @@ -4,7 +4,7 @@ WORKDIR /go/src/github.com/NVIDIA/gpu-monitoring-tools COPY . . -RUN make binary +RUN make binary check-format FROM registry.access.redhat.com/ubi8:latest LABEL io.k8s.display-name="NVIDIA DCGM Exporter" diff --git a/docker/Dockerfile.ubuntu18.04 b/docker/Dockerfile.ubuntu18.04 index fa87f3f..0eb50a7 100644 --- a/docker/Dockerfile.ubuntu18.04 +++ b/docker/Dockerfile.ubuntu18.04 @@ -4,7 +4,7 @@ WORKDIR /go/src/github.com/NVIDIA/gpu-monitoring-tools COPY . . -RUN make binary +RUN make binary check-format FROM ubuntu:18.04 LABEL io.k8s.display-name="NVIDIA DCGM Exporter" diff --git a/pkg/dcgm.go b/pkg/dcgm.go index 5f78e65..cc1b643 100644 --- a/pkg/dcgm.go +++ b/pkg/dcgm.go @@ -26,7 +26,7 @@ import ( func NewGroup() (dcgm.GroupHandle, func(), error) { group, err := dcgm.NewDefaultGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) if err != nil { - return dcgm.GroupHandle{}, func(){}, err + return dcgm.GroupHandle{}, func() {}, err } return group, func() { dcgm.DestroyGroup(group) }, nil @@ -45,7 +45,7 @@ func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error) name := fmt.Sprintf("gpu-collector-fieldgroup-%d", rand.Uint64()) fieldGroup, err := dcgm.FieldGroupCreate(name, deviceFields) if err != nil { - return dcgm.FieldHandle{}, func(){}, err + return dcgm.FieldHandle{}, func() {}, err } return fieldGroup, func() { dcgm.FieldGroupDestroy(fieldGroup) }, nil @@ -95,4 +95,3 @@ fail: return nil, err } - diff --git a/pkg/gpu_collector.go b/pkg/gpu_collector.go index 28c512d..73dc00d 100644 --- a/pkg/gpu_collector.go +++ b/pkg/gpu_collector.go @@ -23,13 +23,13 @@ import ( func NewDCGMCollector(c []Counter) (*DCGMCollector, func(), error) { collector := &DCGMCollector{ - Counters: c, + Counters: c, DeviceFields: NewDeviceFields(c), } cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields) if err != nil { - return nil, func(){}, err + return nil, func() {}, err } collector.Cleanups = cleanups @@ -73,10 +73,10 @@ func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device) []Metric for i, val := range values { metrics[i] = Metric{ - Name: c[i].FieldName, + Name: c[i].FieldName, Value: ToString(val), - GPU: fmt.Sprintf("%d", d.GPU), + GPU: fmt.Sprintf("%d", d.GPU), GPUUUID: d.UUID, Attributes: map[string]string{}, @@ -101,4 +101,3 @@ func ToString(value dcgm.FieldValue_v1) string { return FailedToConvert } - diff --git a/pkg/gpu_collector_test.go b/pkg/gpu_collector_test.go index 242ef5c..a3213de 100644 --- a/pkg/gpu_collector_test.go +++ b/pkg/gpu_collector_test.go @@ -29,10 +29,9 @@ var sampleCounters = []Counter{ {dcgm.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION", "gauge", "Energy help info"}, {dcgm.DCGM_FI_DEV_POWER_USAGE, "DCGM_FI_DEV_POWER_USAGE", "gauge", "Power help info"}, } - func TestDCGMCollector(t *testing.T) { - cleanup, err := dcgm.Init(dcgm.Embedded); + cleanup, err := dcgm.Init(dcgm.Embedded) require.NoError(t, err) defer cleanup() diff --git a/pkg/kubernetes.go b/pkg/kubernetes.go index 13cd67b..ae4e3bc 100644 --- a/pkg/kubernetes.go +++ b/pkg/kubernetes.go @@ -20,11 +20,11 @@ import ( "context" "fmt" "net" - "time" "os" + "time" - "google.golang.org/grpc" "github.com/sirupsen/logrus" + "google.golang.org/grpc" podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1" ) @@ -32,7 +32,7 @@ var ( socketDir = "/var/lib/kubelet/pod-resources" socketPath = socketDir + "/kubelet.sock" - connectionTimeout = 10 * time.Second + connectionTimeout = 10 * time.Second ) func NewPodMapper(c *Config) *PodMapper { @@ -52,7 +52,6 @@ func (p *PodMapper) Process(metrics [][]Metric) error { return nil } - // TODO: This needs to be moved out of the critical path. c, cleanup, err := connectToServer(socketPath) if err != nil { diff --git a/pkg/kubernetes_test.go b/pkg/kubernetes_test.go index 20daf99..154ffbf 100644 --- a/pkg/kubernetes_test.go +++ b/pkg/kubernetes_test.go @@ -17,19 +17,18 @@ package main import ( - "fmt" - "testing" "context" - "time" + "fmt" "io/ioutil" "os" + "testing" + "time" + "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" "github.com/stretchr/testify/require" + "google.golang.org/grpc" podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" "k8s.io/kubernetes/pkg/kubelet/util" - "google.golang.org/grpc" - ) var tmpDir string @@ -38,7 +37,7 @@ func TestProcessPodMapper(t *testing.T) { cleanup := CreateTmpDir(t) defer cleanup() - cleanup, err := dcgm.Init(dcgm.Embedded); + cleanup, err := dcgm.Init(dcgm.Embedded) require.NoError(t, err) defer cleanup() @@ -77,9 +76,9 @@ func TestProcessPodMapper(t *testing.T) { } } -func GetGPUUUIDs(metrics [][]Metric) []string{ +func GetGPUUUIDs(metrics [][]Metric) []string { gpus := make([]string, len(metrics)) - for i, dev := range metrics{ + for i, dev := range metrics { gpus[i] = dev[0].GPUUUID } @@ -100,10 +99,10 @@ func StartMockServer(t *testing.T, server *grpc.Server, socket string) func() { return func() { server.Stop() select { - case <-stopped: - return - case <-time.After(1 * time.Second): - t.Fatal("Failed waiting for gRPC server to stop") + case <-stopped: + return + case <-time.After(1 * time.Second): + t.Fatal("Failed waiting for gRPC server to stop") } } } @@ -135,15 +134,15 @@ func (s *PodResourcesMockServer) List(ctx context.Context, req *podresourcesapi. for i, gpu := range s.gpus { podResources[i] = &podresourcesapi.PodResources{ - Name: fmt.Sprintf("gpu-pod-%d", i), - Namespace: "default", + Name: fmt.Sprintf("gpu-pod-%d", i), + Namespace: "default", Containers: []*podresourcesapi.ContainerResources{ &podresourcesapi.ContainerResources{ - Name: "default", + Name: "default", Devices: []*podresourcesapi.ContainerDevices{ &podresourcesapi.ContainerDevices{ ResourceName: nvidiaResourceName, - DeviceIds: []string{gpu}, + DeviceIds: []string{gpu}, }, }, }, diff --git a/pkg/main.go b/pkg/main.go index c0d2bfc..274d6a2 100644 --- a/pkg/main.go +++ b/pkg/main.go @@ -18,22 +18,22 @@ package main import ( "os" - "syscall" "sync" + "syscall" "time" + "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" "github.com/sirupsen/logrus" "github.com/urfave/cli/v2" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" ) var ( BuildVersion = "Filled by the build system" - CLIFieldsFile = "collectors" - CLIPort = "port" + CLIFieldsFile = "collectors" + CLIPort = "port" CLICollectInterval = "collect-interval" - CLIKubernetes = "kubernetes" + CLIKubernetes = "kubernetes" ) func main() { @@ -83,7 +83,7 @@ func main() { } func Run(c *cli.Context) error { - restart: +restart: logrus.Info("Starting dcgm-exporter") config := contextToConfig(c) @@ -122,7 +122,7 @@ func Run(c *cli.Context) error { select { case sig := <-sigs: close(stop) - err := WaitWithTimeout(&wg, time.Second * 2) + err := WaitWithTimeout(&wg, time.Second*2) if err != nil { logrus.Fatal(err) } @@ -139,10 +139,10 @@ func Run(c *cli.Context) error { } func contextToConfig(c *cli.Context) *Config { - return &Config { - CollectorsFile: c.String(CLIFieldsFile), - Port: c.Int(CLIPort), + return &Config{ + CollectorsFile: c.String(CLIFieldsFile), + Port: c.Int(CLIPort), CollectInterval: c.Int(CLICollectInterval), - Kubernetes: c.Bool(CLIKubernetes), + Kubernetes: c.Bool(CLIKubernetes), } } diff --git a/pkg/parser.go b/pkg/parser.go index 547114b..6f251d9 100644 --- a/pkg/parser.go +++ b/pkg/parser.go @@ -17,16 +17,15 @@ package main import ( + "encoding/csv" "fmt" "os" "strings" - "encoding/csv" - "github.com/sirupsen/logrus" "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" + "github.com/sirupsen/logrus" ) - func ExtractCounters(filename string) ([]Counter, error) { records, err := ReadCSVFile(filename) if err != nil { diff --git a/pkg/pipeline.go b/pkg/pipeline.go index bce5a7a..dc9d663 100644 --- a/pkg/pipeline.go +++ b/pkg/pipeline.go @@ -17,11 +17,11 @@ package main import ( - "fmt" "bytes" - "time" - "text/template" + "fmt" "sync" + "text/template" + "time" "github.com/sirupsen/logrus" ) @@ -50,16 +50,16 @@ func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) { } return &MetricsPipeline{ - config: c, + config: c, - metricsFormat: template.Must(template.New("metrics").Parse(metricsFormat)), - countersText: countersText, + metricsFormat: template.Must(template.New("metrics").Parse(metricsFormat)), + countersText: countersText, - gpuCollector: gpuCollector, - transformations: transformations, - }, func() { - cleanup() - }, nil + gpuCollector: gpuCollector, + transformations: transformations, + }, func() { + cleanup() + }, nil } // Primarely for testing, caller expected to cleanup the collector @@ -73,7 +73,7 @@ func NewMetricsPipelineWithGPUCollector(c *Config, collector *DCGMCollector) (*M config: c, metricsFormat: template.Must(template.New("metrics").Parse(metricsFormat)), - countersText: countersText, + countersText: countersText, gpuCollector: collector, }, func() {}, nil @@ -91,7 +91,7 @@ func (m *MetricsPipeline) Run(out chan string, stop chan interface{}, wg *sync.W defer t.Stop() for { - select{ + select { case <-stop: return case <-t.C: @@ -123,7 +123,7 @@ func (m *MetricsPipeline) run() (string, error) { } } - formated, err := FormatMetrics(m.countersText, m.metricsFormat, metrics) + formated, err := FormatMetrics(m.countersText, m.metricsFormat, metrics) if err != nil { return "", fmt.Errorf("Failed to format metrics with error: %v", err) } @@ -145,7 +145,7 @@ func (m *MetricsPipeline) run() (string, error) { * The expectation is that the template will be given the following * values: {.Fields, .Devices, .Values[Device][Field]} * -*/ + */ var countersFormat = `{{- range $c := . -}} # HELP {{ $c.FieldName }} {{ $c.Help }} diff --git a/pkg/pipeline_test.go b/pkg/pipeline_test.go index cdeece5..2f77b7a 100644 --- a/pkg/pipeline_test.go +++ b/pkg/pipeline_test.go @@ -24,7 +24,7 @@ import ( ) func TestRun(t *testing.T) { - cleanup, err := dcgm.Init(dcgm.Embedded); + cleanup, err := dcgm.Init(dcgm.Embedded) require.NoError(t, err) defer cleanup() diff --git a/pkg/server.go b/pkg/server.go index cff9a36..4ce9354 100644 --- a/pkg/server.go +++ b/pkg/server.go @@ -17,11 +17,11 @@ package main import ( - "fmt" "context" - "time" + "fmt" "net/http" "sync" + "time" "github.com/gorilla/mux" "github.com/sirupsen/logrus" @@ -31,13 +31,13 @@ func NewMetricsServer(c *Config, metrics chan string) (*MetricsServer, func(), e router := mux.NewRouter() serverv1 := &MetricsServer{ server: http.Server{ - Addr: fmt.Sprintf(":%d", c.Port), - Handler: router, - ReadTimeout: 10 * time.Second, - WriteTimeout: 10 * time.Second, + Addr: fmt.Sprintf(":%d", c.Port), + Handler: router, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, }, metricsChan: metrics, - metrics: "", + metrics: "", } router.HandleFunc("/heath", serverv1.Health) @@ -77,7 +77,7 @@ func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) { logrus.Fatalf("Failed to shutdown HTTP server, with err: `%v`", err) } - if err := WaitWithTimeout(&httpwg, 3 * time.Second); err != nil { + if err := WaitWithTimeout(&httpwg, 3*time.Second); err != nil { logrus.Fatalf("Failed waiting for HTTP server to shutdown, with err: `%v`", err) } } diff --git a/pkg/types.go b/pkg/types.go index 9e918bf..d7a6d5b 100644 --- a/pkg/types.go +++ b/pkg/types.go @@ -17,8 +17,8 @@ package main import ( - "sync" "net/http" + "sync" "text/template" "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" @@ -30,7 +30,7 @@ var ( nvidiaResourceName = "nvidia.com/gpu" // Note standard resource attributes - podAttribute = "pod" + podAttribute = "pod" namespaceAttribute = "namespace" containerAttribute = "container" ) @@ -51,16 +51,16 @@ type MetricsPipeline struct { config *Config transformations []Transform - metricsFormat *template.Template - countersText string + metricsFormat *template.Template + countersText string gpuCollector *DCGMCollector } type DCGMCollector struct { - Counters []Counter + Counters []Counter DeviceFields []dcgm.Short - Cleanups []func() + Cleanups []func() } type Counter struct { @@ -81,17 +81,17 @@ type Metric struct { } var promMetricType = map[string]bool{ - "gauge": true, - "counter": true, + "gauge": true, + "counter": true, "histogram": true, - "summary": true, + "summary": true, } type MetricsServer struct { sync.Mutex - server http.Server - metrics string + server http.Server + metrics string metricsChan chan string } @@ -100,7 +100,7 @@ type PodMapper struct { } type PodInfo struct { - Name string + Name string Namespace string Container string } diff --git a/pkg/utils.go b/pkg/utils.go index 17a01c9..6ee6d16 100644 --- a/pkg/utils.go +++ b/pkg/utils.go @@ -18,10 +18,10 @@ package main import ( "fmt" - "sync" - "time" "os" "os/signal" + "sync" + "time" ) func WaitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error {