diff --git a/bindings/go/nvml/api.go b/bindings/go/nvml/api.go
new file mode 100644
index 0000000..52ab012
--- /dev/null
+++ b/bindings/go/nvml/api.go
@@ -0,0 +1,66 @@
+package nvml
+
+import (
+	"fmt"
+	"os"
+	"sync"
+)
+
+var (
+	// nvmlInitCounter is the number of InitCounter() calls not yet balanced by
+	// ShutdownCounter(); it goes negative when ShutdownCounter() runs too often.
+	nvmlInitCounter int
+	// mux guards nvmlInitCounter.
+	mux sync.Mutex
+)
+
+// InitCounter takes a reference on NVML, calling Init() only when no
+// reference is currently held.
+//
+// The returned cleanup releases the reference through ShutdownCounter() and
+// reports any failure on stderr. When Init() fails, no reference is taken and
+// cleanup is a no-op, so it is always safe to defer.
+func InitCounter() (cleanup func(), err error) {
+	mux.Lock()
+	defer mux.Unlock()
+
+	switch {
+	case nvmlInitCounter < 0:
+		// Unbalanced ShutdownCounter() calls: report how many, but still count
+		// this call so the matching cleanup restores the previous value.
+		err = fmt.Errorf("ShutdownCounter() is called %d times, before InitCounter()", -nvmlInitCounter)
+	case nvmlInitCounter == 0:
+		if err = Init(); err != nil {
+			// Counting a failed Init() would make the next InitCounter() skip
+			// Init(), and the cleanup would call Shutdown() without a prior Init().
+			return func() {}, err
+		}
+	}
+	nvmlInitCounter++
+
+	return func() {
+		if err := ShutdownCounter(); err != nil {
+			fmt.Fprintf(os.Stderr, "Failed to shutdown NVML with error: `%v`\n", err)
+		}
+	}, err
+}
+
+// ShutdownCounter releases one reference on NVML, calling Shutdown() when the
+// last reference is released.
+//
+// It returns an error when no reference is held; the counter is decremented
+// regardless, so InitCounter() can report the imbalance.
+func ShutdownCounter() (err error) {
+	mux.Lock()
+	defer mux.Unlock()
+
+	if nvmlInitCounter <= 0 {
+		err = fmt.Errorf("Init() needs to be called before Shutdown()")
+	}
+	if nvmlInitCounter == 1 {
+		err = Shutdown()
+	}
+	nvmlInitCounter--
+
+	return err
+}
diff --git a/pkg/main.go b/pkg/main.go
index a796406..b6d2668 100644
--- a/pkg/main.go
+++ b/pkg/main.go
@@ -24,6 +24,7 @@ import (
 	"time"
 
 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
+	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
 	"github.com/sirupsen/logrus"
 	"github.com/urfave/cli/v2"
 )
@@ -103,6 +104,14 @@ restart:
 	}
 	logrus.Info("DCGM successfully initialized!")
 
+	// Take a reference on NVML; the deferred cleanup releases it.
+	nvmlCleanup, err := nvml.InitCounter()
+	defer nvmlCleanup()
+	if err != nil {
+		logrus.Fatal(err)
+	}
+	logrus.Info("NVML successfully initialized!")
+
 	_, err = dcgm.GetSupportedMetricGroups(0)
 	if err != nil {
 		config.CollectDCP = false