perf: discard detect-gpu dependency

harry · harry · commit 4adec55a196a · 2024-01-15T10:52:23.000+08:00
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
 [![Go Report Card](https://goreportcard.com/badge/github.com/mayooot/gpu-docker-api)](https://goreportcard.com/badge/github.com/mayooot/gpu-docker-api)
 
 [简体中文](docs%2Fzh-cn.md)
-> ⚠️注意：中文文档已经过期，请使用英文文档。
+> ⚠️注意：中文文档更新可能落后于英文文档，请以英文文档为准。
 >
 
 # Overview
@@ -116,7 +116,6 @@ Import [gpu-docker-api.openapi.json](api%2Fgpu-docker-api.openapi.json) to invok
    tutorial: [volume-size-scale-en.md](docs%2Fvolume%2Fvolume-size-scale-en.md)
 4. Make sure your test environment has ETCD V3 installed, installation
    tutorial: [ETCD](https://github.com/etcd-io/etcd).
-5. Clone and run [detect-gpu](https://github.com/mayooot/detect-gpu).
 
 ## Build From Source
 
@@ -127,6 +126,7 @@ $ make build
 ~~~
 
 ## Download From Release
+
 [release](https://github.com/mayooot/gpu-docker-api/releases)
 
 ## Config File
@@ -193,7 +193,7 @@ And workQueue asynchronous processing in Client-go.
 
 * gpuScheduler：A scheduler that allocates GPU resources and saves the used GPUs.
     * gpuStatusMap：
-      Maintain the GPU resources of the server, when the program starts for the first time, call detect-gpu to get all
+      Maintains the GPU resources of the server. When the program starts for the first time, it calls `nvidia-smi` to
       get all the GPU resources and initializes gpuStatusMap. The key is the UUID of the GPU and the value is its usage:
       0 means used, 1 means unused.
 
@@ -214,8 +214,6 @@ And workQueue asynchronous processing in Client-go.
     * /apis/v1/versions/containerVersionMapKey
     * /apis/v1/versions/volumeVersionMapKey
 
-* detect-gpu：A simple HTTP server that calls [go-nvml](https://github.com/NVIDIA/go-nvml) to get the GPU of the host
-  computer.
 
 ## Architecture Diagram
 
diff --git a/cmd/gpu-docker-api/main.go b/cmd/gpu-docker-api/main.go
@@ -70,7 +70,7 @@ func (p *program) Init(svc.Environment) error {
 
 	workQueue.InitWorkQueue()
 
-	if err = gpuscheduler.Init(p.cfg); err != nil {
+	if err = gpuscheduler.Init(); err != nil {
 		return err
 	}
 
diff --git a/etc/config.toml b/etc/config.toml
@@ -4,11 +4,6 @@ port = ":2378"
 # etcd addr
 etcd_addr = "127.0.0.1:2379"
 
-# detect-gpu addr
-detect_gpu_addr = "http://127.0.0.1:2376/api/v1/detect/gpu"
-# host gpu count
-available_gpu_nums = 8
-
 # available port range
 start_port = 40000
 end_port = 65535
diff --git a/internal/config/config.go b/internal/config/config.go
@@ -7,12 +7,10 @@ import (
 )
 
 type Config struct {
-	Port             string `toml:"port"`
-	EtcdAddr         string `toml:"etcd_addr"`
-	DetectGPUAddr    string `toml:"detect_gpu_addr"`
-	AvailableGpuNums int    `toml:"available_gpu_nums"`
-	StartPort        int    `toml:"start_port"`
-	EndPort          int    `toml:"end_port"`
+	Port      string `toml:"port"`
+	EtcdAddr  string `toml:"etcd_addr"`
+	StartPort int    `toml:"start_port"`
+	EndPort   int    `toml:"end_port"`
 }
 
 func NewConfigWithFile(name string) (*Config, error) {
diff --git a/internal/etcd/client.go b/internal/etcd/client.go
@@ -1,6 +1,7 @@
 package etcd
 
 import (
+	"github.com/pkg/errors"
 	"time"
 
 	clientv3 "go.etcd.io/etcd/client/v3"
@@ -18,8 +19,10 @@ func InitEtcdClient(cfg *config.Config) error {
 		DialTimeout: 2 * time.Second,
 		DialOptions: []grpc.DialOption{grpc.WithBlock()},
 	})
-
-	return err
+	if err != nil {
+		return errors.Wrap(err, "failed to connect etcd")
+	}
+	return nil
 }
 
 func CloseEtcdClient() error {
diff --git a/internal/scheduler/gpuscheduler/scheduler.go b/internal/scheduler/gpuscheduler/scheduler.go
@@ -2,55 +2,56 @@ package gpuscheduler
 
 import (
 	"encoding/json"
-	"errors"
-	"io/ioutil"
-	"net/http"
 	"strconv"
+	"strings"
 	"sync"
 
-	"github.com/mayooot/gpu-docker-api/internal/config"
+	"github.com/commander-cli/cmd"
+	"github.com/pkg/errors"
+
 	"github.com/mayooot/gpu-docker-api/internal/etcd"
-	"github.com/mayooot/gpu-docker-api/internal/model"
 	"github.com/mayooot/gpu-docker-api/internal/xerrors"
 )
 
 const (
-	// 默认的可用GPU 数量
-	defaultAvailableGpuNums = 8
+	// 执行命令获取 gpu 的 index 和 uuid
+	allGpuUUIDCommand = "nvidia-smi --query-gpu=index,uuid --format=csv,noheader,nounits"
 
 	// gpuScheduler 存储在 etcd 中的 key
 	gpuStatusMapKey = "gpuStatusMapKey"
 )
 
 var Scheduler *scheduler
 
+type gpu struct {
+	Index int     `json:"index"`
+	UUID  *string `json:"uuid"`
+}
+
 type scheduler struct {
 	sync.RWMutex
 
 	AvailableGpuNums int
 	GpuStatusMap     map[string]byte
 }
 
-func Init(cfg *config.Config) error {
+func Init() error {
 	var err error
 	Scheduler, err = initFormEtcd()
 	if err != nil {
-		return err
+		return errors.Wrap(err, "initFormEtcd failed")
 	}
 
 	if Scheduler.AvailableGpuNums == 0 || len(Scheduler.GpuStatusMap) == 0 {
 		// 如果没有初始化过
-		Scheduler.AvailableGpuNums = defaultAvailableGpuNums
-		if cfg.AvailableGpuNums >= 0 {
-			Scheduler.AvailableGpuNums = cfg.AvailableGpuNums
-		}
-
-		gpus, err := getDetectGpus(cfg.DetectGPUAddr)
+		gpus, err := getAllGpuUUID()
 		if err != nil {
-			return err
+			return errors.Wrap(err, "getAllGpuUUID failed")
 		}
+
+		Scheduler.AvailableGpuNums = len(gpus)
 		for i := 0; i < len(gpus); i++ {
-			Scheduler.GpuStatusMap[gpus[i].UUID] = 0
+			Scheduler.GpuStatusMap[*gpus[i].UUID] = 0
 		}
 	}
 	return nil
@@ -139,20 +140,40 @@ func initFormEtcd() (s *scheduler, err error) {
 	return s, err
 }
 
-func getDetectGpus(addr string) (gpus []model.GpuInfo, err error) {
-	resp, err := http.Get(addr)
+func getAllGpuUUID() ([]*gpu, error) {
+	c := cmd.NewCommand(allGpuUUIDCommand)
+	err := c.Execute()
 	if err != nil {
-		return gpus, err
+		return nil, errors.Wrap(err, "cmd.Execute failed")
 	}
-	defer resp.Body.Close()
 
-	body, err := ioutil.ReadAll(resp.Body)
+	gpuList, err := parseOutput(c.Stdout())
 	if err != nil {
-		return gpus, err
+		return nil, errors.Wrap(err, "parseOutput failed")
 	}
+	return gpuList, nil
+}
+
+func parseOutput(output string) (gpuList []*gpu, err error) {
+	lines := strings.Split(output, "\n")
+	gpuList = make([]*gpu, 0, len(lines))
+	for _, line := range lines {
+		if line == "" {
+			continue
+		}
 
-	if err = json.Unmarshal(body, &gpus); err != nil {
-		return gpus, err
+		fields := strings.Split(line, ", ")
+		if len(fields) == 2 {
+			index, err := strconv.Atoi(fields[0])
+			if err != nil {
+				return gpuList, errors.Wrapf(err, "strconv.Atoi failed, index: %s", fields[0])
+			}
+			uuid := fields[1]
+			gpuList = append(gpuList, &gpu{
+				Index: index,
+				UUID:  &uuid,
+			})
+		}
 	}
-	return gpus, err
+	return
 }
diff --git a/internal/scheduler/portscheduler/scheduler.go b/internal/scheduler/portscheduler/scheduler.go
@@ -2,7 +2,7 @@ package portscheduler
 
 import (
 	"encoding/json"
-	"errors"
+	"github.com/pkg/errors"
 	"sort"
 	"strconv"
 	"sync"
@@ -59,7 +59,7 @@ func Init(cfg *config.Config) error {
 	var err error
 	Scheduler, err = initFormEtcd()
 	if err != nil {
-		return err
+		return errors.Wrap(err, "initFormEtcd failed")
 	}
 
 	if Scheduler.StartPort == 0 || Scheduler.EndPort == 0 || Scheduler.AvailableCount == 0 {

Original file line number	Diff line number	Diff line change
`@@ -70,7 +70,7 @@ func (p *program) Init(svc.Environment) error {`
`70`	`70`
`71`	`71`	`workQueue.InitWorkQueue()`
`72`	`72`
`73`		`- if err = gpuscheduler.Init(p.cfg); err != nil {`
	`73`	`+ if err = gpuscheduler.Init(); err != nil {`
`74`	`74`	`return err`
`75`	`75`	`}`
`76`	`76`