Skip to content

Commit 4adec55

Browse files
author
harry
committed
pref: discard detect-gpu dependency
1 parent c61611c commit 4adec55

File tree

7 files changed

+62
-47
lines changed

7 files changed

+62
-47
lines changed

README.md

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
[![Go Report Card](https://goreportcard.com/badge/github.com/mayooot/gpu-docker-api)](https://goreportcard.com/badge/github.com/mayooot/gpu-docker-api)
55

66
[简体中文](docs%2Fzh-cn.md)
7-
> ⚠️注意:中文文档已经过期,请使用英文文档
7+
> ⚠️注意:中文文档更新可能落后于英文文档,请以英文文档为准
88
>
99
1010
# Overview
@@ -116,7 +116,6 @@ Import [gpu-docker-api.openapi.json](api%2Fgpu-docker-api.openapi.json) to invok
116116
tutorial: [volume-size-scale-en.md](docs%2Fvolume%2Fvolume-size-scale-en.md)
117117
4. Make sure your test environment has ETCD V3 installed, installation
118118
tutorial: [ETCD](https://github.com/etcd-io/etcd).
119-
5. Clone and run [detect-gpu](https://github.com/mayooot/detect-gpu).
120119

121120
## Build From Source
122121

@@ -127,6 +126,7 @@ $ make build
127126
~~~
128127

129128
## Download From Release
129+
130130
[release](https://github.com/mayooot/gpu-docker-api/releases)
131131

132132
## Config File
@@ -193,7 +193,7 @@ And workQueue asynchronous processing in Client-go.
193193

194194
* gpuScheduler: A scheduler that allocates GPU resources and saves the used GPUs.
195195
* gpuStatusMap:
196-
Maintain the GPU resources of the server, when the program starts for the first time, call detect-gpu to get all
196+
Maintains the GPU resources of the server. When the program starts for the first time, it calls `nvidia-smi` to get all
197197
the GPU resources and initialize gpuStatusMap. The key is the UUID of the GPU and the value is its usage status: 0 means unused, 1 means
198198
used.
199199

@@ -214,8 +214,6 @@ And workQueue asynchronous processing in Client-go.
214214
* /apis/v1/versions/containerVersionMapKey
215215
* /apis/v1/versions/volumeVersionMapKey
216216

217-
* detect-gpu:A simple HTTP server that calls [go-nvml](https://github.com/NVIDIA/go-nvml) to get the GPU of the host
218-
computer.
219217

220218
## Architecture Diagram
221219

cmd/gpu-docker-api/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ func (p *program) Init(svc.Environment) error {
7070

7171
workQueue.InitWorkQueue()
7272

73-
if err = gpuscheduler.Init(p.cfg); err != nil {
73+
if err = gpuscheduler.Init(); err != nil {
7474
return err
7575
}
7676

etc/config.toml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,6 @@ port = ":2378"
44
# etcd addr
55
etcd_addr = "127.0.0.1:2379"
66

7-
# detect-gpu addr
8-
detect_gpu_addr = "http://127.0.0.1:2376/api/v1/detect/gpu"
9-
# host gpu count
10-
available_gpu_nums = 8
11-
127
# available port range
138
start_port = 40000
149
end_port = 65535

internal/config/config.go

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,10 @@ import (
77
)
88

99
type Config struct {
10-
Port string `toml:"port"`
11-
EtcdAddr string `toml:"etcd_addr"`
12-
DetectGPUAddr string `toml:"detect_gpu_addr"`
13-
AvailableGpuNums int `toml:"available_gpu_nums"`
14-
StartPort int `toml:"start_port"`
15-
EndPort int `toml:"end_port"`
10+
Port string `toml:"port"`
11+
EtcdAddr string `toml:"etcd_addr"`
12+
StartPort int `toml:"start_port"`
13+
EndPort int `toml:"end_port"`
1614
}
1715

1816
func NewConfigWithFile(name string) (*Config, error) {

internal/etcd/client.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package etcd
22

33
import (
4+
"github.com/pkg/errors"
45
"time"
56

67
clientv3 "go.etcd.io/etcd/client/v3"
@@ -18,8 +19,10 @@ func InitEtcdClient(cfg *config.Config) error {
1819
DialTimeout: 2 * time.Second,
1920
DialOptions: []grpc.DialOption{grpc.WithBlock()},
2021
})
21-
22-
return err
22+
if err != nil {
23+
return errors.Wrap(err, "failed to connect etcd")
24+
}
25+
return nil
2326
}
2427

2528
func CloseEtcdClient() error {

internal/scheduler/gpuscheduler/scheduler.go

Lines changed: 47 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2,55 +2,56 @@ package gpuscheduler
22

33
import (
44
"encoding/json"
5-
"errors"
6-
"io/ioutil"
7-
"net/http"
85
"strconv"
6+
"strings"
97
"sync"
108

11-
"github.com/mayooot/gpu-docker-api/internal/config"
9+
"github.com/commander-cli/cmd"
10+
"github.com/pkg/errors"
11+
1212
"github.com/mayooot/gpu-docker-api/internal/etcd"
13-
"github.com/mayooot/gpu-docker-api/internal/model"
1413
"github.com/mayooot/gpu-docker-api/internal/xerrors"
1514
)
1615

1716
const (
18-
// 默认的可用GPU 数量
19-
defaultAvailableGpuNums = 8
17+
// 执行命令获取 gpu 的 index 和 uuid
18+
allGpuUUIDCommand = "nvidia-smi --query-gpu=index,uuid --format=csv,noheader,nounits"
2019

2120
// gpuScheduler 存储在 etcd 中的 key
2221
gpuStatusMapKey = "gpuStatusMapKey"
2322
)
2423

2524
var Scheduler *scheduler
2625

26+
type gpu struct {
27+
Index int `json:"index"`
28+
UUID *string `json:"uuid"`
29+
}
30+
2731
type scheduler struct {
2832
sync.RWMutex
2933

3034
AvailableGpuNums int
3135
GpuStatusMap map[string]byte
3236
}
3337

34-
func Init(cfg *config.Config) error {
38+
func Init() error {
3539
var err error
3640
Scheduler, err = initFormEtcd()
3741
if err != nil {
38-
return err
42+
return errors.Wrap(err, "initFormEtcd failed")
3943
}
4044

4145
if Scheduler.AvailableGpuNums == 0 || len(Scheduler.GpuStatusMap) == 0 {
4246
// 如果没有初始化过
43-
Scheduler.AvailableGpuNums = defaultAvailableGpuNums
44-
if cfg.AvailableGpuNums >= 0 {
45-
Scheduler.AvailableGpuNums = cfg.AvailableGpuNums
46-
}
47-
48-
gpus, err := getDetectGpus(cfg.DetectGPUAddr)
47+
gpus, err := getAllGpuUUID()
4948
if err != nil {
50-
return err
49+
return errors.Wrap(err, "getAllGpuUUID failed")
5150
}
51+
52+
Scheduler.AvailableGpuNums = len(gpus)
5253
for i := 0; i < len(gpus); i++ {
53-
Scheduler.GpuStatusMap[gpus[i].UUID] = 0
54+
Scheduler.GpuStatusMap[*gpus[i].UUID] = 0
5455
}
5556
}
5657
return nil
@@ -139,20 +140,40 @@ func initFormEtcd() (s *scheduler, err error) {
139140
return s, err
140141
}
141142

142-
func getDetectGpus(addr string) (gpus []model.GpuInfo, err error) {
143-
resp, err := http.Get(addr)
143+
func getAllGpuUUID() ([]*gpu, error) {
144+
c := cmd.NewCommand(allGpuUUIDCommand)
145+
err := c.Execute()
144146
if err != nil {
145-
return gpus, err
147+
return nil, errors.Wrap(err, "cmd.Execute failed")
146148
}
147-
defer resp.Body.Close()
148149

149-
body, err := ioutil.ReadAll(resp.Body)
150+
gpuList, err := parseOutput(c.Stdout())
150151
if err != nil {
151-
return gpus, err
152+
return nil, errors.Wrap(err, "parseOutput failed")
152153
}
154+
return gpuList, nil
155+
}
156+
157+
func parseOutput(output string) (gpuList []*gpu, err error) {
158+
lines := strings.Split(output, "\n")
159+
gpuList = make([]*gpu, 0, len(lines))
160+
for _, line := range lines {
161+
if line == "" {
162+
continue
163+
}
153164

154-
if err = json.Unmarshal(body, &gpus); err != nil {
155-
return gpus, err
165+
fields := strings.Split(line, ", ")
166+
if len(fields) == 2 {
167+
index, err := strconv.Atoi(fields[0])
168+
if err != nil {
169+
return gpuList, errors.Wrapf(err, "strconv.Atoi failed, index: %s", fields[0])
170+
}
171+
uuid := fields[1]
172+
gpuList = append(gpuList, &gpu{
173+
Index: index,
174+
UUID: &uuid,
175+
})
176+
}
156177
}
157-
return gpus, err
178+
return
158179
}

internal/scheduler/portscheduler/scheduler.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ package portscheduler
22

33
import (
44
"encoding/json"
5-
"errors"
5+
"github.com/pkg/errors"
66
"sort"
77
"strconv"
88
"sync"
@@ -59,7 +59,7 @@ func Init(cfg *config.Config) error {
5959
var err error
6060
Scheduler, err = initFormEtcd()
6161
if err != nil {
62-
return err
62+
return errors.Wrap(err, "initFormEtcd failed")
6363
}
6464

6565
if Scheduler.StartPort == 0 || Scheduler.EndPort == 0 || Scheduler.AvailableCount == 0 {

0 commit comments

Comments
 (0)