Skip to content
This repository was archived by the owner on Nov 27, 2023. It is now read-only.

Commit 57c14e7

Browse files
authored
Merge pull request #628 from docker/machine
Guess AWS machine type based on service resources reservations
2 parents 10372b7 + b22ebd6 commit 57c14e7

File tree

3 files changed

+329
-0
lines changed

3 files changed

+329
-0
lines changed

ecs/compatibility.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ var compatibleComposeAttributes = []string{
3838
"services.deploy.resources.reservations",
3939
"services.deploy.resources.reservations.cpus",
4040
"services.deploy.resources.reservations.memory",
41+
"services.deploy.resources.reservations.generic_resources",
42+
"services.deploy.resources.reservations.generic_resources.discrete_resource_spec",
4143
"services.deploy.update_config",
4244
"services.deploy.update_config.parallelism",
4345
"services.entrypoint",

ecs/gpu.go

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
/*
2+
Copyright 2020 Docker, Inc.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package ecs
18+
19+
import (
20+
"fmt"
21+
"math"
22+
"strconv"
23+
24+
"github.com/compose-spec/compose-go/types"
25+
"github.com/docker/go-units"
26+
)
27+
28+
// machine describes one instance type and the resources it provides. It is
// compared field by field against a resourceRequirements value in
// guessMachineType.
type machine struct {
29+
id string // instance type identifier, e.g. "p3.2xlarge"
30+
cpus float64 // number of CPUs offered by the instance
31+
memory types.UnitBytes // memory offered by the instance (set as N * units.GiB)
32+
gpus int64 // number of GPUs offered by the instance
33+
}
34+
35+
// family is a list of machines. Order matters: firstOrError returns element 0,
// so callers get whichever matching machine is listed first.
type family []machine
36+
37+
var p3family = family{
38+
{
39+
id: "p3.2xlarge",
40+
cpus: 8,
41+
memory: 64 * units.GiB,
42+
gpus: 2,
43+
},
44+
{
45+
id: "p3.8xlarge",
46+
cpus: 32,
47+
memory: 244 * units.GiB,
48+
gpus: 4,
49+
},
50+
{
51+
id: "p3.16xlarge",
52+
cpus: 64,
53+
memory: 488 * units.GiB,
54+
gpus: 8,
55+
},
56+
}
57+
58+
// filterFn is a predicate over a machine; family.filter keeps the machines for
// which it returns true.
type filterFn func(machine) bool
59+
60+
func (f family) filter(fn filterFn) family {
61+
var filtered family
62+
for _, machine := range f {
63+
if fn(machine) {
64+
filtered = append(filtered, machine)
65+
}
66+
}
67+
return filtered
68+
}
69+
70+
func (f family) firstOrError(msg string, args ...interface{}) (machine, error) {
71+
if len(f) == 0 {
72+
return machine{}, fmt.Errorf(msg, args...)
73+
}
74+
return f[0], nil
75+
}
76+
77+
func guessMachineType(project *types.Project) (string, error) {
78+
// we select a machine type to match all gpus-bound services requirements
79+
// once https://github.com/aws/containers-roadmap/issues/631 is implemented we can define dedicated CapacityProviders per service.
80+
requirements, err := getResourceRequirements(project)
81+
if err != nil {
82+
return "", err
83+
}
84+
85+
instanceType, err := p3family.
86+
filter(func(m machine) bool {
87+
return m.memory >= requirements.memory
88+
}).
89+
filter(func(m machine) bool {
90+
return m.cpus >= requirements.cpus
91+
}).
92+
filter(func(m machine) bool {
93+
return m.gpus >= requirements.gpus
94+
}).
95+
firstOrError("none of the Amazon EC2 P3 instance types meet the requirements for memory:%d cpu:%f gpus:%d", requirements.memory, requirements.cpus, requirements.gpus)
96+
if err != nil {
97+
return "", err
98+
}
99+
return instanceType.id, nil
100+
}
101+
102+
// resourceRequirements holds the resources reserved by a service, as extracted
// by toResourceRequirements, or the element-wise maximum over several services
// as produced by combine.
type resourceRequirements struct {
103+
memory types.UnitBytes // copied from reservations.MemoryBytes
104+
cpus float64 // parsed from the reservations.NanoCPUs string
105+
gpus int64 // value of the generic resource whose kind is "gpus"
106+
}
107+
108+
func getResourceRequirements(project *types.Project) (*resourceRequirements, error) {
109+
return toResourceRequirementsSlice(project).
110+
filter(func(requirements *resourceRequirements) bool {
111+
return requirements.gpus != 0
112+
}).
113+
max()
114+
}
115+
116+
// eitherRequirementsOrError carries either a list of per-service requirements
// or the error that prevented building it, so that filter and max can be
// chained and the error is reported once at the end of the chain.
//
// The field order is relied upon by positional composite literals in this
// file; do not reorder.
type eitherRequirementsOrError struct {
117+
requirements []*resourceRequirements // may contain nil entries (services without reservations)
118+
err error // when non-nil, filter passes the value through and max returns it
119+
}
120+
121+
func toResourceRequirementsSlice(project *types.Project) eitherRequirementsOrError {
122+
var requirements []*resourceRequirements
123+
for _, service := range project.Services {
124+
r, err := toResourceRequirements(service)
125+
if err != nil {
126+
return eitherRequirementsOrError{nil, err}
127+
}
128+
requirements = append(requirements, r)
129+
}
130+
return eitherRequirementsOrError{requirements, nil}
131+
}
132+
133+
func (r eitherRequirementsOrError) filter(fn func(*resourceRequirements) bool) eitherRequirementsOrError {
134+
if r.err != nil {
135+
return r
136+
}
137+
var requirements []*resourceRequirements
138+
for _, req := range r.requirements {
139+
if fn(req) {
140+
requirements = append(requirements, req)
141+
}
142+
}
143+
return eitherRequirementsOrError{requirements, nil}
144+
}
145+
146+
func toResourceRequirements(service types.ServiceConfig) (*resourceRequirements, error) {
147+
if service.Deploy == nil {
148+
return nil, nil
149+
}
150+
reservations := service.Deploy.Resources.Reservations
151+
if reservations == nil {
152+
return nil, nil
153+
}
154+
155+
var requiredGPUs int64
156+
for _, r := range reservations.GenericResources {
157+
if r.DiscreteResourceSpec.Kind == "gpus" {
158+
requiredGPUs = r.DiscreteResourceSpec.Value
159+
break
160+
}
161+
}
162+
163+
var nanocpu float64
164+
if reservations.NanoCPUs != "" {
165+
v, err := strconv.ParseFloat(reservations.NanoCPUs, 64)
166+
if err != nil {
167+
return nil, err
168+
}
169+
nanocpu = v
170+
}
171+
return &resourceRequirements{
172+
memory: reservations.MemoryBytes,
173+
cpus: nanocpu,
174+
gpus: requiredGPUs,
175+
}, nil
176+
}
177+
178+
func (r resourceRequirements) combine(o *resourceRequirements) resourceRequirements {
179+
if o == nil {
180+
return r
181+
}
182+
return resourceRequirements{
183+
memory: maxUnitBytes(r.memory, o.memory),
184+
cpus: math.Max(r.cpus, o.cpus),
185+
gpus: maxInt64(r.gpus, o.gpus),
186+
}
187+
}
188+
189+
func (r eitherRequirementsOrError) max() (*resourceRequirements, error) {
190+
if r.err != nil {
191+
return nil, r.err
192+
}
193+
min := resourceRequirements{}
194+
for _, req := range r.requirements {
195+
min = min.combine(req)
196+
}
197+
return &min, nil
198+
}
199+
200+
// maxInt64 returns the larger of a and b.
func maxInt64(a, b int64) int64 {
	if b >= a {
		return b
	}
	return a
}
206+
207+
func maxUnitBytes(a, b types.UnitBytes) types.UnitBytes {
208+
if a > b {
209+
return a
210+
}
211+
return b
212+
}

ecs/gpu_test.go

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
/*
2+
Copyright 2020 Docker, Inc.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package ecs
18+
19+
import (
20+
"testing"
21+
)
22+
23+
// TestGuessMachineType is a table-driven test: each case loads a compose YAML
// document with loadConfig, passes the resulting project to guessMachineType
// and compares the returned instance type id (and whether an error occurred)
// with the expectation.
//
// NOTE(review): the "1-gpus, high-memory" and "1-gpus, high-cpu" cases are
// named "1-gpus" but their YAML reserves 2 gpus — confirm which was intended.
func TestGuessMachineType(t *testing.T) {
24+
tests := []struct {
25+
name string
26+
yaml string
27+
want string
28+
wantErr bool
29+
}{
30+
{
31+
name: "1-gpus",
32+
yaml: `
33+
services:
34+
learning:
35+
image: tensorflow/tensorflow:latest-gpus
36+
deploy:
37+
resources:
38+
reservations:
39+
generic_resources:
40+
- discrete_resource_spec:
41+
kind: gpus
42+
value: 1
43+
`,
44+
want: "p3.2xlarge",
45+
wantErr: false,
46+
},
47+
{
48+
name: "4-gpus",
49+
yaml: `
50+
services:
51+
learning:
52+
image: tensorflow/tensorflow:latest-gpus
53+
deploy:
54+
resources:
55+
reservations:
56+
generic_resources:
57+
- discrete_resource_spec:
58+
kind: gpus
59+
value: 4
60+
`,
61+
want: "p3.8xlarge",
62+
wantErr: false,
63+
},
64+
{
65+
name: "1-gpus, high-memory",
66+
yaml: `
67+
services:
68+
learning:
69+
image: tensorflow/tensorflow:latest-gpus
70+
deploy:
71+
resources:
72+
reservations:
73+
memory: 300Gb
74+
generic_resources:
75+
- discrete_resource_spec:
76+
kind: gpus
77+
value: 2
78+
`,
79+
want: "p3.16xlarge",
80+
wantErr: false,
81+
},
82+
{
83+
name: "1-gpus, high-cpu",
84+
yaml: `
85+
services:
86+
learning:
87+
image: tensorflow/tensorflow:latest-gpus
88+
deploy:
89+
resources:
90+
reservations:
91+
memory: 32Gb
92+
cpus: "32"
93+
generic_resources:
94+
- discrete_resource_spec:
95+
kind: gpus
96+
value: 2
97+
`,
98+
want: "p3.8xlarge",
99+
wantErr: false,
100+
},
101+
}
102+
// Run every case as its own subtest so failures are reported per case name.
for _, tt := range tests {
103+
t.Run(tt.name, func(t *testing.T) {
104+
project := loadConfig(t, tt.yaml)
105+
got, err := guessMachineType(project)
106+
// Fail when an error occurred but was not expected, or vice versa.
if (err != nil) != tt.wantErr {
107+
t.Errorf("guessMachineType() error = %v, wantErr %v", err, tt.wantErr)
108+
return
109+
}
110+
if got != tt.want {
111+
t.Errorf("guessMachineType() got = %v, want %v", got, tt.want)
112+
}
113+
})
114+
}
115+
}

0 commit comments

Comments
 (0)