Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 2dc89c0

Browse files
author
shixianc
committed Jan 24, 2025
Add AWS Inf2 instances support for aws_batch scheduler
1 parent 4789001 commit 2dc89c0

File tree

2 files changed

+73
-0
lines changed

2 files changed

+73
-0
lines changed
 

‎torchx/specs/named_resources_aws.py

+44
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,46 @@ def aws_trn1_32xlarge() -> Resource:
354354
)
355355

356356

357+
def aws_inf2_xlarge() -> Resource:
    """Build the named resource for the AWS ``inf2.xlarge`` instance type.

    Returns:
        A ``Resource`` with ``cpu=4``, ``gpu=0``, ``memMB=16 * GiB``, the
        ``K8S_ITYPE`` capability set to ``"inf2.xlarge"``, and a
        ``NEURON_DEVICE`` count of 1.
    """
    instance_type = "inf2.xlarge"
    neuron_devices = {NEURON_DEVICE: 1}
    return Resource(
        cpu=4,
        gpu=0,
        memMB=16 * GiB,
        capabilities={K8S_ITYPE: instance_type},
        devices=neuron_devices,
    )
365+
366+
367+
def aws_inf2_8xlarge() -> Resource:
    """Build the named resource for the AWS ``inf2.8xlarge`` instance type.

    Returns:
        A ``Resource`` with ``cpu=32``, ``gpu=0``, ``memMB=128 * GiB``, the
        ``K8S_ITYPE`` capability set to ``"inf2.8xlarge"``, and a
        ``NEURON_DEVICE`` count of 1.
    """
    instance_type = "inf2.8xlarge"
    neuron_devices = {NEURON_DEVICE: 1}
    return Resource(
        cpu=32,
        gpu=0,
        memMB=128 * GiB,
        capabilities={K8S_ITYPE: instance_type},
        devices=neuron_devices,
    )
375+
376+
377+
def aws_inf2_24xlarge() -> Resource:
    """Build the named resource for the AWS ``inf2.24xlarge`` instance type.

    Returns:
        A ``Resource`` with ``cpu=96``, ``gpu=0``, ``memMB=384 * GiB``, the
        ``K8S_ITYPE`` capability set to ``"inf2.24xlarge"``, and a
        ``NEURON_DEVICE`` count of 6.
    """
    instance_type = "inf2.24xlarge"
    neuron_devices = {NEURON_DEVICE: 6}
    return Resource(
        cpu=96,
        gpu=0,
        memMB=384 * GiB,
        capabilities={K8S_ITYPE: instance_type},
        devices=neuron_devices,
    )
385+
386+
387+
def aws_inf2_48xlarge() -> Resource:
    """Build the named resource for the AWS ``inf2.48xlarge`` instance type.

    Returns:
        A ``Resource`` with ``cpu=192``, ``gpu=0``, ``memMB=768 * GiB``, the
        ``K8S_ITYPE`` capability set to ``"inf2.48xlarge"``, and a
        ``NEURON_DEVICE`` count of 12.
    """
    instance_type = "inf2.48xlarge"
    neuron_devices = {NEURON_DEVICE: 12}
    return Resource(
        cpu=192,
        gpu=0,
        memMB=768 * GiB,
        capabilities={K8S_ITYPE: instance_type},
        devices=neuron_devices,
    )
395+
396+
357397
NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = {
358398
"aws_t3.medium": aws_t3_medium,
359399
"aws_m5.2xlarge": aws_m5_2xlarge,
@@ -390,4 +430,8 @@ def aws_trn1_32xlarge() -> Resource:
390430
"aws_g6e.48xlarge": aws_g6e_48xlarge,
391431
"aws_trn1.2xlarge": aws_trn1_2xlarge,
392432
"aws_trn1.32xlarge": aws_trn1_32xlarge,
433+
"aws_inf2.xlarge": aws_inf2_xlarge,
434+
"aws_inf2.8xlarge": aws_inf2_8xlarge,
435+
"aws_inf2.24xlarge": aws_inf2_24xlarge,
436+
"aws_inf2.48xlarge": aws_inf2_48xlarge,
393437
}

‎torchx/specs/test/named_resources_aws_test.py

+29
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
aws_g6e_4xlarge,
3333
aws_g6e_8xlarge,
3434
aws_g6e_xlarge,
35+
aws_inf2_24xlarge,
36+
aws_inf2_48xlarge,
37+
aws_inf2_8xlarge,
38+
aws_inf2_xlarge,
3539
aws_m5_2xlarge,
3640
aws_p3_16xlarge,
3741
aws_p3_2xlarge,
@@ -232,6 +236,31 @@ def test_aws_trn1(self) -> None:
232236
self.assertEqual(trn1_32.memMB, trn1_2.memMB * 16)
233237
self.assertEqual({EFA_DEVICE: 8, NEURON_DEVICE: 16}, trn1_32.devices)
234238

239+
def test_aws_inf2(self) -> None:
    """Check cpu, gpu, memMB and devices for each inf2 named resource."""
    # (factory, expected cpu, expected memory multiplier of GiB,
    #  expected NEURON_DEVICE count), smallest to largest.
    expectations = (
        (aws_inf2_xlarge, 4, 16, 1),
        (aws_inf2_8xlarge, 32, 128, 1),
        (aws_inf2_24xlarge, 96, 384, 6),
        (aws_inf2_48xlarge, 192, 768, 12),
    )

    for factory, cpus, mem_multiplier, neuron_count in expectations:
        resource = factory()
        self.assertEqual(cpus, resource.cpu)
        # None of the inf2 resources declare any GPUs.
        self.assertEqual(0, resource.gpu)
        self.assertEqual(mem_multiplier * GiB, resource.memMB)
        self.assertEqual({NEURON_DEVICE: neuron_count}, resource.devices)
263+
235264
def test_aws_m5_2xlarge(self) -> None:
236265
resource = aws_m5_2xlarge()
237266
self.assertEqual(8, resource.cpu)

0 commit comments

Comments
 (0)
Please sign in to comment.