From c4e3aca109084d4bf240cc02695d4171c48409ef Mon Sep 17 00:00:00 2001 From: Suraj Kota Date: Wed, 23 Oct 2024 08:16:36 -0700 Subject: [PATCH 1/4] Add support for HyperPod nodes SageMaker HyperPod recently launched EKS integration. This commit adds SageMaker instance types and toleration for running DeepHealthChecks. --- stable/aws-efa-k8s-device-plugin/values.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/stable/aws-efa-k8s-device-plugin/values.yaml b/stable/aws-efa-k8s-device-plugin/values.yaml index ece9a2aa..2e78815c 100644 --- a/stable/aws-efa-k8s-device-plugin/values.yaml +++ b/stable/aws-efa-k8s-device-plugin/values.yaml @@ -35,7 +35,9 @@ supportedInstanceLabels: # EFA supported instances: https://docs.aws.amazon.com/ - m7i.48xlarge - m7i.metal-48xl - c5n.9xlarge + - ml.c5n.9xlarge - c5n.18xlarge + - ml.c5n.18xlarge - c5n.metal - c6a.48xlarge - c6a.metal @@ -97,10 +99,15 @@ supportedInstanceLabels: # EFA supported instances: https://docs.aws.amazon.com/ - g4dn.16xlarge - g4dn.metal - g5.8xlarge + - ml.g5.8xlarge - g5.12xlarge + - ml.g5.12xlarge - g5.16xlarge + - ml.g5.16xlarge - g5.24xlarge + - ml.g5.24xlarge - g5.48xlarge + - ml.g5.48xlarge - g6.8xlarge - g6.12xlarge - g6.16xlarge @@ -115,11 +122,17 @@ supportedInstanceLabels: # EFA supported instances: https://docs.aws.amazon.com/ - inf1.24xlarge - p3dn.24xlarge - p4d.24xlarge + - ml.p4d.24xlarge - p4de.24xlarge + - ml.p4de.24xlarge - p5.48xlarge + - ml.p5.48xlarge - p5e.48xlarge + - ml.p5e.48xlarge - trn1.32xlarge + - ml.trn1.32xlarge - trn1n.32xlarge + - ml.trn1n.32xlarge - vt1.24xlarge - hpc6a.48xlarge - hpc6id.32xlarge @@ -147,6 +160,10 @@ tolerations: [] # - key: aws.amazon.com/efa # operator: Exists # effect: NoSchedule + - key: sagemaker.amazonaws.com/node-health-status + operator: Equal + effect: NoSchedule + value: Unschedulable additionalPodAnnotations: {} additionalPodLabels: {} nameOverride: "" From b65ae60536e8cbcc968fda795128feef0a961edd Mon Sep 17 00:00:00 2001 From: Suraj Kota Date: Wed, 23 Oct 2024 08:21:32 -0700 Subject: [PATCH 2/4] Update Chart version --- stable/aws-efa-k8s-device-plugin/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable/aws-efa-k8s-device-plugin/Chart.yaml b/stable/aws-efa-k8s-device-plugin/Chart.yaml index 683598ee..2abfc4b6 100644 --- a/stable/aws-efa-k8s-device-plugin/Chart.yaml +++ b/stable/aws-efa-k8s-device-plugin/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v1 name: aws-efa-k8s-device-plugin description: A Helm chart for EFA device plugin. -version: v0.5.5 +version: v0.5.6 appVersion: "v0.5.4" home: https://github.com/aws/eks-charts icon: https://raw.githubusercontent.com/aws/eks-charts/master/docs/logo/aws.png From b1c3f9236ed5b36a16933f81a37d6082e6ab4298 Mon Sep 17 00:00:00 2001 From: Suraj Kota Date: Wed, 23 Oct 2024 08:26:00 -0700 Subject: [PATCH 3/4] consolidate instance types for readability --- stable/aws-efa-k8s-device-plugin/values.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/stable/aws-efa-k8s-device-plugin/values.yaml b/stable/aws-efa-k8s-device-plugin/values.yaml index 2e78815c..61bed4c0 100644 --- a/stable/aws-efa-k8s-device-plugin/values.yaml +++ b/stable/aws-efa-k8s-device-plugin/values.yaml @@ -35,8 +35,8 @@ supportedInstanceLabels: # EFA supported instances: https://docs.aws.amazon.com/ - m7i.48xlarge - m7i.metal-48xl - c5n.9xlarge - - ml.c5n.9xlarge - c5n.18xlarge + - ml.c5n.9xlarge - ml.c5n.18xlarge - c5n.metal - c6a.48xlarge @@ -99,14 +99,14 @@ supportedInstanceLabels: # EFA supported instances: https://docs.aws.amazon.com/ - g4dn.16xlarge - g4dn.metal - g5.8xlarge - - ml.g5.8xlarge - g5.12xlarge - - ml.g5.12xlarge - g5.16xlarge - - ml.g5.16xlarge - g5.24xlarge - - ml.g5.24xlarge - g5.48xlarge + - ml.g5.8xlarge + - ml.g5.12xlarge + - ml.g5.16xlarge + - ml.g5.24xlarge - ml.g5.48xlarge - g6.8xlarge - g6.12xlarge @@ -122,16 +122,16 @@ supportedInstanceLabels: # EFA supported instances: https://docs.aws.amazon.com/ - inf1.24xlarge - p3dn.24xlarge - p4d.24xlarge - - ml.p4d.24xlarge - p4de.24xlarge + - ml.p4d.24xlarge - ml.p4de.24xlarge - p5.48xlarge - - ml.p5.48xlarge - p5e.48xlarge + - ml.p5.48xlarge - ml.p5e.48xlarge - trn1.32xlarge - - ml.trn1.32xlarge - trn1n.32xlarge + - ml.trn1.32xlarge - ml.trn1n.32xlarge - vt1.24xlarge - hpc6a.48xlarge From 4359c723a198ed53bc9bdf49aaaa510b4937dcc9 Mon Sep 17 00:00:00 2001 From: Suraj Kota Date: Wed, 23 Oct 2024 17:23:59 -0700 Subject: [PATCH 4/4] fix tolerations --- stable/aws-efa-k8s-device-plugin/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable/aws-efa-k8s-device-plugin/values.yaml b/stable/aws-efa-k8s-device-plugin/values.yaml index 61bed4c0..cd1fe0ae 100644 --- a/stable/aws-efa-k8s-device-plugin/values.yaml +++ b/stable/aws-efa-k8s-device-plugin/values.yaml @@ -156,7 +156,7 @@ resources: memory: 20Mi nodeSelector: {} # efa: present -tolerations: [] +tolerations: # - key: aws.amazon.com/efa # operator: Exists # effect: NoSchedule