Skip to content

Commit 8972fbf

Browse files
authored
[CLI][GB200] Add ultraserver instance(p6e-gb200) capacity block support (#6928)
- Add ultraserver instance support with capacity block validation - Add CapacityBlockHealthStatusValidator for UltraServer instance check. - The validator also collect the capacity blocks that are in valid CAPACITY_BLOCK_INACTIVE_STATES. (for example: state `scheduled`). In this scenario we throw a warning and report the ids and the states. - Add describe_capacity_block_status api in ec2.py - Add logic to collect all ultraserver instance capacity block ids in a dict, key -> ULTRASERVER_INSTANCE_PREFIX, value -> list of capacity block ids - Add ULTRASERVER_INSTANCE_PREFIX_LIST and ULTRASERVER_CAPACITY_BLOCK_ALLOWED_SIZE_DICT constants - Implement capacity block size validation for p6e-gb200 instances (9, 18 nodes) - Add ultraserver-specific template generation and validation logic - Move get_instance_type_and_reservation_type_from_capacity_reservation to ec2.py - Enhance template and configuration handling - Add capacity block support in cluster and queue stack templates - Pass ultrasever capacity block size to HeadNode dna.json. For p6e-gb200, "9", "18" or "9, 18". - Add has_ultraserver_instance to detect if the queue stack compute resources have ultraserver instance - Tests - Add ultraserver capacity block test suites - Add ultraserver cluster and queue stack tests - Fix test mocks to return proper CapacityReservationInfo objects - Add describe_capacity_reservations method to dummy EC2 client - Update existing tests for refactored capacity reservation function
1 parent fe6a186 commit 8972fbf

16 files changed

+1030
-22
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ CHANGELOG
55
------
66

77
**ENHANCEMENTS**
8+
- Add support for p6e-gb200 instances via capacity blocks.
89
- Echo chef-client log when a node fails to bootstrap. This helps with investigating bootstrap failures in cases CloudWatch logs are not available.
910

1011
**CHANGES**

cli/src/pcluster/aws/aws_resources.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,10 @@ def total_instance_count(self):
597597
"""Return the total instance count, if present, 0 otherwise."""
598598
return self.capacity_reservation_data.get("TotalInstanceCount", 0)
599599

600+
def available_instance_count(self):
601+
"""Return the available instance count, if present, 0 otherwise."""
602+
return self.capacity_reservation_data.get("AvailableInstanceCount", 0)
603+
600604
def get_tag(self, tag_key: str):
601605
"""Get stack tag by tag key."""
602606
return next(

cli/src/pcluster/aws/ec2.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,3 +577,77 @@ def is_subnet_public(self, subnet_id):
577577
return True
578578

579579
return False
580+
581+
@AWSExceptionHandler.handle_client_exception
582+
def describe_capacity_block_status(
583+
self, capacity_block_ids: List[str] = None, filters=None, max_results: int = None
584+
):
585+
"""
586+
Describe the availability and health status of capacity blocks, particularly for ultraserver instances.
587+
588+
This method is primarily used to check the health status of ultraserver capacity blocks
589+
(e.g., p6e-gb200) to ensure they are ready for cluster operations. It provides information
590+
about interconnect status and available capacity.
591+
592+
:param capacity_block_ids: List of Capacity Block IDs to query (e.g., ['cr-123456']).
593+
:param filters: Optional boto3-style filters to narrow results (e.g., interconnect-status).
594+
:param max_results: Optional page size hint for pagination.
595+
:return: Dict with key 'CapacityBlockStatuses' containing a flattened list of capacity block
596+
status entries. Each entry includes fields like:
597+
- CapacityBlockId: The capacity block identifier
598+
- InterconnectStatus: Health status ('ok', 'impaired', 'insufficient-data')
599+
- TotalCapacity: Total number of instances in the capacity block
600+
- TotalUnavailableCapacity: Number of unavailable instances
601+
"""
602+
kwargs = {}
603+
if capacity_block_ids:
604+
kwargs["CapacityBlockIds"] = capacity_block_ids
605+
if filters:
606+
kwargs["Filters"] = filters
607+
if max_results:
608+
kwargs["MaxResults"] = max_results
609+
610+
paginator = self._client.get_paginator("describe_capacity_block_status")
611+
page_iterator = paginator.paginate(**kwargs)
612+
613+
statuses = []
614+
for page in page_iterator:
615+
statuses.extend(page.get("CapacityBlockStatuses", []))
616+
617+
return statuses
618+
619+
@AWSExceptionHandler.handle_client_exception
620+
def get_instance_type_and_reservation_type_from_capacity_reservation(
621+
self, capacity_reservation_id: str
622+
) -> tuple[str, str]:
623+
"""
624+
Retrieve instance type and reservation type from a capacity reservation ID.
625+
626+
This method queries AWS EC2 to get detailed information about a capacity reservation,
627+
specifically extracting the instance type and reservation type. This information is
628+
crucial for determining if special handling is needed (e.g., for ultraserver instances
629+
with capacity blocks).
630+
631+
Args:
632+
capacity_reservation_id: The AWS capacity reservation ID to query (e.g., 'cr-123456')
633+
634+
Returns:
635+
tuple: A tuple containing (instance_type, reservation_type) where:
636+
- instance_type: EC2 instance type (e.g., 'p6e-gb200.36xlarge')
637+
- reservation_type: Type of reservation (e.g., 'capacity-block', 'ondemand')
638+
Both values will be None if the reservation cannot be found or accessed.
639+
640+
Example:
641+
('p6e-gb200.36xlarge', 'capacity-block')
642+
"""
643+
instance_type = None
644+
reservation_type = None
645+
646+
if capacity_reservation_id:
647+
capacity_reservations = self.describe_capacity_reservations([capacity_reservation_id])
648+
if capacity_reservations:
649+
reservation = capacity_reservations[0]
650+
instance_type = reservation.instance_type()
651+
reservation_type = reservation.reservation_type()
652+
653+
return instance_type, reservation_type

cli/src/pcluster/config/cluster_config.py

Lines changed: 72 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,14 @@
6363
NODE_BOOTSTRAP_TIMEOUT,
6464
ONTAP,
6565
OPENZFS,
66+
ULTRASERVER_INSTANCE_PREFIX_LIST,
6667
Feature,
6768
)
68-
from pcluster.utils import get_partition, get_resource_name_from_resource_arn, to_snake_case
69+
from pcluster.utils import (
70+
get_partition,
71+
get_resource_name_from_resource_arn,
72+
to_snake_case,
73+
)
6974
from pcluster.validators.awsbatch_validators import (
7075
AwsBatchComputeInstanceTypeValidator,
7176
AwsBatchComputeResourceSizeValidator,
@@ -141,6 +146,7 @@
141146
)
142147
from pcluster.validators.ec2_validators import (
143148
AmiOsCompatibleValidator,
149+
CapacityBlockHealthStatusValidator,
144150
CapacityReservationResourceGroupValidator,
145151
CapacityReservationSizeValidator,
146152
CapacityReservationValidator,
@@ -2409,7 +2415,16 @@ def instance_types(self) -> List[str]:
24092415
def instance_type(self):
24102416
"""Instance type of this compute resource."""
24112417
if not self._instance_type:
2412-
self._instance_type = Resource.init_param(self._instance_type_from_capacity_reservation())
2418+
capacity_reservation_id = (
2419+
self.capacity_reservation_target.capacity_reservation_id if self.capacity_reservation_target else None
2420+
)
2421+
(
2422+
instance_type_from_capacity_reservation,
2423+
_,
2424+
) = AWSApi.instance().ec2.get_instance_type_and_reservation_type_from_capacity_reservation(
2425+
capacity_reservation_id
2426+
)
2427+
self._instance_type = Resource.init_param(instance_type_from_capacity_reservation)
24132428
return self._instance_type
24142429

24152430
def _register_validators(self, context: ValidatorContext = None):
@@ -2453,18 +2468,6 @@ def disable_simultaneous_multithreading_manually(self) -> bool:
24532468
"""Return true if simultaneous multithreading must be disabled with a cookbook script."""
24542469
return self.disable_simultaneous_multithreading and self._instance_type_info.default_threads_per_core() > 1
24552470

2456-
def _instance_type_from_capacity_reservation(self):
2457-
"""Return the instance type from the configured CapacityReservationId, if any."""
2458-
instance_type = None
2459-
capacity_reservation_id = (
2460-
self.capacity_reservation_target.capacity_reservation_id if self.capacity_reservation_target else None
2461-
)
2462-
if capacity_reservation_id:
2463-
capacity_reservations = AWSApi.instance().ec2.describe_capacity_reservations([capacity_reservation_id])
2464-
if capacity_reservations:
2465-
instance_type = capacity_reservations[0].instance_type()
2466-
return instance_type
2467-
24682471

24692472
class _CommonQueue(BaseQueue):
24702473
"""Represent the Common Queue resource between Slurm and future scheduler implementation."""
@@ -2931,6 +2934,7 @@ def __init__(
29312934
pool.ssh.allowed_ips = self.head_node.ssh.allowed_ips
29322935

29332936
self.__image_dict = None
2937+
self.__ultraserver_capacity_block_dict = None
29342938
# Cache capacity reservations information together to reduce number of boto3 calls.
29352939
# Since this cache is only used for validation, if AWSClientError happens
29362940
# (e.g insufficient IAM permissions to describe the capacity reservations), we catch the exception to avoid
@@ -2986,6 +2990,53 @@ def login_nodes_subnet_ids(self):
29862990
subnet_ids_set.add(subnet_id)
29872991
return list(subnet_ids_set)
29882992

2993+
@property
2994+
def ultraserver_capacity_block_dict(self):
2995+
"""
2996+
Return a dictionary mapping ultraserver instance prefixes to their capacity block reservation IDs.
2997+
2998+
This property collects all capacity block reservations used by ultraserver instances
2999+
(e.g., p6e-gb200) across all queues and compute resources in the cluster configuration.
3000+
3001+
Returns:
3002+
dict: A dictionary where keys are ultraserver instance prefixes (e.g., 'p6e-gb200')
3003+
and values are lists of capacity reservation IDs for that instance type.
3004+
3005+
Example:
3006+
{
3007+
'p6e-gb200': ['cr-123456', 'cr-789012']
3008+
}
3009+
"""
3010+
if self.__ultraserver_capacity_block_dict:
3011+
return self.__ultraserver_capacity_block_dict
3012+
3013+
self.__ultraserver_capacity_block_dict = {}
3014+
3015+
# Initialize empty lists for each supported ultraserver instance prefix
3016+
for ultraserver_instance_prefix in ULTRASERVER_INSTANCE_PREFIX_LIST:
3017+
self.__ultraserver_capacity_block_dict[ultraserver_instance_prefix] = []
3018+
3019+
# Iterate through all queues and compute resources to find ultraserver capacity blocks
3020+
for queue in self.scheduling.queues:
3021+
for compute_resource in queue.compute_resources:
3022+
cr_target = compute_resource.capacity_reservation_target or queue.capacity_reservation_target
3023+
if cr_target and cr_target.capacity_reservation_id:
3024+
# Get instance type and reservation type from the capacity reservation
3025+
(
3026+
instance_type,
3027+
reservation_type,
3028+
) = AWSApi.instance().ec2.get_instance_type_and_reservation_type_from_capacity_reservation(
3029+
cr_target.capacity_reservation_id
3030+
)
3031+
# Extract instance prefix (e.g., 'p6e-gb200' from 'p6e-gb200.36xlarge')
3032+
instance_prefix = instance_type.split(".")[0]
3033+
# Only collect capacity blocks for ultraserver instances
3034+
if reservation_type == "capacity-block" and instance_prefix in ULTRASERVER_INSTANCE_PREFIX_LIST:
3035+
self.__ultraserver_capacity_block_dict.get(instance_prefix).append(
3036+
cr_target.capacity_reservation_id
3037+
)
3038+
return self.__ultraserver_capacity_block_dict
3039+
29893040
def _register_login_node_validators(self):
29903041
"""Register all login node validators to ensure that the resource parameters are valid."""
29913042
# Check if all subnets(head node, Login nodes, compute nodes) are in the same VPC and support DNS.
@@ -3223,6 +3274,13 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
32233274
num_of_instances=num_of_instances,
32243275
)
32253276

3277+
for ultraserver_instance_prefix in ULTRASERVER_INSTANCE_PREFIX_LIST:
3278+
if self.ultraserver_capacity_block_dict.get(ultraserver_instance_prefix):
3279+
self._register_validator(
3280+
CapacityBlockHealthStatusValidator,
3281+
capacity_reservation_ids=self.ultraserver_capacity_block_dict.get(ultraserver_instance_prefix),
3282+
)
3283+
32263284
@property
32273285
def image_dict(self):
32283286
"""Return image dict of queues, key is queue name, value is image id."""

cli/src/pcluster/constants.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,3 +340,12 @@ class Operation(Enum):
340340
# Tag key & expected revision (increment when policy widens)
341341
PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_REVISION = 1
342342
PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_BOOTSTRAP_TAG_KEY = "parallelcluster:build-image-cleanup-role-bootstrapped"
343+
344+
P6E_GB200 = "p6e-gb200"
345+
ULTRASERVER_INSTANCE_PREFIX_LIST = [P6E_GB200]
346+
# Dictionary mapping ultraserver instance prefixes to their allowed capacity block sizes
347+
ULTRASERVER_CAPACITY_BLOCK_ALLOWED_SIZE_DICT = {
348+
P6E_GB200: [9, 18], # Allowed sizes for p6e-gb200 ultraserver instances
349+
}
350+
# Capacity Block states that are considered inactive (cannot check health status)
351+
CAPACITY_BLOCK_INACTIVE_STATES = ["scheduled", "payment-pending", "assessing", "delayed"]

cli/src/pcluster/templates/cdk_builder_utils.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
from aws_cdk.aws_iam import ManagedPolicy, PermissionsBoundary
2424
from aws_cdk.core import Arn, ArnFormat, CfnDeletionPolicy, CfnTag, Construct, Fn, Stack
2525

26+
from pcluster.api.errors import BadRequestException
27+
from pcluster.aws.aws_api import AWSApi
2628
from pcluster.config.cluster_config import (
2729
BaseClusterConfig,
2830
BaseComputeResource,
@@ -42,6 +44,8 @@
4244
PCLUSTER_CLUSTER_NAME_TAG,
4345
PCLUSTER_DYNAMODB_PREFIX,
4446
PCLUSTER_NODE_TYPE_TAG,
47+
ULTRASERVER_CAPACITY_BLOCK_ALLOWED_SIZE_DICT,
48+
ULTRASERVER_INSTANCE_PREFIX_LIST,
4549
)
4650
from pcluster.launch_template_utils import _LaunchTemplateBuilder
4751
from pcluster.models.s3_bucket import S3Bucket, parse_bucket_url
@@ -369,6 +373,68 @@ def generate_launch_template_version_cfn_parameter_hash(queue, compute_resource)
369373
return sha1((queue + compute_resource).encode()).hexdigest()[0:16].capitalize() # nosec nosemgrep
370374

371375

376+
def process_ultraserver_capacity_block_sizes(cluster_ultraserver_capacity_block_dict):
377+
"""
378+
Process ultraserver capacity block sizes and validate them.
379+
380+
Returns:
381+
Dictionary mapping ultraserver instance prefixes to comma-separated size strings
382+
Raises:
383+
BadRequestException: If any capacity block sizes are invalid
384+
"""
385+
cluster_ultraserver_capacity_block_sizes_dict = {}
386+
invalid_capacity_blocks = []
387+
388+
for ultraserver_instance_prefix in ULTRASERVER_INSTANCE_PREFIX_LIST:
389+
cluster_ultraserver_capacity_block_sizes_dict[ultraserver_instance_prefix] = []
390+
allowed_sizes_list = ULTRASERVER_CAPACITY_BLOCK_ALLOWED_SIZE_DICT.get(ultraserver_instance_prefix)
391+
392+
capacity_reservation_ids = cluster_ultraserver_capacity_block_dict.get(ultraserver_instance_prefix)
393+
if capacity_reservation_ids:
394+
statuses = AWSApi.instance().ec2.describe_capacity_block_status(capacity_reservation_ids)
395+
396+
for status in statuses:
397+
size = status.get("TotalCapacity")
398+
if size is not None:
399+
if size not in allowed_sizes_list:
400+
invalid_capacity_blocks.append(
401+
f"{status.get('CapacityBlockId')} (size: {size}, allowed: {allowed_sizes_list})"
402+
)
403+
else:
404+
cluster_ultraserver_capacity_block_sizes_dict.get(ultraserver_instance_prefix).append(size)
405+
406+
unique_sizes = sorted(set(cluster_ultraserver_capacity_block_sizes_dict.get(ultraserver_instance_prefix)))
407+
408+
cluster_ultraserver_capacity_block_sizes_dict[ultraserver_instance_prefix] = ", ".join(
409+
str(unique_size) for unique_size in unique_sizes
410+
)
411+
412+
# Raise exception with all invalid capacity blocks if any found
413+
if invalid_capacity_blocks:
414+
raise BadRequestException(
415+
f"The following capacity blocks have invalid block sizes: {'; '.join(invalid_capacity_blocks)}."
416+
)
417+
418+
return cluster_ultraserver_capacity_block_sizes_dict
419+
420+
421+
def has_ultraserver_instance(cr_target):
422+
"""Check if the compute resource uses ultraserver instances with capacity blocks."""
423+
_has_ultraserver_instance = False
424+
if cr_target and cr_target.capacity_reservation_id:
425+
(
426+
instance_type,
427+
reservation_type,
428+
) = AWSApi.instance().ec2.get_instance_type_and_reservation_type_from_capacity_reservation(
429+
cr_target.capacity_reservation_id
430+
)
431+
instance_prefix = instance_type.split(".")[0]
432+
if reservation_type == "capacity-block" and instance_prefix in ULTRASERVER_INSTANCE_PREFIX_LIST:
433+
_has_ultraserver_instance = True
434+
435+
return _has_ultraserver_instance
436+
437+
372438
class NodeIamResourcesBase(Construct):
373439
"""Abstract construct defining IAM resources for a cluster node."""
374440

cli/src/pcluster/templates/cluster_stack.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,10 @@
7171
NFS_PORT,
7272
NODE_BOOTSTRAP_TIMEOUT,
7373
OS_MAPPING,
74+
P6E_GB200,
7475
PCLUSTER_DYNAMODB_PREFIX,
7576
PCLUSTER_S3_ARTIFACTS_DICT,
77+
SLURM,
7678
SLURM_PORTS_RANGE,
7779
)
7880
from pcluster.models.s3_bucket import S3Bucket
@@ -98,6 +100,7 @@
98100
get_slurm_specific_dna_json_for_head_node,
99101
get_source_ingress_rule,
100102
get_user_data_content,
103+
process_ultraserver_capacity_block_sizes,
101104
to_comma_separated_string,
102105
)
103106
from pcluster.templates.compute_fleet_stack import ComputeFleetConstruct
@@ -1265,6 +1268,16 @@ def _add_head_node(self):
12651268
head_node_launch_template.add_metadata("Comment", "AWS ParallelCluster Head Node")
12661269
# CloudFormation::Init metadata
12671270

1271+
# Process ultraserver capacity block information for DNA JSON
1272+
# This section collects capacity block sizes for ultraserver instances (e.g., p6e-gb200)
1273+
# and validates that they conform to allowed size configurations for Slurm topology
1274+
cluster_ultraserver_capacity_block_sizes_dict = {}
1275+
if self.config.scheduling.scheduler == SLURM:
1276+
cluster_ultraserver_capacity_block_dict = self.config.ultraserver_capacity_block_dict
1277+
cluster_ultraserver_capacity_block_sizes_dict = process_ultraserver_capacity_block_sizes(
1278+
cluster_ultraserver_capacity_block_dict
1279+
)
1280+
12681281
dna_json = json.dumps(
12691282
{
12701283
"cluster": {
@@ -1358,6 +1371,12 @@ def _add_head_node(self):
13581371
else "false"
13591372
),
13601373
"launch_template_id": launch_template_id,
1374+
**(
1375+
{"p6egb200_block_sizes": cluster_ultraserver_capacity_block_sizes_dict[P6E_GB200]}
1376+
if P6E_GB200 in cluster_ultraserver_capacity_block_sizes_dict
1377+
and cluster_ultraserver_capacity_block_sizes_dict[P6E_GB200]
1378+
else {}
1379+
),
13611380
**(
13621381
get_slurm_specific_dna_json_for_head_node(self.config, self.scheduler_resources)
13631382
if self._condition_is_slurm()

0 commit comments

Comments
 (0)