Skip to content

Commit abf9c73

Browse files
committed
Change network interface setup logic to account for gb200
1 parent e3caf6e commit abf9c73

File tree

1 file changed

+49
-27
lines changed

1 file changed

+49
-27
lines changed

cli/src/pcluster/templates/queues_stack.py

Lines changed: 49 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -150,33 +150,7 @@ def _add_compute_resource_launch_template(
150150
instance_profiles,
151151
is_detailed_monitoring_enabled,
152152
):
153-
# LT network interfaces
154-
compute_lt_nw_interfaces = [
155-
ec2.CfnLaunchTemplate.NetworkInterfaceProperty(
156-
device_index=0,
157-
network_card_index=0,
158-
associate_public_ip_address=queue.networking.assign_public_ip,
159-
interface_type="efa" if compute_resource.efa and compute_resource.efa.enabled else None,
160-
groups=queue_lt_security_groups,
161-
subnet_id=(
162-
queue.networking.subnet_ids[0] if isinstance(compute_resource, SlurmComputeResource) else None
163-
),
164-
)
165-
]
166-
167-
for network_card in compute_resource.network_cards_list[1:]:
168-
compute_lt_nw_interfaces.append(
169-
ec2.CfnLaunchTemplate.NetworkInterfaceProperty(
170-
device_index=0 if network_card.maximum_network_interfaces() == 1 else 1,
171-
network_card_index=network_card.network_card_index(),
172-
associate_public_ip_address=False,
173-
interface_type="efa" if compute_resource.efa and compute_resource.efa.enabled else None,
174-
groups=queue_lt_security_groups,
175-
subnet_id=(
176-
queue.networking.subnet_ids[0] if isinstance(compute_resource, SlurmComputeResource) else None
177-
),
178-
)
179-
)
153+
compute_lt_nw_interfaces = add_network_interfaces(queue, compute_resource, queue_lt_security_groups)
180154

181155
conditional_template_properties = {}
182156
if compute_resource.is_ebs_optimized:
@@ -385,3 +359,51 @@ def _add_compute_resource_launch_template(
385359
)
386360

387361
return launch_template
362+
363+
364+
def add_network_interfaces(
    queue,
    compute_resource,
    queue_lt_security_groups,
):
    """
    Generate the launch template network interfaces list for a compute resource.

    The first entry is always the primary interface (device index 0 on network card 0);
    one further entry is added for each remaining item of ``compute_resource.network_cards_list``,
    subject to the gb200 rules below.

    Interface type selection:
      * non-gb200: every interface is "efa" when EFA is enabled on the compute resource, otherwise unset.
      * gb200 (first instance type is "p6e-gb200.36xlarge"):
          - the primary interface is never configured as EFA;
          - with EFA disabled, odd network card indexes are skipped and even ones are left unset;
          - with EFA enabled, even network card indexes are "efa" and odd ones are "efa-only".

    :param queue: queue configuration; its networking settings provide the public IP flag and subnet ids
    :param compute_resource: compute resource configuration (instance types, EFA settings, network cards)
    :param queue_lt_security_groups: security groups attached to every generated interface
    :return: list of ec2.CfnLaunchTemplate.NetworkInterfaceProperty
    """
    # NOTE(review): only the first instance type of the compute resource is inspected.
    is_gb200 = compute_resource.instance_types[0] == "p6e-gb200.36xlarge"
    # These values do not depend on the network card, so they are computed once up front.
    efa_enabled = bool(compute_resource.efa and compute_resource.efa.enabled)
    subnet_id = queue.networking.subnet_ids[0] if isinstance(compute_resource, SlurmComputeResource) else None

    compute_lt_nw_interfaces = [
        ec2.CfnLaunchTemplate.NetworkInterfaceProperty(
            device_index=0,
            network_card_index=0,
            associate_public_ip_address=queue.networking.assign_public_ip,
            # On gb200 the primary interface is left unset even when EFA is enabled
            interface_type="efa" if efa_enabled and not is_gb200 else None,
            groups=queue_lt_security_groups,
            subnet_id=subnet_id,
        )
    ]

    for network_card in compute_resource.network_cards_list[1:]:
        card_index = network_card.network_card_index()
        even = card_index % 2 == 0

        # if efa is disabled, and we have a gb200 instance we skip configuring odd numbered indexes
        if is_gb200 and not efa_enabled and not even:
            continue

        if not efa_enabled:
            interface = None
        elif is_gb200:
            # if efa is enabled with a gb200 instance, even indexes are configured as efa and the odd as efa-only
            interface = "efa" if even else "efa-only"
        else:
            interface = "efa"

        compute_lt_nw_interfaces.append(
            ec2.CfnLaunchTemplate.NetworkInterfaceProperty(
                # Cards that support a single interface use device index 0, all the others use 1
                device_index=0 if network_card.maximum_network_interfaces() == 1 else 1,
                network_card_index=card_index,
                # Only the primary interface may be assigned a public IP
                associate_public_ip_address=False,
                interface_type=interface,
                groups=queue_lt_security_groups,
                subnet_id=subnet_id,
            )
        )

    return compute_lt_nw_interfaces

0 commit comments

Comments
 (0)