Skip to content

Commit 56b6b31

Browse files
authored
Enable ena-express (#298)
Detect that instances can support ENA-Express and enable it so that they have more networking bandwidth. Note that the AMI must have the correct drivers installed. Resolves #308
1 parent cdf9073 commit 56b6b31

File tree

9 files changed

+87
-17
lines changed

9 files changed

+87
-17
lines changed

.gitallowed

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
key = 'ParallelClusterEnableEnaExpressPolicyArn'

docs/config.md

+13-2
Original file line numberDiff line numberDiff line change
@@ -108,12 +108,16 @@ This project creates a ParallelCluster configuration file that is documented in
108108
useOnDemand: bool
109109
UseSpot: bool
110110
DisableSimultaneousMultithreading: bool
111+
EnableEfa: bool
112+
PlacementGroupName: str
111113
<a href="#include-instancetypes">InstanceTypes</a>:
112114
- str
113115
- str:
114116
UseOnDemand: bool
115117
UseSpot: bool
116118
DisableSimultaneousMultithreading: bool
119+
EnableEfa: bool
120+
PlacementGroupName: str
117121
<a href="#nodecounts">NodeCounts</a>:
118122
<a href="#defaultmincount">DefaultMinCount</a>: str
119123
<a href="#defaultmaxcount">DefaultMaxCount</a>: str
@@ -373,7 +377,14 @@ type: bool
373377

374378
default: False
375379

376-
Recommend to not use EFA unless necessary to avoid insufficient capacity errors when starting new instances in group or when multiple instance types in the group.
380+
This will enable EFA for all compute resources with instances that support EFA.
381+
382+
This can also be controlled for individual instance types in the InstanceConfig section.
383+
384+
If EFA is enabled without specifying a placement group name, then each compute resource is assigned its own managed placement group.
385+
386+
NOTE: Most EDA workloads cannot take advantage of EFA because they don't use MPI or NCCL.
387+
I recommend not using EFA unless necessary, to avoid insufficient capacity errors when starting new instances in the group or when multiple instance types are in the group.
377388

378389
See [https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html#placement-groups-cluster](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html#placement-groups-cluster)
379390

@@ -827,7 +838,7 @@ Exclude patterns are processed first and take precedence over any includes.
827838
Instance families and types are regular expressions with implicit '^' and '$' at the beginning and end.
828839

829840
Each element in the array can be either a regular expression string or a dictionary where the only key
830-
is the regular expression string and that has overrides **UseOnDemand**, **UseSpot**, and **DisableSimultaneousMultithreading** for the matching instance families or instance types.
841+
is the regular expression string and whose value contains overrides of **UseOnDemand**, **UseSpot**, **DisableSimultaneousMultithreading**, **EnableEfa**, and **PlacementGroupName** for the matching instance families or instance types.
831842

832843
The settings for instance families override the defaults, and the settings for instance types override the others.
833844

source/EC2InstanceTypeInfoPkg/EC2InstanceTypeInfo.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -119,14 +119,24 @@ def __init__(self, regions, get_savings_plans=True, json_filename=None, debug=Fa
119119
# Endpoints only supported in 2 regions: https://docs.aws.amazon.com/cli/latest/reference/pricing/index.html
120120
self.pricing_client = boto3.client('pricing', region_name='us-east-1')
121121

122+
# Check region names first to make sure opt-in regions are enabled
123+
self.region_names = {}
124+
missing_region_names = False
122125
for region in sorted(self.regions):
123-
if region in self.instance_type_and_family_info and json_filename:
124-
logger.info(f'Using EC2 instance info from {json_filename} for {region}')
125-
continue
126126
region_name = self.get_region_name(region)
127127
if not region_name:
128128
logger.error(f"Could not find region name for {region}. Is this a new region or does it need to be enabled for your account?")
129+
missing_region_names = True
130+
continue
131+
self.region_names[region] = region_name
132+
if missing_region_names:
133+
exit(1)
134+
135+
for region in sorted(self.regions):
136+
if region in self.instance_type_and_family_info and json_filename:
137+
logger.info(f'Using EC2 instance info from {json_filename} for {region}')
129138
continue
139+
region_name = self.region_names[region]
130140
logger.info(f'Getting EC2 instance info for {region} ({region_name})')
131141
assert(self.valid_credentials)
132142
self.ec2_client = boto3.client('ec2', region_name=region)
@@ -187,6 +197,7 @@ def get_instance_type_and_family_info(self, region):
187197
instance_type_info[instanceType]['Hypervisor'] = instanceTypeDict.get('Hypervisor', '')
188198
instance_type_info[instanceType]['NetworkPerformance'] = instanceTypeDict['NetworkInfo']['NetworkPerformance']
189199
instance_type_info[instanceType]['EfaSupported'] = instanceTypeDict['NetworkInfo']['EfaSupported']
200+
instance_type_info[instanceType]['EnaSrdSupported'] = instanceTypeDict['NetworkInfo']['EnaSrdSupported']
190201
if 'GpuInfo' in instanceTypeDict and 'Gpus' in instanceTypeDict['GpuInfo']:
191202
instance_type_info[instanceType]['GpuCount'] = int(instanceTypeDict['GpuInfo']['Gpus'][0].get('Count', 0))
192203
instance_type_info[instanceType]['GpuManufacturer'] = instanceTypeDict['GpuInfo']['Gpus'][0].get('Manufacturer', "")
@@ -528,6 +539,7 @@ def get_region_name(self, region_code):
528539
with open(endpoint_file, 'r') as f:
529540
data = json.load(f)
530541
missing_region_names = {
542+
'ap-southeast-5': {'description': 'Asia Pacific (Malaysia)'},
531543
'ca-west-1': {'description': 'Canada (Calgary)'}
532544
}
533545
for missing_region in missing_region_names:

source/EC2InstanceTypeInfoPkg/get_ec2_instance_info.py

+8
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,24 @@
55
from EC2InstanceTypeInfoPkg.EC2InstanceTypeInfo import EC2InstanceTypeInfo
66
import logging
77
from sys import exit
8+
from VersionCheck import logger as VersionCheck_logger, VersionCheck
89

910
if __name__ == '__main__':
1011
try:
1112
parser = argparse.ArgumentParser(description="Get EC2 instance pricing info.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
1213
parser.add_argument("--region", "-r", type=str, default=[], action='append', help="AWS region(s) to get info for.")
1314
parser.add_argument("--input", '-i', type=str, default=None, help="JSON input file. Reads existing info from previous runs. Can speed up rerun if it failed to collect the data for a region.")
1415
parser.add_argument("--output-csv", '-o', type=str, default=None, help="CSV output file. Default: instance_type_info.csv")
16+
parser.add_argument("--disable-version-check", action='store_const', const=True, default=False, help="Disable git version check")
1517
parser.add_argument("--debug", "-d", action='store_const', const=True, default=False, help="Enable debug messages")
1618
args = parser.parse_args()
1719

20+
if args.debug:
21+
VersionCheck_logger.setLevel(logging.DEBUG)
22+
23+
if not args.disable_version_check and not VersionCheck().check_git_version():
24+
exit(1)
25+
1826
if args.input:
1927
print(f"Reading existing instance info from {args.input}")
2028
ec2InstanceTypeInfo = EC2InstanceTypeInfo(args.region, json_filename=args.input, debug=args.debug)

source/EC2InstanceTypeInfoPkg/retry_boto3_throttling.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,11 @@ def f_retry(*args, **kwargs):
6363
attempt += 1
6464
return f(*args, **kwargs)
6565
except ClientError as e:
66-
logging.exception("Caught exception")
66+
logging.debug("Caught exception")
6767
if e.response['Error']['Code'] in ['RequestLimitExceeded', 'InternalError', 'ThrottlingException']:
6868
pass
6969
else:
70-
logging.exception("Rethrew exception")
70+
logging.debug("Rethrew exception")
7171
raise e
7272
logger.debug("%s" % (traceback.format_exc()))
7373
logger.debug("attempt=%d" % attempt)

source/SlurmPlugin.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1964,7 +1964,9 @@ def get_instance_types_from_instance_config(self, instance_config: dict, regions
19641964
default_instance_type_config = {
19651965
'UseOnDemand': instance_config['UseOnDemand'],
19661966
'UseSpot': instance_config['UseSpot'],
1967-
'DisableSimultaneousMultithreading': instance_config['DisableSimultaneousMultithreading']
1967+
'DisableSimultaneousMultithreading': instance_config['DisableSimultaneousMultithreading'],
1968+
'EnableEfa': instance_config['EnableEfa'],
1969+
'PlacementGroupName': instance_config.get('PlacementGroupName', None)
19681970
}
19691971

19701972
instance_types = {}
@@ -2069,6 +2071,8 @@ def get_instance_types_from_instance_config(self, instance_config: dict, regions
20692071
instance_type_config['UseOnDemand'] = instance_type_config.get('UseOnDemand', instance_family_config.get('UseOnDemand', default_instance_type_config['UseOnDemand']))
20702072
instance_type_config['UseSpot'] = instance_type_config.get('UseSpot', instance_family_config.get('UseSpot', default_instance_type_config['UseSpot']))
20712073
instance_type_config['DisableSimultaneousMultithreading'] = instance_type_config.get('DisableSimultaneousMultithreading', instance_family_config.get('DisableSimultaneousMultithreading', default_instance_type_config['DisableSimultaneousMultithreading']))
2074+
instance_type_config['EnableEfa'] = instance_type_config.get('EnableEfa', instance_family_config.get('EnableEfa', default_instance_type_config['EnableEfa']))
2075+
instance_type_config['PlacementGroupName'] = instance_type_config.get('PlacementGroupName', instance_family_config.get('PlacementGroupName', default_instance_type_config['PlacementGroupName']))
20722076

20732077
region_instance_types[instance_type] = instance_type_config
20742078

source/cdk/cdk_slurm_stack.py

+26-4
Original file line numberDiff line numberDiff line change
@@ -1081,6 +1081,20 @@ def create_parallel_cluster_assets(self):
10811081
# If use managed_policy_name, then get the following cfn_nag warning.
10821082
# W28: Resource found with an explicit name, this disallows updates that require replacement of this resource
10831083

1084+
self.parallel_cluster_enable_ena_express_policy = iam.ManagedPolicy(
1085+
self, "ParallelClusterEnableEnaExpressPolicy",
1086+
path = '/parallelcluster/',
1087+
statements = [
1088+
iam.PolicyStatement(
1089+
effect=iam.Effect.ALLOW,
1090+
actions=[
1091+
'ec2:ModifyNetworkInterfaceAttribute',
1092+
],
1093+
resources=['*']
1094+
)
1095+
]
1096+
)
1097+
10841098
self.create_munge_key_secret()
10851099

10861100
self.playbooks_asset = s3_assets.Asset(self, 'Playbooks',
@@ -2787,7 +2801,7 @@ def create_parallel_cluster_config(self):
27872801
if not instance_type_config['UseSpot']:
27882802
continue
27892803
logger.debug(f"Creating queue for {purchase_option} {instance_type}")
2790-
efa_supported = self.plugin.get_EfaSupported(self.cluster_region, instance_type) and self.config['slurm']['ParallelClusterConfig']['EnableEfa']
2804+
efa_enabled = self.plugin.get_EfaSupported(self.cluster_region, instance_type) and instance_type_config['EnableEfa']
27912805
mem_mb = self.plugin.get_MemoryInMiB(self.cluster_region, instance_type)
27922806
mem_gb = int(mem_mb / 1024)
27932807
core_count = int(self.plugin.get_CoreCount(self.cluster_region, instance_type))
@@ -2855,18 +2869,21 @@ def create_parallel_cluster_config(self):
28552869
'MaxCount': max_count,
28562870
'DisableSimultaneousMultithreading': instance_type_config['DisableSimultaneousMultithreading'],
28572871
'Instances': [],
2858-
'Efa': {'Enabled': efa_supported},
2872+
'Efa': {'Enabled': efa_enabled},
28592873
'Networking': {
28602874
'PlacementGroup': {
2861-
'Enabled': efa_supported
2875+
'Enabled': efa_enabled
28622876
}
28632877
}
28642878
}
2879+
if efa_enabled and instance_type_config['PlacementGroupName']:
2880+
compute_resource['Networking']['PlacementGroup']['Name'] = instance_type_config['PlacementGroupName']
28652881
compute_resource['Instances'].append(
28662882
{
28672883
'InstanceType': instance_type
28682884
}
28692885
)
2886+
28702887
if config_schema.PARALLEL_CLUSTER_SUPPORTS_NODE_WEIGHTS(self.PARALLEL_CLUSTER_VERSION):
28712888
compute_resource['StaticNodePriority'] = int(price * 1000)
28722889
compute_resource['DynamicNodePriority'] = int(price * 10000)
@@ -3047,6 +3064,10 @@ def create_parallel_cluster_config(self):
30473064
key = 'ParallelClusterAssetReadPolicyArn',
30483065
value = self.parallel_cluster_asset_read_policy.managed_policy_arn
30493066
)
3067+
self.create_parallel_cluster_config_lambda.add_environment(
3068+
key = 'ParallelClusterEnableEnaExpressPolicyArn',
3069+
value = self.parallel_cluster_enable_ena_express_policy.managed_policy_arn
3070+
)
30503071
self.create_parallel_cluster_config_lambda.add_environment(
30513072
key = 'ParallelClusterJwtWritePolicyArn',
30523073
value = self.parallel_cluster_jwt_write_policy.managed_policy_arn
@@ -3188,7 +3209,8 @@ def create_queue_config(self, queue_name, allocation_strategy, purchase_option):
31883209
'AdditionalIamPolicies': [
31893210
{'Policy': 'arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore'},
31903211
{'Policy': '{{ParallelClusterAssetReadPolicyArn}}'},
3191-
{'Policy': '{{ParallelClusterSnsPublishPolicyArn}}'}
3212+
{'Policy': '{{ParallelClusterSnsPublishPolicyArn}}'},
3213+
{'Policy': '{{ParallelClusterEnableEnaExpressPolicyArn}}'}
31923214
]
31933215
},
31943216
'Networking': {

source/cdk/config_schema.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -1530,9 +1530,6 @@ def get_config_schema(config):
15301530
},
15311531
Optional('Architecture', default=DEFAULT_ARCHITECTURE): And(str, lambda s: s in VALID_ARCHITECTURES),
15321532
Optional('ComputeNodeAmi'): And(str, lambda s: s.startswith('ami-')),
1533-
# Recommend to not use EFA unless necessary to avoid insufficient capacity errors when starting new instances in group or when multiple instance types in the group
1534-
# See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html#placement-groups-cluster
1535-
Optional('EnableEfa', default=False): bool,
15361533
Optional('Database'): {
15371534
Optional('DatabaseStackName'): str,
15381535
Optional('FQDN'): str,
@@ -1641,6 +1638,11 @@ def get_config_schema(config):
16411638
# Configure spot instances
16421639
Optional('UseSpot', default=True): bool,
16431640
Optional('DisableSimultaneousMultithreading', default=True): bool,
1641+
# This is a global setting that can be overridden for instance types in InstanceConfig.
1642+
# Recommend not using EFA unless necessary, to avoid insufficient capacity errors when starting new instances in the group or when multiple instance types are in the group
1643+
# See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html#placement-groups-cluster
1644+
Optional('EnableEfa', default=False): bool,
1645+
Optional('PlacementGroupName'): str,
16441646
Optional('CpuVendor', default=cpu_vendors): [
16451647
And(str, lambda s: s in cpu_vendors)
16461648
],
@@ -1665,7 +1667,9 @@ def get_config_schema(config):
16651667
str: {
16661668
Optional('UseOnDemand'): bool,
16671669
Optional('UseSpot'): bool,
1668-
Optional('DisableSimultaneousMultithreading'): bool
1670+
Optional('DisableSimultaneousMultithreading'): bool,
1671+
Optional('EnableEfa'): bool,
1672+
Optional('PlacementGroupName'): str
16691673
}
16701674
},
16711675
lambda d: len(d) == 1
@@ -1680,7 +1684,9 @@ def get_config_schema(config):
16801684
str: {
16811685
Optional('UseOnDemand'): bool,
16821686
Optional('UseSpot'): bool,
1683-
Optional('DisableSimultaneousMultithreading'): bool
1687+
Optional('DisableSimultaneousMultithreading'): bool,
1688+
Optional('EnableEfa'): bool,
1689+
Optional('PlacementGroupName'): str
16841690
}
16851691
},
16861692
lambda d: len(d) == 1

source/resources/parallel-cluster/config/bin/on_compute_node_configured.sh

+6
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,12 @@ if [[ -e $config_dir/users_groups.json ]]; then
7171
$config_bin_dir/create_users_groups.py -i $config_dir/users_groups.json
7272
fi
7373

74+
# Enable ENA Express
75+
TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
76+
mac=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/network/interfaces/macs/)
77+
eni_id=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/network/interfaces/macs/${mac}interface-id/)
78+
aws ec2 modify-network-interface-attribute --network-interface-id ${eni_id} --ena-srd-specification 'EnaSrdEnabled=true,EnaSrdUdpSpecification={EnaSrdUdpEnabled=true}'
79+
7480
# ansible_compute_node_vars_yml_s3_url="s3://$assets_bucket/$assets_base_key/config/ansible/ansible_compute_node_vars.yml"
7581

7682
# # Configure using ansible

0 commit comments

Comments
 (0)