Skip to content

Commit 7ffb589

Browse files
himani2411Himani Anil Deshpande
andauthored
[SlurmTopo] Add support for slurm Block Topology (#3002)
* [SlurmTopo] Add a topology generator script * [SlurmTopo] Add a topology plugin if topology_block_size is not null * [SlurmTopo] Invoke pcluster_topology_generator.py during creation and update * Updating to use correct parameter name of block_sizes * [SlurmTopo] Update topology generator script and add unit tests * [SlurmTopo] Skip Topology generation for ALinux2 AMI * [SlurmTopo] Using p6egb200_block_sizes instead of topology_block_size as per CLI changes * [SlurmTopo] Adding a slurm_parallelcluster_topology.conf in slurm.conf * [SlurmTopo] TESTING TO BE REMOVED * [SlurmTopo] Adding a slurm_parallelcluster_topology.conf in slurm.conf * [SlurmTopo] Do not support Slurm Topology for AL2 * [SlurmTopo] We cleanup or generate Topology only if p6egb200_block_size exists * [SlurmTopo] Do not support Slurm Topology for AL2 * [SlurmTopo] Do not support Slurm Topology for AL2 * [SlurmTopo] Add resource for Block topology * [SlurmTopo] Updated unit tests for not generating a file if block_size is empty * [SlurmTopo] Update to use a generic variable `is_block_topology_plugin_supported` * [SlurmTopo] Removing commented out section which was for testing --------- Co-authored-by: Himani Anil Deshpande <[email protected]>
1 parent 9f091d2 commit 7ffb589

File tree

16 files changed

+864
-0
lines changed

16 files changed

+864
-0
lines changed
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License").
4+
# You may not use this file except in compliance with the License.
5+
# A copy of the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "LICENSE.txt" file accompanying this file.
10+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
11+
# See the License for the specific language governing permissions and limitations under the License.
12+
13+
# FIXME: Fix Code Duplication
14+
# pylint: disable=R0801
15+
16+
import argparse
17+
import logging
18+
import os
19+
import traceback
20+
21+
import yaml
22+
23+
# Root logger; its level and format are configured by logging.basicConfig() in main().
log = logging.getLogger()


# Maps the CapacityType values read from the cluster configuration to the lower-case,
# hyphenated names this script compares against and logs.
CAPACITY_TYPE_MAP = {
    "ONDEMAND": "on-demand",
    "SPOT": "spot",
    "CAPACITY_BLOCK": "capacity-block",
}
# First line written to every generated topology.conf.
CONFIG_HEADER = "# This file is automatically generated by pcluster\n"
32+
33+
34+
class CriticalError(Exception):
    """Raised when the script hits an error it cannot recover from."""
38+
39+
40+
class ConfigurationFieldNotFoundError(Exception):
    """Raised when an expected field is missing from the configuration."""
44+
45+
46+
def _load_cluster_config(input_file_path):
    """Read the YAML cluster configuration at input_file_path and return the parsed content."""
    with open(input_file_path, encoding="utf-8") as config_stream:
        # safe_load is the shorthand for yaml.load(..., Loader=yaml.SafeLoader)
        cluster_config = yaml.safe_load(config_stream)
    return cluster_config
50+
51+
52+
def generate_topology_config_file(output_file: str, input_file: str, block_sizes: str):  # noqa: C901
    """
    Generate the Slurm block topology configuration file.

    One BlockName line is emitted for every static (MinCount == MaxCount) p6e-gb200.36xlarge
    compute resource that belongs to a CAPACITY_BLOCK queue and whose node count equals the
    smallest or the largest of the requested block sizes. Example of a generated topology.conf:

    # This file is automatically generated by pcluster

    BlockName=Block1 Nodes=queue-1-st-compute-resource-0-[1-9]
    BlockName=Block2 Nodes=queue-2-st-compute-resource-0-[1-18]
    BlockSizes=9,18

    :param output_file: path of the topology.conf file to write
    :param input_file: path of the YAML cluster configuration file
    :param block_sizes: comma separated block sizes, e.g. "9,18"; when empty or None no block is generated
    :raises CriticalError: if a required key is missing from the configuration or the configuration is malformed
    :raises ValueError: if block_sizes contains a value that is not an integer
    """
    # Parse the sizes only once. The bounds stay empty when no block sizes are given, so the
    # membership test below never matches instead of failing on unbound local variables.
    block_size_bounds = ()
    if block_sizes:
        parsed_block_sizes = [int(size) for size in block_sizes.split(",")]
        block_size_bounds = (min(parsed_block_sizes), max(parsed_block_sizes))

    cluster_config = _load_cluster_config(input_file)
    queue_name, compute_resource_name = None, None
    try:
        topology_config = CONFIG_HEADER + "\n"
        block_count = 0
        for queue_config in cluster_config["Scheduling"]["SlurmQueues"]:
            queue_name = queue_config["Name"]

            # Only Capacity Block queues get a topology; queues without CapacityType count as on-demand
            queue_capacity_type = CAPACITY_TYPE_MAP.get(queue_config.get("CapacityType", "ONDEMAND"))
            if queue_capacity_type != CAPACITY_TYPE_MAP.get("CAPACITY_BLOCK"):
                log.info("ParallelCluster does not create topology for %s", queue_capacity_type)
                continue

            for compute_resource_config in queue_config["ComputeResources"]:
                compute_resource_name = compute_resource_config["Name"]
                compute_min_count = compute_resource_config["MinCount"]
                compute_max_count = compute_resource_config["MaxCount"]
                # Only static compute resources (MinCount == MaxCount) are part of the topology
                if compute_min_count != compute_max_count:
                    continue
                node_type = "st"

                # Check if the reservation is for NVLink and its size matches the smallest or largest block size
                if compute_resource_config.get("InstanceType") != "p6e-gb200.36xlarge":
                    continue
                # MinCount equals MaxCount here, so one membership test covers both bounds
                if compute_max_count not in block_size_bounds:
                    continue

                block_count += 1
                # Each Capacity Reservation ID is a Capacity Block,
                # we associate each slurm block with a single capacity Block
                topology_config += (
                    f"BlockName=Block{block_count}"
                    f" Nodes={queue_name}-{node_type}-{compute_resource_name}-[1-{compute_max_count}]\n"
                )

        # NOTE(review): with empty/None block_sizes this still writes "BlockSizes=" / "BlockSizes=None",
        # exactly as before; confirm whether the file should be written at all in that case.
        topology_config += "BlockSizes=" + str(block_sizes) + "\n"
    except (KeyError, AttributeError) as e:
        if isinstance(e, KeyError):
            message = f"Unable to find key {e} in the configuration file."
        else:
            message = f"Error parsing configuration file. {e}. {traceback.format_exc()}."
        message += f" Queue: {queue_name}" if queue_name else ""
        log.error(message)
        # Chain the original exception so the root cause stays visible in the traceback
        raise CriticalError(message) from e

    log.info("Writing Info %s", topology_config)
    log.info("Generating %s", output_file)
    with open(output_file, "w", encoding="utf-8") as output:
        output.write(topology_config)

    log.info("Finished.")
126+
127+
128+
def cleanup_topology_config_file(file_path):
    """Delete the topology.conf at file_path if it exists; failures are logged as warnings, not raised."""
    try:
        if not os.path.exists(file_path):
            return
        log.info("Cleaning up %s", file_path)
        os.remove(file_path)
    except Exception as error:
        log.warning("Unable to delete %s due to %s", file_path, error)
136+
137+
138+
def main():
    """Parse the command line, then either generate or clean up the topology.conf file."""
    try:
        logging.basicConfig(
            format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s", level=logging.INFO
        )
        log.info("Running ParallelCluster Topology Config Generator")

        parser = argparse.ArgumentParser(description="Take in Topology configuration generator related parameters")
        # Exactly one of --block-sizes / --cleanup must be supplied
        mode_group = parser.add_mutually_exclusive_group(required=True)
        parser.add_argument("--output-file", required=True, help="The output file for generated topology.conf")
        parser.add_argument(
            "--input-file",
            required=True,
            help="Yaml file containing pcluster CLI configuration file with default values",
        )
        mode_group.add_argument("--block-sizes", help="Block Sizes of topology.conf")
        mode_group.add_argument("--cleanup", action="store_true", help="Cleanup topology.conf")
        options = parser.parse_args()

        if not options.cleanup:
            generate_topology_config_file(options.output_file, options.input_file, options.block_sizes)
        else:
            cleanup_topology_config_file(options.output_file)
        log.info("Completed Execution of ParallelCluster Topology Config Generator")
    except Exception as err:
        # Log with traceback, then let the failure propagate to the caller
        log.exception("Failed to generate Topology.conf, exception: %s", err)
        raise
167+
168+
169+
if __name__ == "__main__":
170+
main()

cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
owner 'root'
4646
group 'root'
4747
mode '0644'
48+
variables(is_block_topology_plugin_supported: platform?('amazon') && node['platform_version'] == "2")
4849
end
4950

5051
template "#{node['cluster']['slurm']['install_dir']}/etc/gres.conf" do
@@ -54,6 +55,10 @@
5455
mode '0644'
5556
end
5657

58+
block_topology 'Add Block Topology configuration' do
59+
action :configure
60+
end
61+
5762
unless on_docker?
5863
# Generate pcluster specific configs
5964
no_gpu = nvidia_installed? ? "" : "--no-gpu"

cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,10 @@ def update_nodes_in_queue(strategy, queues)
151151
end
152152
end
153153

154+
block_topology 'Update or Cleanup Slurm Topology' do
155+
action :update
156+
end
157+
154158
execute "generate_pcluster_slurm_configs" do
155159
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py" \
156160
" --output-directory #{node['cluster']['slurm']['install_dir']}/etc/" \
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :block_topology, platform: 'amazon' do |node|
16+
node['platform_version'].to_i == 2023
17+
end
18+
19+
use 'partial/_block_topology_common.rb'
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :block_topology, platform: 'amazon', platform_version: '2'
16+
17+
use 'partial/_block_topology_common.rb'
18+
19+
# Override for Amazon Linux 2: always reports block topology as unsupported.
def is_block_topology_supported?
  # We do not support Block Topology with Alinux2 as we do not support Gb200 with this OS
  false
end
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :block_topology, platform: 'redhat' do |node|
16+
node['platform_version'].to_i >= 8
17+
end
18+
19+
use 'partial/_block_topology_common.rb'
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :block_topology, platform: 'rocky' do |node|
16+
node['platform_version'].to_i >= 8
17+
end
18+
19+
use 'partial/_block_topology_common.rb'
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :block_topology, platform: 'ubuntu' do |node|
16+
node['platform_version'].to_i >= 22
17+
end
18+
19+
use 'partial/_block_topology_common.rb'
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# frozen_string_literal: true
2+
#
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
unified_mode true
16+
default_action :configure
17+
18+
# Create slurm_parallelcluster_topology.conf from its template and, when p6e-gb200 block sizes
# are configured, run pcluster_topology_generator.py to generate topology.conf.
action :configure do
  # Nothing is done on platforms that report block topology as unsupported
  return unless is_block_topology_supported?
  # Use slurm_parallelcluster_topology to add Block Topology plugin
  template "#{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster_topology.conf" do
    source 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb'
    owner 'root'
    group 'root'
    mode '0644'
  end
  # Generate Slurm topology.conf file
  execute "generate_topology_config" do
    command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
            " --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
            " --block-sizes #{node['cluster']['p6egb200_block_sizes']}"\
            " --input-file #{node['cluster']['cluster_config_path']}"
    # Skip the generator when no block sizes are configured
    not_if { node['cluster']['p6egb200_block_sizes'].nil? }
  end
end
36+
37+
# Re-create slurm_parallelcluster_topology.conf from its template and run
# pcluster_topology_generator.py with the mode returned by topology_generator_command_args
# (generate with --block-sizes, or --cleanup).
action :update do
  # Nothing is done on platforms that report block topology as unsupported
  return unless is_block_topology_supported?
  # Update slurm_parallelcluster_topology to add/remove Block Topology plugin
  template "#{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster_topology.conf" do
    source 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb'
    owner 'root'
    group 'root'
    mode '0644'
  end
  # Update Slurm topology.conf file
  execute "update or cleanup topology.conf" do
    command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
            " --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
            " --input-file #{node['cluster']['cluster_config_path']}"\
            "#{topology_generator_command_args}"
    # Skipped when a previous cluster config exists and there is no mode argument to pass.
    # NOTE(review): if the previous cluster config file is missing and the args are nil, the
    # generator is invoked without --block-sizes/--cleanup - confirm this case cannot occur.
    not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && topology_generator_command_args.nil? }
  end
end
55+
56+
# Default answer used by the :configure and :update actions: block topology is supported.
# The Amazon Linux 2 resource redefines this method to return false.
def is_block_topology_supported?
  true
end
59+
60+
# Build the mode argument appended to the pcluster_topology_generator.py command line.
#
# Returns:
#   " --block-sizes <sizes>" when p6e-gb200 block sizes are configured,
#   " --cleanup" when they are not configured, the queues were updated and topology.conf exists,
#   nil when there is nothing to generate or clean up.
def topology_generator_command_args
  block_sizes = node['cluster']['p6egb200_block_sizes']
  # Block sizes are configured: (re)generate topology.conf
  return " --block-sizes #{block_sizes}" unless block_sizes.nil?

  topology_conf = "#{node['cluster']['slurm']['install_dir']}/etc/topology.conf"
  # If topology.conf exist and Capacity Block is removed, we cleanup
  return " --cleanup" if are_queues_updated? && ::File.exist?(topology_conf)

  # We do nothing if p6e-gb200 is not used. This also covers updated queues without an existing
  # topology.conf, which used to fall through to " --block-sizes " with an empty value that the
  # generator's argument parser rejects.
  nil
end

0 commit comments

Comments
 (0)