Skip to content

Commit 3a43f29

Browse files
author
Himani Anil Deshpande
committed
[SlurmTopo] Add resource for Block topology
1 parent 7fe4219 commit 3a43f29

File tree

13 files changed

+332
-50
lines changed

13 files changed

+332
-50
lines changed

cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,3 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested
242242
# Looks up a login node pool by name.
# Returns the first entry of config['LoginNodes']['Pools'] whose 'Name' equals pool_name,
# or nil when no pool matches.
def get_login_node_pool_config(config, pool_name)
243243
config['LoginNodes']['Pools'].select { |pool| pool['Name'] == pool_name }.first
244244
end
245-
246-
def is_amazon_linux_2?
247-
platform?('amazon') && node['platform_version'] == "2"
248-
end

cookbooks/aws-parallelcluster-slurm/libraries/update.rb

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -100,15 +100,3 @@ def is_login_nodes_removed?
100100
previous_config = YAML.safe_load(File.read(node['cluster']['previous_cluster_config_path']))
101101
previous_config.dig("LoginNodes") and !config.dig("LoginNodes")
102102
end
103-
104-
def topology_generator_command_args
105-
if node['cluster']['p6egb200_block_sizes'].nil? && are_queues_updated? && ::File.exist?("#{node['cluster']['slurm']['install_dir']}/etc/topology.conf")
106-
# If topology.conf exist and Capacity Block is removed, we cleanup
107-
" --cleanup"
108-
elsif node['cluster']['p6egb200_block_sizes'].nil? && !are_queues_updated?
109-
# We do nothing if p6e-gb200 is not used and queues are not updated
110-
nil
111-
else
112-
" --block-sizes #{node['cluster']['p6egb200_block_sizes']}"
113-
end
114-
end

cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
owner 'root'
4646
group 'root'
4747
mode '0644'
48-
variables( is_amazon_linux_2: is_amazon_linux_2? )
48+
variables(is_amazon_linux_2: platform?('amazon') && node['platform_version'] == "2")
4949
end
5050

5151
template "#{node['cluster']['slurm']['install_dir']}/etc/gres.conf" do
@@ -55,25 +55,11 @@
5555
mode '0644'
5656
end
5757

58-
# Use slurm_parallelcluster_topology to add Block Topology plugin
59-
template "#{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster_topology.conf" do
60-
source 'slurm/slurm_parallelcluster_topology.conf.erb'
61-
owner 'root'
62-
group 'root'
63-
mode '0644'
64-
not_if { is_amazon_linux_2? }
58+
# Block Topology configuration is handled by the block_topology custom resource
# through its :configure action.
block_topology 'Add Block Topology configuration' do
59+
action :configure
6560
end
6661

6762
unless on_docker?
68-
# Generate Slurm topology.conf file
69-
execute "generate_topology_config" do
70-
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
71-
" --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
72-
" --block-sizes #{node['cluster']['p6egb200_block_sizes']}"\
73-
" --input-file #{node['cluster']['cluster_config_path']}"
74-
not_if { node['cluster']['p6egb200_block_sizes'].nil? || (platform?('amazon') && node['platform_version'] == "2") }
75-
end
76-
7763
# Generate pcluster specific configs
7864
no_gpu = nvidia_installed? ? "" : "--no-gpu"
7965
execute "generate_pcluster_slurm_configs" do

cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -151,22 +151,8 @@ def update_nodes_in_queue(strategy, queues)
151151
end
152152
end
153153

154-
# Update slurm_parallelcluster_topology to add/remove Block Topology plugin
155-
template "#{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster_topology.conf" do
156-
source 'slurm/slurm_parallelcluster_topology.conf.erb'
157-
owner 'root'
158-
group 'root'
159-
mode '0644'
160-
not_if { is_amazon_linux_2? }
161-
end
162-
163-
# Update Slurm topology.conf file
164-
execute "update or cleanup topology.conf" do
165-
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
166-
" --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
167-
" --input-file #{node['cluster']['cluster_config_path']}"\
168-
"#{topology_generator_command_args}"
169-
not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && topology_generator_command_args.nil? || is_amazon_linux_2? }
154+
# Updating or cleaning up the Slurm topology files is handled by the block_topology
# custom resource through its :update action.
block_topology 'Update or Cleanup Slurm Topology' do
155+
action :update
170156
end
171157

172158
execute "generate_pcluster_slurm_configs" do
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Registers this file as the :block_topology implementation on the 'amazon' platform
# when platform_version, converted with to_i, equals 2023.
provides :block_topology, platform: 'amazon' do |node|
16+
node['platform_version'].to_i == 2023
17+
end
18+
19+
# Loads the shared resource definition from the common partial.
use 'partial/_block_topology_common.rb'
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Registers this file as the :block_topology implementation on the 'amazon' platform
# with platform_version '2'.
provides :block_topology, platform: 'amazon', platform_version: '2'
16+
17+
# Loads the shared resource definition from the common partial.
use 'partial/_block_topology_common.rb'
18+
19+
# Reports that Block Topology is not supported on this platform (always false).
# NOTE(review): this is declared after the partial is loaded and is presumably meant to
# replace the partial's same-named method; confirm that the actions resolve to this definition.
def is_block_topology_supported?
20+
# We do not support Block Topology with Alinux2 as we do not support Gb200 with this OS
21+
false
22+
end
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Registers this file as the :block_topology implementation on the 'redhat' platform
# when platform_version, converted with to_i, is 8 or greater.
provides :block_topology, platform: 'redhat' do |node|
16+
node['platform_version'].to_i >= 8
17+
end
18+
19+
# Loads the shared resource definition from the common partial.
use 'partial/_block_topology_common.rb'
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Registers this file as the :block_topology implementation on the 'rocky' platform
# when platform_version, converted with to_i, is 8 or greater.
provides :block_topology, platform: 'rocky' do |node|
16+
node['platform_version'].to_i >= 8
17+
end
18+
19+
# Loads the shared resource definition from the common partial.
use 'partial/_block_topology_common.rb'
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Registers this file as the :block_topology implementation on the 'ubuntu' platform
# when platform_version, converted with to_i, is 22 or greater.
provides :block_topology, platform: 'ubuntu' do |node|
16+
node['platform_version'].to_i >= 22
17+
end
18+
19+
# Loads the shared resource definition from the common partial.
use 'partial/_block_topology_common.rb'
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# frozen_string_literal: true
2+
#
3+
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Resource-level settings shared by every platform file that loads this partial.
unified_mode true
16+
# :configure is the action taken when a block_topology declaration does not name one.
default_action :configure
17+
18+
# :configure action.
# Renders slurm_parallelcluster_topology.conf from its template and, when
# node['cluster']['p6egb200_block_sizes'] is set, runs pcluster_topology_generator.py
# to write topology.conf under the Slurm install directory.
action :configure do
19+
# Nothing is done when is_block_topology_supported? returns false.
return unless is_block_topology_supported?
20+
# Use slurm_parallelcluster_topology to add Block Topology plugin
21+
template "#{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster_topology.conf" do
22+
source 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb'
23+
owner 'root'
24+
group 'root'
25+
mode '0644'
26+
end
27+
# Generate Slurm topology.conf file
28+
execute "generate_topology_config" do
29+
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
30+
" --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
31+
" --block-sizes #{node['cluster']['p6egb200_block_sizes']}"\
32+
" --input-file #{node['cluster']['cluster_config_path']}"
33+
# Skip the generator when no block sizes are configured.
not_if { node['cluster']['p6egb200_block_sizes'].nil? }
34+
end
35+
end
36+
37+
# :update action.
# Re-renders slurm_parallelcluster_topology.conf from its template, then runs
# pcluster_topology_generator.py with the extra arguments returned by
# topology_generator_command_args, unless the guard below skips it.
action :update do
38+
# Nothing is done when is_block_topology_supported? returns false.
return unless is_block_topology_supported?
39+
# Update slurm_parallelcluster_topology to add/remove Block Topology plugin
40+
template "#{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster_topology.conf" do
41+
source 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb'
42+
owner 'root'
43+
group 'root'
44+
mode '0644'
45+
end
46+
# Update Slurm topology.conf file
47+
execute "update or cleanup topology.conf" do
48+
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
49+
" --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
50+
" --input-file #{node['cluster']['cluster_config_path']}"\
51+
"#{topology_generator_command_args}"
52+
# Skip the generator when a previous cluster config file exists and
# topology_generator_command_args returns nil.
not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && topology_generator_command_args.nil? }
53+
end
54+
end
55+
56+
# Default support check consulted at the top of the :configure and :update actions.
# Always returns true here.
def is_block_topology_supported?
57+
true
58+
end
59+
60+
# Builds the extra command-line arguments appended to the pcluster_topology_generator.py
# invocation in the :update action. Returns:
#   " --cleanup"             when p6egb200_block_sizes is nil, are_queues_updated? is true
#                            and topology.conf exists in the Slurm install directory;
#   nil                      when p6egb200_block_sizes is nil and are_queues_updated? is false;
#   " --block-sizes <value>" in every other case.
# NOTE(review): the last case also covers p6egb200_block_sizes being nil with updated queues
# and no existing topology.conf; the interpolation then yields " --block-sizes " with an
# empty value. Confirm that the generator script accepts this.
def topology_generator_command_args
61+
if node['cluster']['p6egb200_block_sizes'].nil? && are_queues_updated? && ::File.exist?("#{node['cluster']['slurm']['install_dir']}/etc/topology.conf")
62+
# If topology.conf exists and Capacity Block is removed, we clean up
63+
" --cleanup"
64+
elsif node['cluster']['p6egb200_block_sizes'].nil? && !are_queues_updated?
65+
# We do nothing if p6e-gb200 is not used and queues are not updated
66+
nil
67+
else
68+
" --block-sizes #{node['cluster']['p6egb200_block_sizes']}"
69+
end
70+
end

0 commit comments

Comments
 (0)