Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
92ca1c5
[SlurmTopo] Add a topology generator script
May 20, 2025
a2e9f2f
[SlurmTopo] Add a topology plugin if topology_block_size is not null
May 20, 2025
7c5ed6a
[SlurmTopo] Invoke pcluster_topology_generator.py during creation and…
May 20, 2025
4a664bb
[SlurmTopo] Update topology generator script and add unit tests
Aug 6, 2025
7b81277
[SlurmTopo] Skip Topology generation for ALinux2 AMI
Aug 6, 2025
35dadde
[SlurmTopo] Using p6egb200_block_sizes instead of topology_block_size…
Aug 6, 2025
1acfeb4
[SlurmTopo] Adding a slurm_parallelcluster_topology.conf in slurm.conf
Aug 7, 2025
dad2046
[SlurmTopo] TESTING TO BE REMOVED
Aug 7, 2025
64b09c7
[SlurmTopo] Adding a slurm_parallelcluster_topology.conf in slurm.conf
Aug 7, 2025
f1b89ea
[SlurmTopo] Do not support Slurm Topology for AL2
Aug 8, 2025
33357fe
[SlurmTopo] We cleanup or generate Topology only if p6egb200_block_si…
Aug 8, 2025
525a3d1
[SlurmTopo] Do not support Slurm Topology for AL2
Aug 8, 2025
b63c24a
[SlurmTopo] Do not support Slurm Topology for AL2
Aug 8, 2025
072604c
[SlurmTopo] Add resource for Block topology
Aug 8, 2025
edab2f7
[SlurmTopo] Updated unit tests for not generating a file if block_siz…
Aug 9, 2025
23a98be
[SlurmTopo] Update to use a generic variable `is_block_topology_plugi…
Aug 11, 2025
89c79d9
[SlurmTopo]Removing commented out section which was for testing
Aug 11, 2025
a8af115
[SlurmTopo] Removing unused variables for capacity reservationID
Aug 12, 2025
676ae77
[SlurmTopo] Code linters
Aug 12, 2025
b9e932f
[SlurmTopo] Code linters
Aug 12, 2025
908c2b9
Merge branch 'develop' into slurm-topo
himani2411 Aug 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# FIXME: Fix Code Duplication
# pylint: disable=R0801

import argparse
import logging
import os
import traceback

import yaml

log = logging.getLogger()


# Maps the CapacityType values found in the cluster configuration to the
# lower-case, hyphenated names this script compares and logs.
CAPACITY_TYPE_MAP = {
    "ONDEMAND": "on-demand",
    "SPOT": "spot",
    "CAPACITY_BLOCK": "capacity-block",
}
# Header placed at the top of the generated topology configuration.
CONFIG_HEADER = "# This file is automatically generated by pcluster\n"


class CriticalError(Exception):
    """Unrecoverable error of the script, e.g. a cluster configuration that cannot be parsed."""

    pass


class ConfigurationFieldNotFoundError(Exception):
    """Field not found in configuration."""

    # NOTE(review): not raised anywhere in the visible part of this script - confirm it is still needed.
    pass


def _load_cluster_config(input_file_path):
    """
    Read and deserialize the cluster configuration file.

    :param input_file_path: path of the YAML cluster configuration
    :return: the parsed YAML document
    """
    with open(input_file_path, encoding="utf-8") as config_stream:
        parsed_config = yaml.safe_load(config_stream)
    return parsed_config


def generate_topology_config_file(output_file: str, input_file: str, block_sizes: str):  # noqa: C901
    """
    Generate the Slurm block topology configuration file (topology.conf).

    A ``BlockName`` line is written for every compute resource that
    - belongs to a queue whose CapacityType is CAPACITY_BLOCK,
    - is static (MinCount == MaxCount),
    - has InstanceType p6e-gb200.36xlarge, and
    - has a node count equal to the smallest or the largest requested block size.
    A closing ``BlockSizes`` line echoes ``block_sizes``.

    Example output:

        # This file is automatically generated by pcluster

        BlockName=Block1 Nodes=queue-1-st-compute-resource-0-[1-9]
        BlockName=Block2 Nodes=queue-2-st-compute-resource-0-[1-18]
        BlockSizes=9,18

    :param output_file: path of the topology.conf file to write
    :param input_file: path of the YAML cluster configuration
    :param block_sizes: comma separated block sizes, e.g. "9,18"; when empty or None no block is emitted
    :raises CriticalError: if a required key is missing or the configuration has an unexpected structure
    :raises ValueError: if an entry of ``block_sizes`` is not an integer
    """
    # Parse the sizes once. Binding both bounds unconditionally avoids an
    # UnboundLocalError in the comparison below when no block sizes are given.
    smallest_block_size, largest_block_size = None, None
    if block_sizes:
        parsed_block_sizes = [int(size) for size in block_sizes.split(",")]
        smallest_block_size = min(parsed_block_sizes)
        largest_block_size = max(parsed_block_sizes)

    cluster_config = _load_cluster_config(input_file)
    queue_name = None
    try:
        topology_config = CONFIG_HEADER + "\n"
        block_count = 0
        for queue_config in cluster_config["Scheduling"]["SlurmQueues"]:
            queue_name = queue_config["Name"]

            # Only Capacity Block queues take part in the topology; a missing CapacityType means on-demand.
            queue_capacity_type = CAPACITY_TYPE_MAP.get(queue_config.get("CapacityType", "ONDEMAND"))
            if queue_capacity_type != CAPACITY_TYPE_MAP.get("CAPACITY_BLOCK"):
                log.info("ParallelCluster does not create topology for %s", queue_capacity_type)
                continue

            for compute_resource_config in queue_config["ComputeResources"]:
                compute_resource_name = compute_resource_config["Name"]
                compute_min_count = compute_resource_config["MinCount"]
                compute_max_count = compute_resource_config["MaxCount"]
                # Only static compute resources (fixed node count) are described.
                if compute_min_count != compute_max_count:
                    continue
                node_type = "st"

                # NOTE(review): raised during code review - why only this instance type (what about
                # .72xlarge)? Is there a way to determine the eligible types programmatically?
                if compute_resource_config.get("InstanceType") != "p6e-gb200.36xlarge":
                    continue

                # The node count must match the smallest or the largest requested block size.
                if block_sizes and (
                    smallest_block_size == compute_min_count or largest_block_size == compute_max_count
                ):
                    block_count += 1
                    # Each Capacity Reservation ID is a Capacity Block,
                    # we associate each slurm block with a single capacity Block
                    topology_config += (
                        f"BlockName=Block{block_count}"
                        f" Nodes={queue_name}-{node_type}-{compute_resource_name}-[1-{compute_max_count}]\n"
                    )

        topology_config += f"BlockSizes={block_sizes}\n"
    except (KeyError, AttributeError) as e:
        if isinstance(e, KeyError):
            message = f"Unable to find key {e} in the configuration file."
        else:
            message = f"Error parsing configuration file. {e}. {traceback.format_exc()}."
        message += f" Queue: {queue_name}" if queue_name else ""
        log.error(message)
        # Chain the original exception so its traceback is not lost.
        raise CriticalError(message) from e

    log.info("Writing Info %s", topology_config)
    log.info("Generating %s", output_file)
    with open(output_file, "w", encoding="utf-8") as output:
        output.write(topology_config)

    log.info("Finished.")


def cleanup_topology_config_file(file_path):
    """
    Remove the topology.conf file on a best-effort basis.

    A file that does not exist is ignored; any failure is logged as a warning and never raised.

    :param file_path: path of the file to delete
    """
    try:
        if not os.path.exists(file_path):
            return
        log.info("Cleaning up %s", file_path)
        os.remove(file_path)
    except Exception as err:
        log.warning("Unable to delete %s due to %s", file_path, err)


def main():
    """Parse the command line, then generate or clean up topology.conf; failures are logged and re-raised."""
    try:
        logging.basicConfig(
            level=logging.INFO, format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s"
        )
        log.info("Running ParallelCluster Topology Config Generator")

        parser = argparse.ArgumentParser(description="Take in Topology configuration generator related parameters")
        # Exactly one of --block-sizes / --cleanup has to be supplied.
        mode_group = parser.add_mutually_exclusive_group(required=True)
        parser.add_argument("--output-file", required=True, help="The output file for generated topology.conf")
        parser.add_argument(
            "--input-file",
            required=True,
            help="Yaml file containing pcluster CLI configuration file with default values",
        )
        mode_group.add_argument("--block-sizes", help="Block Sizes of topology.conf")
        mode_group.add_argument("--cleanup", action="store_true", help="Cleanup topology.conf")
        args = parser.parse_args()

        if not args.cleanup:
            generate_topology_config_file(args.output_file, args.input_file, args.block_sizes)
        else:
            cleanup_topology_config_file(args.output_file)
        log.info("Completed Execution of ParallelCluster Topology Config Generator")
    except Exception as e:
        log.exception("Failed to generate Topology.conf, exception: %s", e)
        raise


# Run the generator only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
owner 'root'
group 'root'
mode '0644'
# NOTE(review): this expression is true only on Amazon Linux 2, whereas the alinux2 block_topology
# resource declares Block Topology unsupported on that OS. Confirm how the template consumes this
# flag - the variable name looks inverted with respect to its value.
variables(is_block_topology_plugin_supported: platform?('amazon') && node['platform_version'] == "2")
end

template "#{node['cluster']['slurm']['install_dir']}/etc/gres.conf" do
Expand All @@ -54,6 +55,10 @@
mode '0644'
end

# Run the :configure action of the block_topology resource.
block_topology 'Add Block Topology configuration' do
  action :configure
end

unless on_docker?
# Generate pcluster specific configs
no_gpu = nvidia_installed? ? "" : "--no-gpu"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ def update_nodes_in_queue(strategy, queues)
end
end

# Run the :update action of the block_topology resource.
block_topology 'Update or Cleanup Slurm Topology' do
  action :update
end

execute "generate_pcluster_slurm_configs" do
command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py" \
" --output-directory #{node['cluster']['slurm']['install_dir']}/etc/" \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# frozen_string_literal: true

# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Register this file as the block_topology implementation for Amazon Linux 2023.
provides(:block_topology, platform: 'amazon') { |node| node['platform_version'].to_i == 2023 }

# Pull in the shared block_topology partial.
use 'partial/_block_topology_common.rb'
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# frozen_string_literal: true

# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Register this file as the block_topology implementation for Amazon Linux 2.
provides :block_topology, platform: 'amazon', platform_version: '2'

# Pull in the shared block_topology partial.
use 'partial/_block_topology_common.rb'

# Reports Block Topology as unsupported on this platform.
# NOTE(review): the shared partial defines a method with the same name that returns true; this
# definition presumably has to stay after `use` to take precedence - confirm.
def is_block_topology_supported?
  # We do not support Block Topology with Alinux2 as we do not support Gb200 with this OS
  false
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# frozen_string_literal: true

# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Register this file as the block_topology implementation for Red Hat 8 and later.
provides(:block_topology, platform: 'redhat') { |node| node['platform_version'].to_i >= 8 }

# Pull in the shared block_topology partial.
use 'partial/_block_topology_common.rb'
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# frozen_string_literal: true

# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Register this file as the block_topology implementation for Rocky Linux 8 and later.
provides(:block_topology, platform: 'rocky') { |node| node['platform_version'].to_i >= 8 }

# Pull in the shared block_topology partial.
use 'partial/_block_topology_common.rb'
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# frozen_string_literal: true

# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Register this file as the block_topology implementation for Ubuntu 22 and later.
provides(:block_topology, platform: 'ubuntu') { |node| node['platform_version'].to_i >= 22 }

# Pull in the shared block_topology partial.
use 'partial/_block_topology_common.rb'
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# frozen_string_literal: true
#
# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Resource-level settings: enable unified mode and make :configure the default action.
unified_mode true
default_action :configure

# Renders slurm_parallelcluster_topology.conf and, when p6egb200_block_sizes is set,
# runs the topology generator script to create topology.conf.
action :configure do
  return unless is_block_topology_supported?

  slurm_etc_dir = "#{node['cluster']['slurm']['install_dir']}/etc"
  generator_script = "#{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"

  # Use slurm_parallelcluster_topology to add Block Topology plugin
  template "#{slurm_etc_dir}/slurm_parallelcluster_topology.conf" do
    source 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb'
    owner 'root'
    group 'root'
    mode '0644'
  end

  # Generate Slurm topology.conf file; skipped when no block sizes are configured
  execute "generate_topology_config" do
    command "#{cookbook_virtualenv_path}/bin/python #{generator_script}" \
            " --output-file #{slurm_etc_dir}/topology.conf" \
            " --block-sizes #{node['cluster']['p6egb200_block_sizes']}" \
            " --input-file #{node['cluster']['cluster_config_path']}"
    not_if { node['cluster']['p6egb200_block_sizes'].nil? }
  end
end

# Re-renders slurm_parallelcluster_topology.conf and then regenerates or removes topology.conf,
# depending on the arguments returned by topology_generator_command_args.
action :update do
  return unless is_block_topology_supported?

  # Update slurm_parallelcluster_topology to add/remove Block Topology plugin
  template "#{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster_topology.conf" do
    source 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb'
    owner 'root'
    group 'root'
    mode '0644'
  end

  # nil means there is nothing to generate or clean up.
  generator_args = topology_generator_command_args

  # Update Slurm topology.conf file
  execute "update or cleanup topology.conf" do
    command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
            " --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
            " --input-file #{node['cluster']['cluster_config_path']}"\
            "#{generator_args}"
    # The generator requires exactly one of --block-sizes / --cleanup, so invoking it without
    # either can only fail. Skip the run whenever no argument was produced, regardless of
    # whether a previous cluster configuration file exists.
    not_if { generator_args.nil? }
  end
end

# Default answer used by the actions in this partial: Block Topology is supported.
# Both actions return immediately when this method returns false.
def is_block_topology_supported?
  true
end

# Builds the mode argument appended to the topology generator command line.
#
# @return [String, nil]
#   " --block-sizes <sizes>" when p6egb200_block_sizes is configured;
#   " --cleanup" when it is not configured and the queues were updated;
#   nil when it is not configured and the queues were not updated (nothing to do).
def topology_generator_command_args
  block_sizes = node['cluster']['p6egb200_block_sizes']
  return " --block-sizes #{block_sizes}" unless block_sizes.nil?

  # We do nothing if p6e-gb200 is not used and queues are not updated
  return nil unless are_queues_updated?

  # Capacity Block is not (or no longer) configured: request a cleanup. The generator removes
  # topology.conf only if it exists, so this is safe even when the file was never created.
  # Falling through to "--block-sizes" here would pass the option without a value, which the
  # generator's argument parser rejects.
  ' --cleanup'
end
Loading
Loading