diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb
index 533cde463..b37255b9d 100644
--- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb
+++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb
@@ -25,6 +25,8 @@
 
 # nvidia-imex
 default['cluster']['nvidia']['imex']['shared_dir'] = "#{node['cluster']['shared_dir']}/nvidia-imex"
+# When true / 'true' / 'yes', nvidia-imex is configured even if the NVSwitch count check (> 1) is not met.
+default['cluster']['nvidia']['imex']['force_configuration'] = false
 
 # DCV
 default['cluster']['dcv']['authenticator']['user'] = "dcvextauth"
diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb
index e74b83c3b..388ac00a9 100644
--- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb
+++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb
@@ -31,7 +31,7 @@ action :configure do
   return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet"
 
   # Start nvidia-imex on p6e-gb200 and only on ComputeFleet
-  if get_nvswitch_count(get_device_ids['gb200']) > 1
+  if get_nvswitch_count(get_device_ids['gb200']) > 1 || enable_force_configuration?
     # For each Compute Resource, we generate a unique NVIDIA IMEX configuration file,
     # if one doesn't already exist in a common, shared location.
     template nvidia_imex_nodes_conf_file do
@@ -104,3 +104,9 @@ def nvidia_imex_main_conf_file
 def nvidia_imex_nodes_conf_file
   "#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config_#{node['cluster']['launch_template_id']}.cfg"
 end
+
+# Returns true only when node['cluster']['nvidia']['imex']['force_configuration'] is the boolean
+# true or one of the exact strings 'true' / 'yes'; any other value yields false.
+def enable_force_configuration?
+  ['true', 'yes', true].include?(node['cluster']['nvidia']['imex']['force_configuration'])
+end
diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb
index 8608b88a9..966af1593 100644
--- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb
+++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb
@@ -137,6 +137,26 @@ def self.configure(chef_run)
   end
 end
 
+describe 'nvidia_imex:enable_force_configuration?' do
+  [['false', false], [false, false], ['no', false], ['true', true], [true, true], ['yes', true]].each do |force_indicator, actual_indicator|
+    context "where node['cluster']['nvidia']['imex']['force_configuration'] is #{force_indicator}" do
+      cached(:chef_run) do
+        ChefSpec::SoloRunner.new(step_into: ['nvidia_imex']) do |node|
+          node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
+        end
+      end
+      cached(:resource) do
+        ConvergeNvidiaImex.configure(chef_run)
+        chef_run.find_resource('nvidia_imex', 'configure')
+      end
+      it "we get #{actual_indicator}" do
+        # Call the real helper (no stub) so the node attribute set above is what gets evaluated.
+        expect(resource.enable_force_configuration?).to eq(actual_indicator)
+      end
+    end
+  end
+end
+
 describe 'nvidia_imex:install' do
   for_all_oses do |platform, version|
     context "on #{platform}#{version}" do
@@ -274,109 +294,125 @@ def self.configure(chef_run)
 end
 
 describe 'nvidia_imex:configure' do
-  for_all_oses do |platform, version|
-    context "on #{platform}#{version}" do
-      context "when nvidia-imex binary is not installed" do
-        cached(:chef_run) do
-          stubs_for_resource('nvidia_imex') do |res|
-            allow(res).to receive(:imex_installed?).and_return(false)
+  [['false', false], [false, false], ['no', false], ['true', true], [true, true], ['yes', true]].each do |force_indicator, force_enabled|
+    for_all_oses do |platform, version|
+      context "on #{platform}#{version} with force_configuration #{force_indicator}" do
+        context "when nvidia-imex binary is not installed" do
+          cached(:chef_run) do
+            stubs_for_resource('nvidia_imex') do |res|
+              allow(res).to receive(:imex_installed?).and_return(false)
+            end
+            runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
+            ConvergeNvidiaImex.configure(runner)
+          end
+          cached(:node) { chef_run.node }
+
+          it 'does not configure nvidia-imex' do
+            is_expected.not_to configure_nvidia_imex('nvidia-imex')
           end
-          runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
-          ConvergeNvidiaImex.configure(runner)
         end
-        cached(:node) { chef_run.node }
 
-        it 'does not configure nvidia-imex' do
-          is_expected.not_to configure_nvidia_imex('nvidia-imex')
+        %w(HeadNode LoginNode ComputeFleet).each do |node_type|
+          context "when get_nvswitch_count > 1 on #{node_type} node" do
+            cached(:chef_run) do
+              stubs_for_provider('nvidia_imex[configure]') do |pro|
+                allow(pro).to receive(:imex_installed?).and_return(true)
+                allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
+                allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4)
+                allow(pro).to receive(:enable_force_configuration?).and_return(force_enabled)
+              end
+              runner(platform: platform, version: version, step_into: ['nvidia_imex'])
+            end
+            cached(:node) { chef_run.node }
+
+            before do
+              chef_run.node.override['cluster']['region'] = 'aws_region'
+              chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
+              chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
+              chef_run.node.override['cluster']['node_type'] = node_type
+              chef_run.node.override['cluster']['launch_template_id'] = launch_template_id
+              ConvergeNvidiaImex.configure(chef_run)
+            end
+
+            if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type)
+              it 'does not configure nvidia-imex' do
+                is_expected.not_to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg")
+                  .with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
+                  .with(user: 'root')
+                  .with(group: 'root')
+                  .with(mode: '0755')
+                is_expected.not_to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg")
+                  .with(source: 'nvidia-imex/nvidia-imex-config.erb')
+                  .with(user: 'root')
+                  .with(group: 'root')
+                  .with(mode: '0755')
+                  .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" })
+                is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service")
+                  .with(source: 'nvidia-imex/nvidia-imex.service.erb')
+                  .with(user: 'root')
+                  .with(group: 'root')
+                  .with(mode: '0644')
+                  .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" })
+                is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
+              end
+            else
+              it 'it starts nvidia-imex service' do
+                is_expected.to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg")
+                  .with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
+                  .with(user: 'root')
+                  .with(group: 'root')
+                  .with(mode: '0755')
+                is_expected.to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg")
+                  .with(source: 'nvidia-imex/nvidia-imex-config.erb')
+                  .with(user: 'root')
+                  .with(group: 'root')
+                  .with(mode: '0755')
+                  .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" })
+                is_expected.to create_template("/etc/systemd/system/nvidia-imex.service")
+                  .with(source: 'nvidia-imex/nvidia-imex.service.erb')
+                  .with(user: 'root')
+                  .with(group: 'root')
+                  .with(mode: '0644')
+                  .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" })
+                is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
+              end
+            end
+          end
         end
-      end
 
-      %w(HeadNode LoginNode ComputeFleet).each do |node_type|
-        context "when get_nvswitch_count > 1 on #{node_type} node" do
+        context "when get_nvswitch_count <= 1" do
           cached(:chef_run) do
             stubs_for_provider('nvidia_imex[configure]') do |pro|
               allow(pro).to receive(:imex_installed?).and_return(true)
               allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
-              allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4)
+              allow(pro).to receive(:get_nvswitch_count).with('test').and_return(1)
+              allow(pro).to receive(:enable_force_configuration?).and_return(force_enabled)
             end
             runner(platform: platform, version: version, step_into: ['nvidia_imex'])
           end
           cached(:node) { chef_run.node }
 
           before do
             chef_run.node.override['cluster']['region'] = 'aws_region'
+            chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
             chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
-            chef_run.node.override['cluster']['node_type'] = node_type
+            chef_run.node.override['cluster']['node_type'] = 'ComputeFleet'
             chef_run.node.override['cluster']['launch_template_id'] = launch_template_id
             ConvergeNvidiaImex.configure(chef_run)
           end
 
-          if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type)
-            it 'does not configure nvidia-imex' do
-              is_expected.not_to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg")
-                .with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
-                .with(user: 'root')
-                .with(group: 'root')
-                .with(mode: '0755')
-              is_expected.not_to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg")
-                .with(source: 'nvidia-imex/nvidia-imex-config.erb')
-                .with(user: 'root')
-                .with(group: 'root')
-                .with(mode: '0755')
-                .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" })
-              is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service")
-                .with(source: 'nvidia-imex/nvidia-imex.service.erb')
-                .with(user: 'root')
-                .with(group: 'root')
-                .with(mode: '0644')
-                .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" })
-              is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
+          # Amazon Linux 2 is expected to skip configuration, matching the "> 1" contexts above.
+          if force_enabled && !(platform == 'amazon' && version == '2')
+            it 'does configure nvidia-imex' do
+              is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
             end
           else
-            it 'it starts nvidia-imex service' do
-              is_expected.to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg")
-                .with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
-                .with(user: 'root')
-                .with(group: 'root')
-                .with(mode: '0755')
-              is_expected.to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg")
-                .with(source: 'nvidia-imex/nvidia-imex-config.erb')
-                .with(user: 'root')
-                .with(group: 'root')
-                .with(mode: '0755')
-                .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" })
-              is_expected.to create_template("/etc/systemd/system/nvidia-imex.service")
-                .with(source: 'nvidia-imex/nvidia-imex.service.erb')
-                .with(user: 'root')
-                .with(group: 'root')
-                .with(mode: '0644')
-                .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" })
-              is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
+            it 'does not configure nvidia-imex' do
+              is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
             end
           end
         end
       end
-
-      context "when get_nvswitch_count <= 1" do
-        cached(:chef_run) do
-          stubs_for_provider('nvidia_imex[configure]') do |pro|
-            allow(pro).to receive(:imex_installed?).and_return(true)
-            allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
-            allow(pro).to receive(:get_nvswitch_count).with('test').and_return(1)
-          end
-          runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
-          ConvergeNvidiaImex.configure(runner)
-        end
-        cached(:node) { chef_run.node }
-
-        before do
-          chef_run.node.override['cluster']['region'] = 'aws_region'
-        end
-
-        it 'does not configure nvidia-imex' do
-          is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
-        end
-      end
     end
   end
 end