Skip to content

Commit 0ed90e2

Browse files
author
Himani Anil Deshpande
committed
[NVIDIA-IMEX] Add test attribute for NVIDIA-imex simulation
1 parent 581530a commit 0ed90e2

File tree

2 files changed

+117
-81
lines changed

2 files changed

+117
-81
lines changed

cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
action :configure do
3232
return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet"
3333
# Start nvidia-imex only on ComputeFleet nodes: on p6e-gb200 (more than one NVSwitch detected) or when force_configuration is enabled
34-
if get_nvswitch_count(get_device_ids['gb200']) > 1
34+
if get_nvswitch_count(get_device_ids['gb200']) > 1 || enable_force_configuration?
3535
# For each Compute Resource, we generate a unique NVIDIA IMEX configuration file,
3636
# if one doesn't already exist in a common, shared location.
3737
template nvidia_imex_nodes_conf_file do
@@ -104,3 +104,7 @@ def nvidia_imex_main_conf_file
104104
# Path of the NVIDIA IMEX nodes configuration file for the current launch template.
# The file lives under the IMEX shared directory and is named after
# node['cluster']['launch_template_id'].
#
# @return [String] "<shared_dir>/nodes_config_<launch_template_id>.cfg"
def nvidia_imex_nodes_conf_file
  imex_shared_dir = node['cluster']['nvidia']['imex']['shared_dir']
  template_id = node['cluster']['launch_template_id']
  "#{imex_shared_dir}/nodes_config_#{template_id}.cfg"
end
107+
108+
# Tells whether NVIDIA IMEX configuration is forced through the
# node['cluster']['nvidia']['imex']['force_configuration'] attribute.
#
# The attribute is treated as enabled when it is the boolean true or a string
# equal to 'true' or 'yes'. String matching ignores letter case and surrounding
# whitespace, so values such as 'True' or ' YES ' are accepted as well.
# Any other value (false, 'false', 'no', nil, ...) means "not forced".
#
# @return [Boolean] true when configuration must be forced, false otherwise
def enable_force_configuration?
  force_flag = node['cluster']['nvidia']['imex']['force_configuration']
  # to_s maps true -> "true", false -> "false" and nil -> "", so a single
  # normalised string comparison covers booleans, strings and a missing value.
  %w(true yes).include?(force_flag.to_s.strip.downcase)
end

cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb

Lines changed: 112 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,26 @@ def self.configure(chef_run)
137137
end
138138
end
139139

140+
describe 'nvidia_imex:enable_force_configuration?' do
  # Each pair is [attribute value, expected predicate result].
  [['false', false], [false, false], ['no', false], ['true', true], [true, true], ['yes', true]].each do |force_indicator, expected_result|
    context "where node['cluster']['nvidia']['imex']['force_configuration'] is #{force_indicator}" do
      cached(:chef_run) do
        ChefSpec::SoloRunner.new(step_into: ['nvidia_imex']) do |node|
          node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
        end
      end
      cached(:resource) do
        ConvergeNvidiaImex.configure(chef_run)
        chef_run.find_resource('nvidia_imex', 'configure')
      end

      # The predicate is intentionally NOT stubbed here: stubbing it to return the
      # expected value would make the example pass whatever the implementation does.
      it "we get #{expected_result}" do
        expect(resource.enable_force_configuration?).to eq(expected_result)
      end
    end
  end
end
159+
140160
describe 'nvidia_imex:install' do
141161
for_all_oses do |platform, version|
142162
context "on #{platform}#{version}" do
@@ -274,109 +294,121 @@ def self.configure(chef_run)
274294
end
275295

276296
# Specs for the nvidia_imex :configure action, exercised for every
# force_configuration value, every OS and every node type.
describe 'nvidia_imex:configure' do
277-
for_all_oses do |platform, version|
278-
context "on #{platform}#{version}" do
279-
context "when nvidia-imex binary is not installed" do
280-
cached(:chef_run) do
281-
stubs_for_resource('nvidia_imex') do |res|
282-
allow(res).to receive(:imex_installed?).and_return(false)
297+
# NOTE(review): every element of this list is a one-element Array (%w(false) is ["false"]),
# and the block takes a single parameter, so force_indicator is bound to the Array
# (e.g. ["true"]) rather than to the scalar 'true'/true/'yes' — confirm this is intended.
[%w(false), [false], %w(no), %w(true), [true], %w(yes)].each do |force_indicator|
298+
for_all_oses do |platform, version|
299+
context "on #{platform}#{version} with force_configuration #{force_indicator}" do
300+
context "when nvidia-imex binary is not installed" do
301+
cached(:chef_run) do
302+
stubs_for_resource('nvidia_imex') do |res|
303+
allow(res).to receive(:imex_installed?).and_return(false)
304+
end
305+
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
306+
ConvergeNvidiaImex.configure(runner)
307+
end
308+
cached(:node) { chef_run.node }
309+
310+
it 'does not configure nvidia-imex' do
311+
is_expected.not_to configure_nvidia_imex('nvidia-imex')
283312
end
284-
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
285-
ConvergeNvidiaImex.configure(runner)
286313
end
287-
cached(:node) { chef_run.node }
288314

289-
it 'does not configure nvidia-imex' do
290-
is_expected.not_to configure_nvidia_imex('nvidia-imex')
315+
%w(HeadNode LoginNode ComputeFleet).each do |node_type|
316+
context "when get_nvswitch_count > 1 on #{node_type} node" do
317+
cached(:chef_run) do
318+
stubs_for_provider('nvidia_imex[configure]') do |pro|
319+
allow(pro).to receive(:imex_installed?).and_return(true)
320+
allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
321+
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4)
322+
# NOTE(review): the predicate is stubbed to return the raw force_indicator, not a boolean;
# any non-nil, non-false value (including 'false'/'no' or an Array) is truthy in Ruby.
allow(pro).to receive(:enable_force_configuration?).and_return(force_indicator)
323+
end
324+
runner(platform: platform, version: version, step_into: ['nvidia_imex'])
325+
end
326+
cached(:node) { chef_run.node }
327+
328+
before do
329+
chef_run.node.override['cluster']['region'] = 'aws_region'
330+
chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
331+
chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
332+
chef_run.node.override['cluster']['node_type'] = node_type
333+
chef_run.node.override['cluster']['launch_template_id'] = launch_template_id
334+
ConvergeNvidiaImex.configure(chef_run)
335+
end
336+
337+
if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type)
338+
it 'does not configure nvidia-imex' do
339+
is_expected.not_to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg")
340+
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
341+
.with(user: 'root')
342+
.with(group: 'root')
343+
.with(mode: '0755')
344+
is_expected.not_to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg")
345+
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
346+
.with(user: 'root')
347+
.with(group: 'root')
348+
.with(mode: '0755')
349+
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" })
350+
is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service")
351+
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
352+
.with(user: 'root')
353+
.with(group: 'root')
354+
.with(mode: '0644')
355+
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" })
356+
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
357+
end
358+
else
359+
it 'it starts nvidia-imex service' do
360+
is_expected.to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg")
361+
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
362+
.with(user: 'root')
363+
.with(group: 'root')
364+
.with(mode: '0755')
365+
is_expected.to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg")
366+
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
367+
.with(user: 'root')
368+
.with(group: 'root')
369+
.with(mode: '0755')
370+
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" })
371+
is_expected.to create_template("/etc/systemd/system/nvidia-imex.service")
372+
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
373+
.with(user: 'root')
374+
.with(group: 'root')
375+
.with(mode: '0644')
376+
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" })
377+
is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
378+
end
379+
end
380+
end
291381
end
292-
end
293382

294-
%w(HeadNode LoginNode ComputeFleet).each do |node_type|
295-
context "when get_nvswitch_count > 1 on #{node_type} node" do
383+
context "when get_nvswitch_count <= 1" do
296384
cached(:chef_run) do
297385
stubs_for_provider('nvidia_imex[configure]') do |pro|
298386
allow(pro).to receive(:imex_installed?).and_return(true)
299387
allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
300-
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4)
388+
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(1)
389+
# NOTE(review): same as the "> 1" context — the stub returns the raw force_indicator, which is always truthy.
allow(pro).to receive(:enable_force_configuration?).and_return(force_indicator)
301390
end
302-
runner(platform: platform, version: version, step_into: ['nvidia_imex'])
391+
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
392+
ConvergeNvidiaImex.configure(runner)
303393
end
304394
cached(:node) { chef_run.node }
305395

306396
before do
307397
chef_run.node.override['cluster']['region'] = 'aws_region'
308-
chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
309-
chef_run.node.override['cluster']['node_type'] = node_type
310-
chef_run.node.override['cluster']['launch_template_id'] = launch_template_id
311-
ConvergeNvidiaImex.configure(chef_run)
398+
# NOTE(review): chef_run is built by the cached block above, which already calls
# ConvergeNvidiaImex.configure(runner); this override is applied afterwards and
# node_type is never set in this context — confirm the attribute can still affect the run.
chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
312399
end
313400

314-
if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type)
315-
it 'does not configure nvidia-imex' do
316-
is_expected.not_to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg")
317-
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
318-
.with(user: 'root')
319-
.with(group: 'root')
320-
.with(mode: '0755')
321-
is_expected.not_to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg")
322-
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
323-
.with(user: 'root')
324-
.with(group: 'root')
325-
.with(mode: '0755')
326-
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" })
327-
is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service")
328-
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
329-
.with(user: 'root')
330-
.with(group: 'root')
331-
.with(mode: '0644')
332-
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" })
333-
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
401+
# NOTE(review): force_indicator is a one-element Array here, so this include? never matches
# and the 'does configure nvidia-imex' example below is never defined — confirm.
if ['true', 'yes', true].include?(force_indicator)
402+
it 'does configure nvidia-imex' do
403+
is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
334404
end
335405
else
336-
it 'it starts nvidia-imex service' do
337-
is_expected.to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg")
338-
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
339-
.with(user: 'root')
340-
.with(group: 'root')
341-
.with(mode: '0755')
342-
is_expected.to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg")
343-
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
344-
.with(user: 'root')
345-
.with(group: 'root')
346-
.with(mode: '0755')
347-
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" })
348-
is_expected.to create_template("/etc/systemd/system/nvidia-imex.service")
349-
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
350-
.with(user: 'root')
351-
.with(group: 'root')
352-
.with(mode: '0644')
353-
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" })
354-
is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
406+
it 'does not configure nvidia-imex' do
407+
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
355408
end
356409
end
357410
end
358411
end
359-
360-
context "when get_nvswitch_count <= 1" do
361-
cached(:chef_run) do
362-
stubs_for_provider('nvidia_imex[configure]') do |pro|
363-
allow(pro).to receive(:imex_installed?).and_return(true)
364-
allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
365-
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(1)
366-
end
367-
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
368-
ConvergeNvidiaImex.configure(runner)
369-
end
370-
cached(:node) { chef_run.node }
371-
372-
before do
373-
chef_run.node.override['cluster']['region'] = 'aws_region'
374-
end
375-
376-
it 'does not configure nvidia-imex' do
377-
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
378-
end
379-
end
380412
end
381413
end
382414
end

0 commit comments

Comments
 (0)