Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

# nvidia-imex
# Directory (under the cluster shared dir) that holds the nvidia-imex configuration
# files; the nodes config file path is built from it plus the launch template id.
default['cluster']['nvidia']['imex']['shared_dir'] = "#{node['cluster']['shared_dir']}/nvidia-imex"
# Off by default. When set to true (or the strings 'true' / 'yes'), the nvidia_imex
# :configure action sets up nvidia-imex even if the gb200 NVSwitch count check
# (count > 1) does not pass. The action still requires nvidia-imex to be installed
# and the node type to be ComputeFleet.
default['cluster']['nvidia']['imex']['force_configuration'] = false

# DCV
default['cluster']['dcv']['authenticator']['user'] = "dcvextauth"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
action :configure do
return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet"
# Start nvidia-imex on p6e-gb200 and only on ComputeFleet
if get_nvswitch_count(get_device_ids['gb200']) > 1
if get_nvswitch_count(get_device_ids['gb200']) > 1 || enable_force_configuration?
# For each Compute Resource, we generate a unique NVIDIA IMEX configuration file,
# if one doesn't already exist in a common, shared location.
template nvidia_imex_nodes_conf_file do
Expand Down Expand Up @@ -104,3 +104,7 @@ def nvidia_imex_main_conf_file
# Builds the path of the nvidia-imex nodes configuration file.
#
# The file lives in the nvidia-imex shared directory and its name embeds the
# launch template id read from the node attributes.
#
# @return [String] "<shared_dir>/nodes_config_<launch_template_id>.cfg"
def nvidia_imex_nodes_conf_file
  imex_shared_dir = node['cluster']['nvidia']['imex']['shared_dir']
  template_id = node['cluster']['launch_template_id']
  "#{imex_shared_dir}/nodes_config_#{template_id}.cfg"
end

# Tells whether nvidia-imex configuration has been explicitly forced through the
# node['cluster']['nvidia']['imex']['force_configuration'] attribute.
#
# The attribute may be a real boolean or a string, so the value is normalised
# before comparison: surrounding whitespace and letter case are ignored, which
# makes 'TRUE' or ' Yes ' behave exactly like 'true' / 'yes'. Every other value,
# including nil and false, counts as "not forced".
#
# @return [Boolean] true when the attribute is true, 'true' or 'yes' (case-insensitive)
def enable_force_configuration?
  flag = node['cluster']['nvidia']['imex']['force_configuration']
  # to_s turns the boolean true into 'true' and nil into '', so one string
  # comparison covers booleans, strings and a missing value.
  %w(true yes).include?(flag.to_s.strip.downcase)
end
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,26 @@ def self.configure(chef_run)
end
end

describe 'nvidia_imex:enable_force_configuration?' do
  # Each pair is [value stored in the node attribute, result the helper must return].
  [['false', false], [false, false], ['no', false], ['true', true], [true, true], ['yes', true]].each do |force_indicator, expected_result|
    # `inspect` keeps the string and boolean variants distinguishable in the
    # context description ("\"false\"" vs "false").
    context "where node['cluster']['nvidia']['imex']['force_configuration'] is #{force_indicator.inspect}" do
      cached(:chef_run) do
        ChefSpec::SoloRunner.new(step_into: ['nvidia_imex']) do |node|
          node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
        end
      end
      cached(:resource) do
        ConvergeNvidiaImex.configure(chef_run)
        chef_run.find_resource('nvidia_imex', 'configure')
      end
      # The helper is intentionally NOT stubbed: stubbing it to return the expected
      # value and then asserting that value would pass regardless of the real
      # implementation. The result must come from the node attribute set above.
      it "we get #{expected_result}" do
        expect(resource.enable_force_configuration?).to eq(expected_result)
      end
    end
  end
end

describe 'nvidia_imex:install' do
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
Expand Down Expand Up @@ -274,109 +294,121 @@ def self.configure(chef_run)
end

describe 'nvidia_imex:configure' do
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
context "when nvidia-imex binary is not installed" do
cached(:chef_run) do
stubs_for_resource('nvidia_imex') do |res|
allow(res).to receive(:imex_installed?).and_return(false)
[%w(false), [false], %w(no), %w(true), [true], %w(yes)].each do |force_indicator|
for_all_oses do |platform, version|
context "on #{platform}#{version} with force_configuration #{force_indicator}" do
context "when nvidia-imex binary is not installed" do
cached(:chef_run) do
stubs_for_resource('nvidia_imex') do |res|
allow(res).to receive(:imex_installed?).and_return(false)
end
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
ConvergeNvidiaImex.configure(runner)
end
cached(:node) { chef_run.node }

it 'does not configure nvidia-imex' do
is_expected.not_to configure_nvidia_imex('nvidia-imex')
end
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
ConvergeNvidiaImex.configure(runner)
end
cached(:node) { chef_run.node }

it 'does not configure nvidia-imex' do
is_expected.not_to configure_nvidia_imex('nvidia-imex')
%w(HeadNode LoginNode ComputeFleet).each do |node_type|
context "when get_nvswitch_count > 1 on #{node_type} node" do
cached(:chef_run) do
stubs_for_provider('nvidia_imex[configure]') do |pro|
allow(pro).to receive(:imex_installed?).and_return(true)
allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4)
allow(pro).to receive(:enable_force_configuration?).and_return(force_indicator)
end
runner(platform: platform, version: version, step_into: ['nvidia_imex'])
end
cached(:node) { chef_run.node }

before do
chef_run.node.override['cluster']['region'] = 'aws_region'
chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
chef_run.node.override['cluster']['node_type'] = node_type
chef_run.node.override['cluster']['launch_template_id'] = launch_template_id
ConvergeNvidiaImex.configure(chef_run)
end

if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type)
it 'does not configure nvidia-imex' do
is_expected.not_to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg")
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0755')
is_expected.not_to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg")
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0755')
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" })
is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service")
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0644')
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" })
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
end
else
it 'it starts nvidia-imex service' do
is_expected.to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg")
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0755')
is_expected.to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg")
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0755')
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" })
is_expected.to create_template("/etc/systemd/system/nvidia-imex.service")
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0644')
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" })
is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
end
end
end
end
end

%w(HeadNode LoginNode ComputeFleet).each do |node_type|
context "when get_nvswitch_count > 1 on #{node_type} node" do
context "when get_nvswitch_count <= 1" do
cached(:chef_run) do
stubs_for_provider('nvidia_imex[configure]') do |pro|
allow(pro).to receive(:imex_installed?).and_return(true)
allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4)
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(1)
allow(pro).to receive(:enable_force_configuration?).and_return(force_indicator)
end
runner(platform: platform, version: version, step_into: ['nvidia_imex'])
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
ConvergeNvidiaImex.configure(runner)
end
cached(:node) { chef_run.node }

before do
chef_run.node.override['cluster']['region'] = 'aws_region'
chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
chef_run.node.override['cluster']['node_type'] = node_type
chef_run.node.override['cluster']['launch_template_id'] = launch_template_id
ConvergeNvidiaImex.configure(chef_run)
chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
end

if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type)
it 'does not configure nvidia-imex' do
is_expected.not_to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg")
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0755')
is_expected.not_to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg")
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0755')
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" })
is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service")
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0644')
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" })
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
if ['true', 'yes', true].include?(force_indicator)
it 'does configure nvidia-imex' do
is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
end
else
it 'it starts nvidia-imex service' do
is_expected.to create_template("#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg")
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0755')
is_expected.to create_template("#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg")
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0755')
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{launch_template_id}.cfg" })
is_expected.to create_template("/etc/systemd/system/nvidia-imex.service")
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
.with(user: 'root')
.with(group: 'root')
.with(mode: '0644')
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{launch_template_id}.cfg" })
is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
it 'does not configure nvidia-imex' do
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
end
end
end
end

context "when get_nvswitch_count <= 1" do
cached(:chef_run) do
stubs_for_provider('nvidia_imex[configure]') do |pro|
allow(pro).to receive(:imex_installed?).and_return(true)
allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(1)
end
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
ConvergeNvidiaImex.configure(runner)
end
cached(:node) { chef_run.node }

before do
chef_run.node.override['cluster']['region'] = 'aws_region'
end

it 'does not configure nvidia-imex' do
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
end
end
end
end
end
Loading