Skip to content

Commit a38b423

Browse files
committed
Fix the way Pyxis and Enroot are configured.
1. Pyxis is disabled by default. In particular, the Enroot, SPANK and Pyxis config files required to enable it are stored in the `/opt/parallelcluster/examples` folder, where they have no effect; the user can enable Pyxis by simply moving them to the expected location.
2. Moved Pyxis and Enroot configuration to build time (there was no reason to configure Pyxis and Enroot at runtime).
3. Skip Enroot installation if Enroot is already installed.
4. Skip Pyxis installation if Pyxis is already installed.
5. The sample configuration provided for Pyxis uses the runtime path `/run/pyxis`. As per the [documentation](https://github.com/NVIDIA/pyxis/wiki/Setup#slurm-plugstack-configuration), a tmpfs should be used.
6. The sample configuration provided for Enroot uses the following paths, as suggested in the [documentation](https://github.com/NVIDIA/pyxis/wiki/Setup#enroot-configuration-example):
   1. Using tmpfs storage for `ENROOT_RUNTIME_PATH` and `ENROOT_DATA_PATH`.
   2. Using persistent local storage for `ENROOT_CACHE_PATH` and `ENROOT_CONFIG_PATH`.
7. We do not create any of the directories used in the Pyxis or Enroot sample configurations. The user is expected to create the desired directories.
8. *Minor*: Moved Pyxis attributes from the platform cookbook to the slurm cookbook: Pyxis is a SLURM plugin, so it would be conceptually wrong to define its attributes in the platform cookbook.
9. Added missing unit tests.

Signed-off-by: Giacomo Marciani <[email protected]>
1 parent e87af87 commit a38b423

File tree

17 files changed

+329
-113
lines changed

17 files changed

+329
-113
lines changed

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@
99
# ArmPL
1010
default['conditions']['arm_pl_supported'] = arm_instance?
1111

12-
# Enroot + Pyxis
12+
# Enroot
1313
default['cluster']['enroot']['version'] = '3.4.1'
14-
default['cluster']['pyxis']['version'] = '0.20.0'
14+
default['cluster']['enroot']['temporary_dir'] = '/run/enroot'
15+
default['cluster']['enroot']['persistent_dir'] = '/var/enroot'
1516

1617
# NVidia
1718
default['cluster']['nvidia']['enabled'] = 'no'

cookbooks/aws-parallelcluster-platform/recipes/config.rb

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,3 @@
2626
include_recipe 'aws-parallelcluster-platform::supervisord_config'
2727
fetch_config 'Fetch and load cluster configs'
2828
include_recipe 'aws-parallelcluster-platform::config_login' if node['cluster']['node_type'] == 'LoginNode'
29-
enroot 'Configure Enroot' do
30-
action :configure
31-
end

cookbooks/aws-parallelcluster-platform/recipes/install/directories.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
directory node['cluster']['license_dir']
2222
directory node['cluster']['configs_dir']
2323
directory node['cluster']['shared_dir']
24+
directory node['cluster']['examples_dir']
2425
directory node['cluster']['shared_dir_login_nodes']
2526

2627
# Create ParallelCluster log folder

cookbooks/aws-parallelcluster-platform/resources/enroot/partial/_enroot_common.rb

Lines changed: 9 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# frozen_string_literal: true
22
#
3-
# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License").
66
# You may not use this file except in compliance with the License.
@@ -16,52 +16,19 @@
1616
default_action :setup
1717

1818
action :setup do
19-
return if on_docker?
19+
return if on_docker? || enroot_installed
20+
2021
action_install_package
21-
end
2222

23-
action :configure do
24-
return if on_docker?
25-
return unless enroot_installed
23+
enroot_examples_dir = "#{node['cluster']['examples_dir']}/enroot"
24+
25+
directory enroot_examples_dir
2626

27-
cookbook_file "/tmp/enroot.template.conf" do
28-
source 'enroot/enroot.template.conf'
29-
cookbook 'aws-parallelcluster-platform'
27+
template "#{enroot_examples_dir}/enroot.conf" do
28+
source 'enroot/enroot.conf.erb'
3029
owner 'root'
3130
group 'root'
32-
mode '0755'
33-
action :create_if_missing
34-
end
35-
36-
bash "Configure enroot" do
37-
user 'root'
38-
code <<-ENROOT_CONFIGURE
39-
set -e
40-
ENROOT_CONFIG_RELEASE=pyxis
41-
SHARED_DIR=#{node['cluster']['shared_dir']}
42-
NONROOT_USER=#{node['cluster']['cluster_user']}
43-
mkdir -p ${SHARED_DIR}/enroot
44-
chown ${NONROOT_USER} ${SHARED_DIR}/enroot
45-
ENROOT_CACHE_PATH=${SHARED_DIR}/enroot envsubst < /tmp/enroot.template.conf > /tmp/enroot.conf
46-
mv /tmp/enroot.conf /etc/enroot/enroot.conf
47-
chmod 0644 /etc/enroot/enroot.conf
48-
49-
mkdir -p /tmp/enroot
50-
chmod 1777 /tmp/enroot
51-
mkdir -p /tmp/enroot/data
52-
chmod 1777 /tmp/enroot/data
53-
54-
chmod 1777 ${SHARED_DIR}/enroot
55-
56-
mkdir -p ${SHARED_DIR}/pyxis/
57-
chown ${NONROOT_USER} ${SHARED_DIR}/pyxis/
58-
sed -i '${s/$/ runtime_path=${SHARED_DIR}\\/pyxis/}' /opt/slurm/etc/plugstack.conf.d/pyxis.conf
59-
SHARED_DIR=${SHARED_DIR} envsubst < /opt/slurm/etc/plugstack.conf.d/pyxis.conf > /opt/slurm/etc/plugstack.conf.d/pyxis.tmp.conf
60-
mv /opt/slurm/etc/plugstack.conf.d/pyxis.tmp.conf /opt/slurm/etc/plugstack.conf.d/pyxis.conf
61-
62-
ENROOT_CONFIGURE
63-
retries 3
64-
retry_delay 5
31+
mode '0644'
6532
end
6633
end
6734

cookbooks/aws-parallelcluster-platform/spec/unit/recipes/directories_spec.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636
is_expected.to create_directory(node['cluster']['shared_dir'])
3737
end
3838

39+
it 'creates examples directory' do
40+
is_expected.to create_directory(node['cluster']['examples_dir'])
41+
end
42+
3943
it 'creates log directory' do
4044
is_expected.to create_directory(node['cluster']['log_base_dir']).with(
4145
owner: 'root',

cookbooks/aws-parallelcluster-platform/spec/unit/resources/enroot_spec.rb

Lines changed: 81 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,9 @@ def self.setup(chef_run)
99
end
1010
end
1111
end
12-
13-
def self.configure(chef_run)
14-
chef_run.converge_dsl('aws-parallelcluster-platform') do
15-
enroot 'configure' do
16-
action :configure
17-
end
18-
end
19-
end
2012
end
2113

22-
describe 'enroot:package_version' do
14+
describe 'aws-parallelcluster-platform::enroot:package_version' do
2315
for_all_oses do |platform, version|
2416
context "on #{platform}#{version}" do
2517
cached(:chef_run) do
@@ -39,7 +31,34 @@ def self.configure(chef_run)
3931
end
4032
end
4133

42-
describe 'enroot:arch_suffix' do
34+
describe 'aws-parallelcluster-platform::enroot:enroot_installed' do
35+
for_all_oses do |platform, version|
36+
context "on #{platform}#{version}" do
37+
binary = '/usr/bin/enroot'
38+
[true, false].each do |binary_exist|
39+
context "when binary #{binary} does #{'not ' unless binary_exist}exist" do
40+
cached(:chef_run) do
41+
allow(File).to receive(:exist?).with(binary).and_return(binary_exist)
42+
runner = runner(platform: platform, version: version, step_into: ['enroot'])
43+
ConvergeEnroot.setup(runner)
44+
end
45+
46+
cached(:resource) do
47+
chef_run.find_resource('enroot', 'setup')
48+
end
49+
50+
expected_result = binary_exist
51+
52+
it "returns #{expected_result}" do
53+
expect(resource.enroot_installed).to eq(expected_result)
54+
end
55+
end
56+
end
57+
end
58+
end
59+
end
60+
61+
describe 'aws-parallelcluster-platform::enroot:arch_suffix' do
4362
for_all_oses do |platform, version|
4463
context "on #{platform}#{version} - arm" do
4564
cached(:chef_run) do
@@ -81,15 +100,66 @@ def self.configure(chef_run)
81100
end
82101
end
83102

84-
describe 'enroot:setup' do
103+
describe 'aws-parallelcluster-platform::enroot:setup' do
85104
for_all_oses do |platform, version|
86105
context "on #{platform}#{version}" do
106+
cached(:cluster_examples_dir) { '/path/to/cluster/examples/dir' }
107+
cached(:enroot_persistent_dir) { '/path/to/enroot/persistent/dir' }
108+
cached(:enroot_temporary_dir) { '/path/to/enroot/temporary/dir' }
109+
110+
context "when enroot is already installed" do
111+
let(:chef_run) do
112+
stubs_for_resource('enroot') do |res|
113+
allow(res).to receive(:enroot_installed).and_return(true)
114+
end
115+
runner(platform: platform, version: version, step_into: ['enroot']) do |node|
116+
node.override['cluster']['enroot']['version'] = package_version
117+
node.override['cluster']['examples_dir'] = cluster_examples_dir
118+
end
119+
end
120+
121+
before do
122+
ConvergeEnroot.setup(chef_run)
123+
end
124+
125+
it 'does not install Enroot' do
126+
is_expected.not_to run_bash('Install enroot')
127+
end
128+
129+
it 'does not create the Enroot configuration' do
130+
is_expected.not_to create_template("#{cluster_examples_dir}/enroot/enroot.conf")
131+
end
132+
end
133+
87134
let(:chef_run) do
135+
stubs_for_resource('enroot') do |res|
136+
allow(res).to receive(:enroot_installed).and_return(false)
137+
end
88138
runner(platform: platform, version: version, step_into: ['enroot']) do |node|
89139
node.override['cluster']['enroot']['version'] = package_version
140+
node.override['cluster']['examples_dir'] = cluster_examples_dir
141+
node.override['cluster']['enroot']['persistent_dir'] = enroot_persistent_dir
142+
node.override['cluster']['enroot']['temporary_dir'] = enroot_temporary_dir
90143
end
91144
end
92145

146+
before do
147+
ConvergeEnroot.setup(chef_run)
148+
end
149+
150+
it 'installs Enroot' do
151+
is_expected.not_to run_bash('Install enroot')
152+
end
153+
154+
it 'creates the Enroot example configuration' do
155+
is_expected.to create_template("#{cluster_examples_dir}/enroot/enroot.conf").with(
156+
source: 'enroot/enroot.conf.erb',
157+
owner: 'root',
158+
group: 'root',
159+
mode: '0644'
160+
)
161+
end
162+
93163
context 'when nvidia is enabled' do
94164
before do
95165
stubs_for_provider('enroot') do |resource|
@@ -128,44 +198,3 @@ def self.configure(chef_run)
128198
end
129199
end
130200
end
131-
132-
describe 'enroot:configure' do
133-
for_all_oses do |platform, version|
134-
context "on #{platform}#{version}" do
135-
let(:chef_run) do
136-
runner(platform: platform, version: version, step_into: ['enroot'])
137-
end
138-
139-
context 'when enroot is installed' do
140-
before do
141-
stubs_for_provider('enroot') do |resource|
142-
allow(resource).to receive(:enroot_installed).and_return(true)
143-
end
144-
ConvergeEnroot.configure(chef_run)
145-
end
146-
it 'run configure enroot script' do
147-
is_expected.to run_bash('Configure enroot')
148-
.with(retries: 3)
149-
.with(retry_delay: 5)
150-
.with(user: 'root')
151-
end
152-
end
153-
154-
context 'when enroot is not installed' do
155-
before do
156-
stubs_for_provider('enroot') do |resource|
157-
allow(resource).to receive(:enroot_installed).and_return(false)
158-
end
159-
ConvergeEnroot.configure(chef_run)
160-
end
161-
162-
it 'does not run configure enroot script' do
163-
is_expected.not_to run_bash('Configure enroot')
164-
.with(retries: 3)
165-
.with(retry_delay: 5)
166-
.with(user: 'root')
167-
end
168-
end
169-
end
170-
end
171-
end

cookbooks/aws-parallelcluster-platform/files/enroot/enroot.template.conf renamed to cookbooks/aws-parallelcluster-platform/templates/enroot/enroot.conf.erb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#ENROOT_LIBRARY_PATH /usr/lib/enroot
22
#ENROOT_SYSCONF_PATH /etc/enroot
3-
ENROOT_RUNTIME_PATH /tmp/enroot/user-$(id -u)
4-
ENROOT_CONFIG_PATH ${ENROOT_CONFIG_PATH}
5-
ENROOT_CACHE_PATH ${ENROOT_CACHE_PATH}
6-
ENROOT_DATA_PATH /tmp/enroot/data/user-$(id -u)
3+
ENROOT_RUNTIME_PATH <%= node['cluster']['enroot']['temporary_dir'] %>/runtime/user-$(id -u)
4+
ENROOT_DATA_PATH <%= node['cluster']['enroot']['temporary_dir'] %>/data/user-$(id -u)
5+
ENROOT_CONFIG_PATH <%= node['cluster']['enroot']['persistent_dir'] %>/config/user-$(id -u)
6+
ENROOT_CACHE_PATH <%= node['cluster']['enroot']['persistent_dir'] %>/cache/group-$(id -g)
77
#ENROOT_TEMP_PATH ${TMPDIR:-/tmp}
88

99
# Gzip program used to uncompress digest layers.
@@ -68,4 +68,4 @@ ENROOT_RESTRICT_DEV no
6868
#all_proxy
6969
#no_proxy
7070
#http_proxy
71-
#https_proxy
71+
#https_proxy

cookbooks/aws-parallelcluster-platform/test/controls/enroot_spec.rb

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,26 @@
1414

1515
expected_enroot_version = node['cluster']['enroot']['version']
1616

17-
describe "gdrcopy version is expected to be #{expected_enroot_version}" do
17+
describe "enroot version is expected to be #{expected_enroot_version}" do
1818
subject { command('enroot version').stdout.strip() }
1919
it { should eq expected_enroot_version }
2020
end
21+
22+
persistent_dirs = %w(/etc/enroot)
23+
persistent_dirs.each do |path|
24+
describe directory(path) do
25+
it { should exist }
26+
its('owner') { should eq 'root' }
27+
its('group') { should eq 'root' }
28+
its('mode') { should cmp '0755' }
29+
end
30+
end
2131
end
2232

2333
control 'tag:config_enroot_enabled_on_graphic_instances' do
2434
only_if { !os_properties.on_docker? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) }
2535

26-
describe file("/opt/parallelcluster/shared/enroot") do
36+
describe file("/var/enroot/cache-group-1000") do
2737
it { should exist }
2838
its('group') { should eq 'root' }
2939
end unless os_properties.redhat_on_docker?

cookbooks/aws-parallelcluster-shared/attributes/cluster.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
default['cluster']['license_dir'] = "#{node['cluster']['base_dir']}/licenses"
55
default['cluster']['configs_dir'] = "#{node['cluster']['base_dir']}/configs"
66
default['cluster']['shared_dir'] = "#{node['cluster']['base_dir']}/shared"
7+
default['cluster']['examples_dir'] = "#{node['cluster']['base_dir']}/examples"
78
default['cluster']['shared_dir_login_nodes'] = "#{node['cluster']['base_dir']}/shared_login_nodes"
89
default['cluster']['log_base_dir'] = '/var/log/parallelcluster'
910
default['cluster']['etc_dir'] = '/etc/parallelcluster'

cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,10 @@
1818

1919
# Slurmdbd
2020
default['cluster']['slurmdbd_service_enabled'] = "true"
21+
22+
# Spank
23+
default['cluster']['slurm']['spank_config_dir'] = "#{node['cluster']['slurm']['install_dir']}/etc/plugstack.conf.d"
24+
25+
# Pyxis
26+
default['cluster']['pyxis']['version'] = '0.20.0'
27+
default['cluster']['pyxis']['runtime_path'] = '/run/pyxis'

0 commit comments

Comments
 (0)