Skip to content

Commit fae667e

Browse files
committed
Fix EFA installation on RHEL8
There is a conflict between the `libibverbs` and `librdmacm` packages provided by the EFA bundle and the same packages coming from the OS repository. `libibverbs` and `librdmacm` are installed as dependencies of the `hwloc-devel` and `iptables` packages by our recipes (latest available version). The EFA installer installs `libibverbs`, `librdmacm`, `libibverbs-utils`, `librdmacm-utils` and `rdma-core-devel` version 43, included in the bundle. If the system already has `libibverbs` or `librdmacm` installed, the EFA installer will skip them but will still install the `*-utils` packages, causing a version mismatch. The error message from the EFA installer is: ``` - nothing provides libibverbs(x86-64) = 43.0-1.el8 needed by libibverbs-utils-43.0-1.el8.x86_64 - nothing provides librdmacm(x86-64) = 43.0-1.el8 needed by librdmacm-utils-43.0-1.el8.x86_64 - nothing provides libibverbs(x86-64) = 43.0-1.el8 needed by rdma-core-devel-43.0-1.el8.x86_64 - nothing provides librdmacm(x86-64) = 43.0-1.el8 needed by rdma-core-devel-43.0-1.el8.x86_64 ``` With this patch we're explicitly installing the `*-utils` and `rdma-core-devel` packages from the OS to avoid the conflict at EFA installation time. The EFA installer will skip installation of all of them, so there are no issues. With this patch I'm also moving the two packages from `default_amazon2.rb` to `efa_alinux2.rb` to clarify that they are required for EFA. Note that: `hwloc-devel` is required for Slurm; `iptables` is required for IMDS configuration. References: * Same fix for Alinux2: 04fb07b * Slurm documentation mentioning hwloc: https://slurm.schedmd.com/quickstart_admin.html Signed-off-by: Enrico Usai <[email protected]>
1 parent 0c775c2 commit fae667e

File tree

9 files changed

+55
-7
lines changed

9 files changed

+55
-7
lines changed

cookbooks/aws-parallelcluster-common/resources/efa/efa_alinux2.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,9 @@ def conflicting_packages
2727
%w(openmpi-devel openmpi)
2828
end
2929
end
30+
31+
action_class do
32+
def prerequisites
33+
%w(environment-modules libibverbs-utils librdmacm-utils)
34+
end
35+
end

cookbooks/aws-parallelcluster-common/resources/efa/efa_centos7.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,9 @@ def conflicting_packages
3030
%w(openmpi-devel openmpi)
3131
end
3232
end
33+
34+
action_class do
35+
def prerequisites
36+
%w(environment-modules)
37+
end
38+
end

cookbooks/aws-parallelcluster-common/resources/efa/efa_redhat8.rb

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,13 @@ def conflicting_packages
4040
%w(openmpi-devel openmpi)
4141
end
4242
end
43+
44+
action_class do
45+
def prerequisites
46+
if redhat_ubi?
47+
%w(environment-modules)
48+
else
49+
%w(environment-modules libibverbs-utils librdmacm-utils rdma-core-devel)
50+
end
51+
end
52+
end

cookbooks/aws-parallelcluster-common/resources/efa/efa_ubuntu1804.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,9 @@ def conflicting_packages
2828
%w(libopenmpi-dev)
2929
end
3030
end
31+
32+
action_class do
33+
def prerequisites
34+
%w(environment-modules)
35+
end
36+
end

cookbooks/aws-parallelcluster-common/resources/efa/efa_ubuntu2004.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,9 @@ def conflicting_packages
2828
%w(libopenmpi-dev)
2929
end
3030
end
31+
32+
action_class do
33+
def prerequisites
34+
%w(environment-modules)
35+
end
36+
end

cookbooks/aws-parallelcluster-common/resources/efa/partial/_setup.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
package_repos 'update package repos' do
4343
action :update
4444
end
45-
package %w(environment-modules) do
45+
package prerequisites do
4646
retries 3
4747
retry_delay 5
4848
end

cookbooks/aws-parallelcluster-common/spec/unit/resources/efa_spec.rb

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,14 @@ def mock_efa_supported(supported)
4646
end
4747
end
4848

49+
prerequisites = if platform == 'redhat'
50+
%w(environment-modules libibverbs-utils librdmacm-utils rdma-core-devel)
51+
elsif platform == 'amazon'
52+
%w(environment-modules libibverbs-utils librdmacm-utils)
53+
else
54+
"environment-modules"
55+
end
56+
4957
context 'when efa installed' do
5058
before do
5159
mock_efa_installed(true)
@@ -75,7 +83,7 @@ def mock_efa_supported(supported)
7583
is_expected.not_to write_log('efa installed')
7684
is_expected.not_to remove_package(%w(openmpi-devel openmpi))
7785
is_expected.to update_package_repos('update package repos')
78-
is_expected.to install_package("environment-modules")
86+
is_expected.to install_package(prerequisites)
7987
is_expected.to create_if_missing_remote_file("#{source_dir}/aws-efa-installer.tar.gz")
8088
is_expected.not_to run_bash('install efa')
8189
end
@@ -97,7 +105,7 @@ def mock_efa_supported(supported)
97105
is_expected.not_to write_log('efa installed')
98106
is_expected.to remove_package(platform == 'ubuntu' ? ['libopenmpi-dev'] : %w(openmpi-devel openmpi))
99107
is_expected.to update_package_repos('update package repos')
100-
is_expected.to install_package("environment-modules")
108+
is_expected.to install_package(prerequisites)
101109
is_expected.to create_if_missing_remote_file("#{source_dir}/aws-efa-installer.tar.gz")
102110
.with(source: "https://efa-installer.amazonaws.com/aws-efa-installer-#{efa_version}.tar.gz")
103111
.with(mode: '0644')
@@ -124,7 +132,7 @@ def mock_efa_supported(supported)
124132
it 'installs EFA skipping kmod' do
125133
is_expected.to remove_package(platform == 'ubuntu' ? ['libopenmpi-dev'] : %w(openmpi-devel openmpi))
126134
is_expected.to update_package_repos('update package repos')
127-
is_expected.to install_package("environment-modules")
135+
is_expected.to install_package(prerequisites)
128136
is_expected.to create_if_missing_remote_file("#{source_dir}/aws-efa-installer.tar.gz")
129137
.with(source: "https://efa-installer.amazonaws.com/aws-efa-installer-#{efa_version}.tar.gz")
130138
.with(mode: '0644')

cookbooks/aws-parallelcluster-install/attributes/default_amazon2.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
libxml2-devel perl-devel tar gzip bison flex gcc gcc-c++ patch
1313
rpm-build rpm-sign system-rpm-config cscope ctags diffstat doxygen elfutils
1414
gcc-gfortran git indent intltool patchutils rcs subversion swig systemtap curl
15-
jq wget python-pip NetworkManager-config-routing-rules libibverbs-utils
16-
librdmacm-utils python3 python3-pip iptables libcurl-devel yum-plugin-versionlock
15+
jq wget python-pip NetworkManager-config-routing-rules
16+
python3 python3-pip iptables libcurl-devel yum-plugin-versionlock
1717
coreutils moreutils environment-modules bzip2)
1818

1919
# Install R via amazon linux extras

test/resources/controls/aws_parallelcluster_install/efa_spec.rb

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,13 @@
3232
control 'efa_prereq_packages_installed' do
3333
title "EFA prereq packages are installed"
3434

35-
efa_prereq_packages = %w(environment-modules)
35+
efa_prereq_packages = if os_properties.redhat8? && !os_properties.redhat_ubi?
36+
%w(environment-modules libibverbs-utils librdmacm-utils rdma-core-devel)
37+
elsif os_properties.alinux2?
38+
%w(environment-modules libibverbs-utils librdmacm-utils)
39+
else
40+
%w(environment-modules)
41+
end
3642
efa_prereq_packages.each do |pkg|
3743
describe package(pkg) do
3844
it { should be_installed }

0 commit comments

Comments
 (0)