Skip to content

Commit 46c841d

Browse files
committed
Disable CUDA install test on alinux with P4d instances
The nvidia-fabric-manager daemon is not installed on Amazon Linux 1 AMIs and this makes the deviceQuery utility fail on P4d instances. Skipping the test as the behavior is expected.

Signed-off-by: ddeidda <[email protected]>
1 parent a1c4074 commit 46c841d

File tree

1 file changed

+32
-31
lines changed

1 file changed

+32
-31
lines changed

recipes/tests.rb

Lines changed: 32 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -378,38 +378,39 @@ module load intelmpi && mpirun --help | grep '#{node['cfncluster']['intelmpi']['
378 378
TESTNVIDIA
379 379
end
380 380

381-
bash 'test CUDA install' do
382-
cwd Chef::Config[:file_cache_path]
383-
code <<-TESTCUDA
384-
has_gpu=$(lspci | grep -o "NVIDIA")
385-
if [ -z "$has_gpu" ]; then
386-
echo "No GPU detected, no test needed."
387-
exit 0
388-
fi
389-
390-
set -e
391-
cuda_ver="#{node['cfncluster']['nvidia']['cuda_version']}"
392-
# Test CUDA installation
393-
echo "Testing CUDA install with nvcc..."
394-
export PATH=/usr/local/cuda-$cuda_ver/bin:$PATH
395-
export LD_LIBRARY_PATH=/usr/local/cuda-$cuda_ver/lib64:$LD_LIBRARY_PATH
396-
# grep CUDA version from nvcc output. If CUDA is not installed nvcc command will fail
397-
cuda_output=$(nvcc -V | grep -E -o "release [0-9]+.[0-9]+")
398-
if [ "$cuda_output" != "release $cuda_ver" ]; then
399-
echo "CUDA installed incorrectly! Installed $cuda_output but expected $cuda_ver"
400-
exit 1
401-
else
402-
echo "CUDA nvcc test passed, $cuda_output"
403-
fi
404-
405-
# Test deviceQuery
406-
echo "Testing CUDA install with deviceQuery..."
407-
/usr/local/cuda-$cuda_ver/extras/demo_suite/deviceQuery | grep -o "Result = PASS"
408-
echo "CUDA deviceQuery test passed"
409-
echo "Correctly installed CUDA $cuda_output"
410-
TESTCUDA
381+
unless node['cfncluster']['cfn_base_os'] == 'alinux' && get_nvswitches > 1
382+
bash 'test CUDA install' do
383+
cwd Chef::Config[:file_cache_path]
384+
code <<-TESTCUDA
385+
has_gpu=$(lspci | grep -o "NVIDIA")
386+
if [ -z "$has_gpu" ]; then
387+
echo "No GPU detected, no test needed."
388+
exit 0
389+
fi
390+
391+
set -e
392+
cuda_ver="#{node['cfncluster']['nvidia']['cuda_version']}"
393+
# Test CUDA installation
394+
echo "Testing CUDA install with nvcc..."
395+
export PATH=/usr/local/cuda-$cuda_ver/bin:$PATH
396+
export LD_LIBRARY_PATH=/usr/local/cuda-$cuda_ver/lib64:$LD_LIBRARY_PATH
397+
# grep CUDA version from nvcc output. If CUDA is not installed nvcc command will fail
398+
cuda_output=$(nvcc -V | grep -E -o "release [0-9]+.[0-9]+")
399+
if [ "$cuda_output" != "release $cuda_ver" ]; then
400+
echo "CUDA installed incorrectly! Installed $cuda_output but expected $cuda_ver"
401+
exit 1
402+
else
403+
echo "CUDA nvcc test passed, $cuda_output"
404+
fi
405+
406+
# Test deviceQuery
407+
echo "Testing CUDA install with deviceQuery..."
408+
/usr/local/cuda-$cuda_ver/extras/demo_suite/deviceQuery | grep -o "Result = PASS"
409+
echo "CUDA deviceQuery test passed"
410+
echo "Correctly installed CUDA $cuda_output"
411+
TESTCUDA
412+
end
411 413
end
412-
413 414
###################
414 415
# FabricManager
415 416
###################

0 commit comments

Comments
 (0)