diff --git a/checks/apps/pytorch/pytorch_allreduce.py b/checks/apps/pytorch/pytorch_allreduce.py
index 0273b3033..e8c4aa15b 100644
--- a/checks/apps/pytorch/pytorch_allreduce.py
+++ b/checks/apps/pytorch/pytorch_allreduce.py
@@ -22,6 +22,7 @@
 @rfm.simple_test
 class PyTorchNCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'All-reduce PyTorch benchmark with CE (NCCL version)'
     valid_systems = ['+nvgpu']
     valid_prog_environs = ['builtin']
     num_nodes = variable(int, value=8)
@@ -31,7 +32,6 @@ class PyTorchNCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin):

     # NOTE: only the "-py3" image is supported by the test
     supported_flavors = ["-py3"]
-
     pytorch_tags = nvidia_image_tags('pytorch')
     latest_tags = []

@@ -68,7 +68,7 @@ def set_image(self):
             'aws_ofi_nccl.enabled': 'true',
             'aws_ofi_nccl.variant': 'cuda12',
         },
-        }
+    }

     @run_after('setup')
     def setup_test(self):
@@ -82,7 +82,7 @@ def setup_test(self):

     @run_after('setup')
     def set_executable_opts(self):
-        self.prerun_cmds = ['wget https://jfrog.svc.cscs.ch/artifactory/cscs-reframe-tests/PyTorch/all_reduce_bench.py'] # noqa: E501
+        self.prerun_cmds = ['wget https://jfrog.svc.cscs.ch/artifactory/cscs-reframe-tests/PyTorch/all_reduce_bench.py']  # noqa: E501
         headnode_cmd = (
             '$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)'
         )
@@ -103,12 +103,12 @@ def assert_sanity(self):

     @performance_function('GB/s')
     def bandwidth(self):
         return sn.extractsingle(r'\|\s*16GiB\s*\|\s*(?P<busbw>\S+)GBps\s*\|',
-                                self.stdout, tag='busbw', conv=float
-                                )
+                                self.stdout, tag='busbw', conv=float)


 @rfm.simple_test
 class PyTorchRCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'All-reduce PyTorch benchmark with CE (RCCL version)'
     valid_systems = ['+amdgpu']
     valid_prog_environs = ['builtin']
     num_nodes = variable(int, value=8)
@@ -116,7 +116,6 @@ class PyTorchRCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
     curated_images = [
         'rocm/pytorch:rocm6.3.3_ubuntu24.04_py3.12_pytorch_release_2.4.0'
     ]
-
     image = parameter(curated_images)  # + latest_images)
     executable = 'torchrun'
     num_tasks_per_node = 1
@@ -165,7 +164,6 @@ def set_executable_opts(self):

     @run_after('setup')
     def set_nccl_min_nchannels(self):
         gpu_devices = self.current_partition.select_devices('gpu')[0]
-        # https://rocm.docs.amd.com/projects/rccl/en/latest/how-to/rccl-usage-tips.html#improving-performance-on-the-mi300x-accelerator-when-using-fewer-than-8-gpus noqa: E501
         if gpu_devices.num_devices < 8 and gpu_devices.arch == 'gfx942':
             self.env_vars['NCCL_MIN_NCHANNELS'] = 32
diff --git a/checks/apps/pytorch/pytorch_megatronlm.py b/checks/apps/pytorch/pytorch_megatronlm.py
index 23d6ebbbc..4708d32ae 100644
--- a/checks/apps/pytorch/pytorch_megatronlm.py
+++ b/checks/apps/pytorch/pytorch_megatronlm.py
@@ -228,6 +228,9 @@ class PyTorchMegatronLM(rfm.RunOnlyRegressionTest):
+    descr = (
+        'Megatron tests with synthetic data, with options for large scale '
+        'and real data tests')

     @run_after('setup')
     def setup_test(self):
         model_config = self.configurations[self.model]
         if self.default_num_nodes is None:
             self.num_nodes = model_config['num_nodes']
diff --git a/checks/containers/container_engine/check_cuda_nbody.py b/checks/containers/container_engine/check_cuda_nbody.py
index 040ad9c87..a0aaea2ad 100644
--- a/checks/containers/container_engine/check_cuda_nbody.py
+++ b/checks/containers/container_engine/check_cuda_nbody.py
@@ -16,6 +16,7 @@
 @rfm.simple_test
 class CudaNBodyCheckCE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'Single-node N-Body test for GPUs using CE (from CUDA samples)'
     valid_systems = ['+ce +nvgpu']
     valid_prog_environs = ['builtin']
     sourcesdir = None
diff --git a/checks/containers/container_engine/cuda_mps.py b/checks/containers/container_engine/cuda_mps.py
index 9cc5e13cd..8c8b349e6 100644
--- a/checks/containers/container_engine/cuda_mps.py
+++ b/checks/containers/container_engine/cuda_mps.py
@@ -17,6 +17,7 @@
 @rfm.simple_test
 class CUDA_MPS_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'Check for CUDA MPS with CE'
     valid_prog_environs = ['builtin']
     valid_systems = ['+ce +nvgpu']
     test_name = 'cuda_mps'
@@ -36,4 +37,3 @@ class CUDA_MPS_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
     @sanity_function
     def assert_sanity(self):
         return sn.assert_found(r'^\d+ nvidia-cuda-mps-control -d$', self.stdout)
-
diff --git a/checks/containers/container_engine/omb.py b/checks/containers/container_engine/omb.py
index 5fa5a447d..f1fe8738a 100644
--- a/checks/containers/container_engine/omb.py
+++ b/checks/containers/container_engine/omb.py
@@ -75,6 +75,7 @@ def set_perf(self):

 @rfm.simple_test
 class OMB_MPICH_CE(OMB_Base_CE):
+    descr = 'OSU Micro-benchmarks for MPICH/CE (Point-to-Point and All-to-All)'
     container_image = (
         'jfrog.svc.cscs.ch#reframe-oci/osu-mb:7.5-mpich4.3.0-ofi1.15-cuda12.8'
     )
@@ -99,6 +100,7 @@ def set_pmi2(self):

 @rfm.simple_test
 class OMB_OMPI_CE(OMB_Base_CE):
+    descr = 'OSU Micro-benchmarks for OpenMPI/CE (Point-to-Point and All-to-All)'
     container_image = (f'jfrog.svc.cscs.ch#reframe-oci/osu-mb:7.5-ompi5.0.7-ofi1.15-cuda12.8')
     valid_systems = ['+ce +nvgpu']
     reference_per_test = {
diff --git a/checks/containers/container_engine/ssh.py b/checks/containers/container_engine/ssh.py
index e499224ff..e8602dd7a 100644
--- a/checks/containers/container_engine/ssh.py
+++ b/checks/containers/container_engine/ssh.py
@@ -17,6 +17,7 @@
 @rfm.simple_test
 class SSH_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'Checks if SSH is available with CE'
     valid_prog_environs = ['builtin']
     valid_systems = ['+ce']
     test_name = 'ssh'
@@ -36,4 +37,3 @@ class SSH_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
     @sanity_function
     def assert_sanity(self):
         return sn.assert_found(r'^\d+ /opt/oci-hooks/ssh/dropbear/bin/dropbear.*-p 15263.*$', self.stdout)
-
diff --git a/checks/containers/container_engine/xccl_tests.py b/checks/containers/container_engine/xccl_tests.py
index acc186a94..5bce0209c 100644
--- a/checks/containers/container_engine/xccl_tests.py
+++ b/checks/containers/container_engine/xccl_tests.py
@@ -87,6 +87,7 @@ def set_perf(self):

 @rfm.simple_test
 class NCCLTestsCE(XCCLTestBase):
+    descr = 'Point-to-Point and All-Reduce NCCL tests with CE'
     valid_systems = ['+ce +nvgpu']
     image_tag = parameter(['cuda12.9.1'])

@@ -126,6 +127,7 @@ def setup_ce(self):

 @rfm.simple_test
 class RCCLTestCE(XCCLTestBase):
+    descr = 'Point-to-Point and All-Reduce RCCL tests with CE'
     valid_systems = ['+ce +amdgpu']
     image_tag = parameter(['rocm6.3.4'])
     min_bytes = '4096M'
diff --git a/checks/microbenchmarks/cpu_gpu/node_burn/node-burn-ce.py b/checks/microbenchmarks/cpu_gpu/node_burn/node-burn-ce.py
index a96b3aa5f..8fa53c63a 100644
--- a/checks/microbenchmarks/cpu_gpu/node_burn/node-burn-ce.py
+++ b/checks/microbenchmarks/cpu_gpu/node_burn/node-burn-ce.py
@@ -108,6 +108,7 @@ def nb_gbps(self):

 @rfm.simple_test
 class CudaNodeBurnGemmCE(NodeBurnGemmCE):
+    descr = 'GPU Node burn GEMM test for A100/GH200 using CE'
     executable = 'burn-f64'
     ref_nb_gflops = {
         'a100': {'nb_gflops': (9746*2*0.85, -0.1, None, 'GFlops')},
@@ -134,6 +135,7 @@ def setup_job(self):

 @rfm.simple_test
 class CPUNodeBurnGemmCE(NodeBurnGemmCE):
+    descr = 'CPU Node burn GEMM test for A100/GH200-nodes using CE'
     executable = 'burn-f64-cpu'
     ref_nb_gflops = {
         'gh200': {'nb_gflops': (3150, -0.1, None, 'GFlops')},
@@ -178,6 +180,7 @@ def setup_job(self):

 @rfm.simple_test
 class CudaNodeBurnStreamCE(NodeBurnStreamCE):
+    descr = 'GPU Node burn Stream test for A100/GH200 using CE'
     executable = 'burn-f64'
     ref_nb_gbps = {
         'a100': {'nb_gbps': (2 * 1000 * 0.95, -0.1, None, 'GB/s')},
@@ -207,6 +210,7 @@ def setup_job(self):

 @rfm.simple_test
 class CPUNodeBurnStreamCE(NodeBurnStreamCE):
+    descr = 'CPU Node burn Stream test for A100/GH200-nodes using CE'
     executable = 'burn-f64-cpu'
     ref_nb_gbps = {
         'gh200': {'nb_gbps': (450.0, -0.1, None, 'GB/s')},
diff --git a/checks/system/ce/ce_import_run_image.py b/checks/system/ce/ce_import_run_image.py
index 6501b4e58..8262a1788 100644
--- a/checks/system/ce/ce_import_run_image.py
+++ b/checks/system/ce/ce_import_run_image.py
@@ -42,6 +42,7 @@ class enroot_import_image_ngc(enroot_import_image):

 @rfm.simple_test
 class RunJobCE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'CE check with Dockerhub import and simple image run (ubuntu)'
     valid_systems = ['+ce']
     valid_prog_environs = ['builtin']
     container_image = ''  # Defined after setup
@@ -62,6 +63,7 @@ def assert_found_found_ubuntu(self):

 @rfm.simple_test
 class RunNVGPUJobCE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'CE check with NGC import and Stream job on GPU'
     valid_systems = ['+ce +nvgpu']
     valid_prog_environs = ['builtin']
     container_image = ''  # Defined after setup
diff --git a/checks/system/slurm/gres_gpu.py b/checks/system/slurm/gres_gpu.py
index dbeaf6c88..798ea5125 100644
--- a/checks/system/slurm/gres_gpu.py
+++ b/checks/system/slurm/gres_gpu.py
@@ -9,7 +9,7 @@

 @rfm.simple_test
 class SlurmGPUGresTest(rfm.RunOnlyRegressionTest):
-    '''Ensure that the Slurm GRES (Gereric REsource Scheduling) of the number
+    descr = '''Ensure that the Slurm GRES (Generic REsource Scheduling) of the number
     of gpus is correctly set on all the nodes of each partition.

     For the current partition, the test performs the following steps:
diff --git a/checks/system/slurm/slurm.py b/checks/system/slurm/slurm.py
index d2649403d..7694831d3 100644
--- a/checks/system/slurm/slurm.py
+++ b/checks/system/slurm/slurm.py
@@ -14,51 +14,44 @@
 class SlurmSimpleBaseCheck(rfm.RunOnlyRegressionTest):
     '''Base class for Slurm simple binary tests'''

-    valid_systems = ['daint:normal', 'eiger:mc', 'pilatus:mc']
-    valid_prog_environs = ['PrgEnv-cray']
+    valid_systems = ['+remote']
+    valid_prog_environs = ['+prgenv']
     tags = {'slurm', 'maintenance', 'ops', 'production', 'single-node'}
     num_tasks_per_node = 1
-
-    @run_after('init')
-    def customize_systems(self):
-        if self.current_system.name in ['arolla', 'tsa']:
-            self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-pgi']
-            self.exclusive_access = True
+    # TODO: maintainers


 class SlurmCompiledBaseCheck(rfm.RegressionTest):
     '''Base class for Slurm tests that require compiling some code'''
-
-    valid_systems = []
-    valid_prog_environs = ['PrgEnv-cray']
-    tags = {'slurm', 'maintenance', 'ops',
-            'production', 'single-node'}
+    valid_systems = ['+remote']
+    valid_prog_environs = ['+prgenv']
+    build_locally = False
+    tags = {'slurm', 'maintenance', 'ops', 'production', 'single-node'}
     num_tasks_per_node = 1


 @rfm.simple_test
 class HostnameCheck(SlurmSimpleBaseCheck):
+    descr = 'Check hostname pattern nidXXXXXX on the CN'
+    sourcesdir = None
+    time_limit = '1m'
     executable = '/bin/hostname'
     valid_prog_environs = ['builtin']
-    hostname_patt = {
-        'daint:normal': r'^nid\d{6}$',
-        'eiger:mc': r'^nid\d{6}$',
-        'pilatus:mc': r'^nid\d{6}$'
-    }
+    tags.add('flexible')

     @run_before('sanity')
     def set_sanity_patterns(self):
-        partname = self.current_partition.fullname
-        num_matches = sn.count(
-            sn.findall(self.hostname_patt[partname], self.stdout)
-        )
+        num_matches = sn.count(sn.findall(r'^nid\d{6}$', self.stdout))
         self.sanity_patterns = sn.assert_eq(self.num_tasks, num_matches)


 @rfm.simple_test
 class EnvironmentVariableCheck(SlurmSimpleBaseCheck):
+    descr = 'Test if user env variables are propagated to CN'
+    sourcesdir = None
+    time_limit = '1m'
     num_tasks = 2
-    valid_systems = ['daint:normal', 'eiger:mc', 'pilatus:mc']
+    valid_prog_environs = ['builtin']
     executable = '/bin/echo'
     executable_opts = ['$MY_VAR']
     env_vars = {'MY_VAR': 'TEST123456!'}
@@ -72,7 +65,11 @@ def assert_num_tasks(self):

 @rfm.simple_test
 class RequiredConstraintCheck(SlurmSimpleBaseCheck):
-    valid_systems = []
+    descr = 'Test if -C constraint is required (deprecated)'
+    sourcesdir = None
+    time_limit = '1m'
+    valid_prog_environs = ['builtin']
+    valid_systems = []  # will never run, we use slurm partitions now
     executable = 'srun'
     executable_opts = ['-A', osext.osgroup(), 'hostname']

@@ -86,24 +83,32 @@ def assert_found_missing_constraint(self):

 @rfm.simple_test
 class RequestLargeMemoryNodeCheck(SlurmSimpleBaseCheck):
-    valid_systems = []
+    descr = 'Check if slurm memory flag works (deprecated, replaced by MemoryOverconsumptionCheck)'
+    sourcesdir = None
+    time_limit = '1m'
+    valid_systems = []  # use MemoryOverconsumptionCheck instead
+    valid_prog_environs = ['builtin']
     executable = '/usr/bin/free'
     executable_opts = ['-h']

+    @run_before('run')
+    def set_memory_limit(self):
+        self.job.options = ['--mem=120000']
+
     @sanity_function
     def assert_memory_is_bounded(self):
         mem_obtained = sn.extractsingle(r'Mem:\s+(?P<mem>\S+)G',
                                         self.stdout, 'mem', float)
-        return sn.assert_bounded(mem_obtained, 122.0, 128.0)
-
-    @run_before('run')
-    def set_memory_limit(self):
-        self.job.options = ['--mem=120000']
+        return sn.assert_bounded(mem_obtained, 122.0, None)


 @rfm.simple_test
-class DefaultRequestGPU(SlurmSimpleBaseCheck):
-    valid_systems = ['daint:normal']
+class NvidiaSmiDriverVersion(SlurmSimpleBaseCheck):
+    descr = 'Nvidia-smi sanity check (output driver version)'
+    sourcesdir = None
+    time_limit = '1m'
+    valid_prog_environs = ['builtin']
+    valid_systems = ['+nvgpu']
     executable = 'nvidia-smi'

     @sanity_function
@@ -114,17 +119,24 @@ def asser_found_nvidia_driver_version(self):

 @rfm.simple_test
 class DefaultRequestGPUSetsGRES(SlurmSimpleBaseCheck):
-    valid_systems = []
+    descr = 'Checks slurm config for 4-GPUs per node'
+    sourcesdir = None
+    time_limit = '1m'
+    valid_prog_environs = ['builtin']
+    valid_systems = ['+gpu']
     executable = 'scontrol show job ${SLURM_JOB_ID}'
+    tags.add('flexible')

     @sanity_function
     def assert_found_resources(self):
-        return sn.assert_found(r'.*(TresPerNode|Gres)=.*gpu=4.*', self.stdout)
+        return sn.assert_found(r'.*(AllocTRES|Gres)=.*gres/gpu=4.*',
+                               self.stdout)


 @rfm.simple_test
 class DefaultRequest(SlurmSimpleBaseCheck):
-    valid_systems = ['daint:normal']
+    descr = 'Sanity check for core count (needs to be updated)'
+    valid_systems = []  # will never run, TODO: use .reframe/topology/
     # This is a basic test that should return the number of CPUs on the
     # system which, on a MC node should be 72
     executable = 'lscpu -p |grep -v "^#" -c'
@@ -136,66 +148,58 @@ def assert_found_num_cpus(self):

 @rfm.simple_test
 class ConstraintRequestCabinetGrouping(SlurmSimpleBaseCheck):
-    valid_systems = []
+    descr = 'Checks if constraint works for requesting specific cabinets (deprecated, needs attention)'
+    valid_systems = []  # will never run, TODO: update
     executable = 'cat /proc/cray_xt/cname'
     cabinets = {
         'daint:gpu': 'c0-1',
         'daint:mc': 'c1-0',
     }

-    @sanity_function
-    def assert_found_cabinet(self):
-        # We choose a default pattern that will cause assert_found() to fail
-        cabinet = self.cabinets.get(self.current_system.name, r'$^')
-        return sn.assert_found(fr'{cabinet}.*', self.stdout)
-
     @run_before('run')
     def set_slurm_constraint(self):
         cabinet = self.cabinets.get(self.current_partition.fullname)
         if cabinet:
             self.job.options = [f'--constraint={cabinet}']

+    @sanity_function
+    def assert_found_cabinet(self):
+        # We choose a default pattern that will cause assert_found() to fail
+        cabinet = self.cabinets.get(self.current_system.name, r'$^')
+        return sn.assert_found(fr'{cabinet}.*', self.stdout)
+

 @rfm.simple_test
 class MemoryOverconsumptionCheck(SlurmCompiledBaseCheck):
-    time_limit = '1m'
-    valid_systems = ['daint:normal', 'eiger:mc', 'pilatus:mc']
+    descr = 'Tests if requested memory limit works'
+    time_limit = '2m'
     tags.add('mem')
-    sourcepath = 'eatmemory.c'
+    build_system = 'SingleSource'
+    sourcepath = 'eatmem/eatmemory.c'
     executable_opts = ['4000M']

+    @run_before('run')
+    def set_memory_limit(self):
+        self.job.options = ['--mem=2000']
+
     @sanity_function
     def assert_found_exceeded_memory(self):
         return sn.assert_found(r'(exceeded memory limit)|(Out Of Memory)',
                                self.stderr)

-    @run_before('run')
-    def set_memory_limit(self):
-        self.job.options = ['--mem=2000']
-

 @rfm.simple_test
 class MemoryOverconsumptionMpiCheck(SlurmCompiledBaseCheck):
-    maintainers = ['@jgphpc', '@ekouts']
+    descr = 'Tests for max allocatable memory'
+    # TODO: maintainers = ['@jgphpc', '@ekouts']
     valid_systems = ['+remote']
+    valid_prog_environs = ['+prgenv +mpi']
     time_limit = '5m'
     build_system = 'SingleSource'
-    sourcepath = 'eatmemory_mpi.c'
-    env_vars = {'MPICH_GPU_SUPPORT_ENABLED': 0}
+    sourcepath = 'eatmem/eatmemory_mpi.c'
+    # env_vars = {'MPICH_GPU_SUPPORT_ENABLED': 0}
     tags.add('mem')

-    @run_before('compile')
-    def unset_ldflags(self):
-        if 'alps' in self.current_partition.features:
-            self.build_system.ldflags = ['-L.']
-
-    @run_before('run')
-    def set_job_parameters(self):
-        # fix for "MPIR_pmi_init(83)....: PMI2_Job_GetId returned 14"
-        self.job.launcher.options += (
-            self.current_environ.extras.get('launcher_options', [])
-        )
-
     @run_before('run')
     def set_num_tasks(self):
         self.skip_if_no_procinfo()
@@ -212,17 +216,19 @@ def assert_found_oom(self):

     @performance_function('GB')
     def cn_avail_memory_from_sysconf(self):
         regex = r'memory from sysconf: total: \S+ \S+ avail: (?P<mem>\S+) GB'
-        return sn.extractsingle(regex, self.stdout, 'mem', int)
+        # return float to avoid truncation in Elastic
+        return sn.extractsingle(regex, self.stdout, 'mem', float)

     @performance_function('GB')
     def cn_max_allocated_memory(self):
         regex = (r'^Eating \d+ MB\/mpi \*\d+mpi = -\d+ MB memory from \/proc\/'
                  r'meminfo: total: \d+ GB, free: \d+ GB, avail: \d+ GB, using:'
                  r' (\d+) GB')
-        return sn.max(sn.extractall(regex, self.stdout, 1, int))
+        # return float to avoid truncation in Elastic
+        return sn.max(sn.extractall(regex, self.stdout, 1, float))

     @run_before('performance')
-    def set_references(self):
+    def set_reference_from_config_systems_file(self):
         reference_mem = self.current_partition.extras['cn_memory'] - 3
         self.reference = {
             '*': {
@@ -233,8 +239,9 @@

 @rfm.simple_test
 class slurm_response_check(rfm.RunOnlyRegressionTest):
+    descr = 'Slurm basic commands test (squeue, sacct)'
     command = parameter(['squeue', 'sacct'])
-    descr = 'Slurm command test'
+    sourcesdir = None
     valid_systems = ['-remote']
     valid_prog_environs = ['builtin']
     num_tasks = 1
@@ -249,7 +256,7 @@ class slurm_response_check(rfm.RunOnlyRegressionTest):
     }
     executable = 'time -p'
     tags = {'diagnostic', 'health'}
-    maintainers = ['CB', 'VH']
+    # TODO: maintainers = ['CB', 'VH']

     @run_before('run')
     def set_exec_opts(self):
@@ -283,8 +290,7 @@ def get_system_partitions():

 @rfm.simple_test
 class SlurmQueueStatusCheck(rfm.RunOnlyRegressionTest):
-    '''check system queue status'''
-
+    descr = 'check system queue status (# of nodes)'
     valid_systems = ['-remote']
     valid_prog_environs = ['builtin']
     tags = {'slurm', 'ops', 'production', 'single-node'}
@@ -330,12 +336,12 @@ def assert_percentage_nodes(self):
         matches = sn.extractall(
             fr'^{re.escape(self.slurm_partition)},up,'
             fr'(?P<nodes>\d+),(allocated|reserved|idle|mixed)',
-            self.stdout, 'nodes', int
+            self.stdout, 'nodes', float
         )
         num_matches = sn.sum(matches)
         all_matches = sn.extractall(fr'^{re.escape(self.slurm_partition)},up,'
                                     fr'(?P<nodes>\d+),.*', self.stdout,
-                                    'nodes', int)
+                                    'nodes', float)
         self.num_all_matches = sn.sum(all_matches)
         diff_matches = self.num_all_matches - num_matches
         return sn.assert_le(diff_matches,
@@ -365,7 +371,7 @@ def idle_nodes(self):
             sn.extractall(
                 fr'^{re.escape(self.slurm_partition)},up,'
                 fr'(?P<nodes>\d+),idle',
-                self.stdout, 'nodes', int
+                self.stdout, 'nodes', float
             )
         )

@@ -375,7 +381,7 @@ def allocated_nodes(self):
             sn.extractall(
                 fr'^{re.escape(self.slurm_partition)},up,'
                 fr'(?P<nodes>\d+),allocated',
-                self.stdout, 'nodes', int
+                self.stdout, 'nodes', float
             )
         )

@@ -385,7 +391,7 @@ def mixed_nodes(self):
             sn.extractall(
                 fr'^{re.escape(self.slurm_partition)},up,'
                 fr'(?P<nodes>\d+),mixed',
-                self.stdout, 'nodes', int
+                self.stdout, 'nodes', float
             )
         )

@@ -395,7 +401,7 @@ def reserved_nodes(self):
             sn.extractall(
                 fr'^{re.escape(self.slurm_partition)},up,'
                 fr'(?P<nodes>\d+),reserved',
-                self.stdout, 'nodes', int
+                self.stdout, 'nodes', float
             )
         )

@@ -410,6 +416,7 @@ def available_nodes_percentage(self):

 @rfm.simple_test
 class SlurmPrologEpilogCheck(rfm.RunOnlyRegressionTest):
+    descr = 'Runs Prolog and Epilog tests'
     valid_systems = ['*']
     valid_prog_environs = ['builtin']
     time_limit = '2m'
@@ -446,7 +453,7 @@ def validate(self):

 @rfm.simple_test
 class SlurmTransparentHugepagesCheck(rfm.RunOnlyRegressionTest):
-    '''Check Slurm transparent hugepages configuration'''
+    descr = 'Checks if Slurm transparent hugepages constraint works'

     hugepages_options = parameter(['default', 'always', 'madvise', 'never'])
     valid_systems = ['+hugepages_slurm']
@@ -494,3 +501,39 @@ class SlurmParanoidCheck(rfm.RunOnlyRegressionTest):
     @sanity_function
     def validate(self):
         return sn.assert_found(r'0', self.stdout)
+
+
+@rfm.simple_test
+class SlurmGPUGresTest(SlurmSimpleBaseCheck):
+    descr = '''Ensure that the Slurm GRES (Generic REsource Scheduling) of the
+    number of gpus is correctly set on all the nodes of each partition.
+
+    For the current partition, the test performs the following steps:
+    1) count the number of nodes (node_count)
+    2) count the number of nodes having Gres=gpu:N (gres_count) where
+       N=num_devices from the configuration
+    3) ensure that 1) and 2) match
+    '''
+    valid_systems = ['+scontrol +gpu']
+    valid_prog_environs = ['builtin']
+    sourcesdir = None
+    time_limit = '1m'
+    num_tasks_per_node = 1
+    executable = 'scontrol'
+    executable_opts = ['show', 'nodes', '--oneliner']
+    tags = {'production', 'maintenance'}
+
+    @sanity_function
+    def assert_gres_valid(self):
+        partition_name = self.current_partition.name
+        gpu_count = self.current_partition.select_devices('gpu')[0].num_devices
+        part_re = rf'Partitions=\S*{partition_name}'
+        # NOTE: the trailing space prevents matching a longer count,
+        # e.g. gres/gpu=40 when gpu_count is 4
+        gres_re = rf'gres/gpu={gpu_count} '
+        node_count = sn.count(sn.extractall(part_re, self.stdout))
+        gres_count = sn.count(
+            sn.extractall(rf'{part_re}.*{gres_re}', self.stdout))
+        return sn.assert_eq(
+            node_count, gres_count,
+            f'{gres_count}/{node_count} of '
+            f'{partition_name} nodes satisfy {gres_re}'
+        )
\ No newline at end of file