From 4403f3be6997eaafb2c5c77b87a399a17a4d68be Mon Sep 17 00:00:00 2001 From: Guilherme Date: Tue, 23 Sep 2025 15:20:16 +0200 Subject: [PATCH 01/11] Add descr pytorch_allreduce.py --- checks/apps/pytorch/pytorch_allreduce.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/checks/apps/pytorch/pytorch_allreduce.py b/checks/apps/pytorch/pytorch_allreduce.py index 0273b3033..40c900c97 100644 --- a/checks/apps/pytorch/pytorch_allreduce.py +++ b/checks/apps/pytorch/pytorch_allreduce.py @@ -22,6 +22,7 @@ @rfm.simple_test class PyTorchNCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin): + descr = 'All-reduce PyTorch benchmark with CE (NCCL version)' valid_systems = ['+nvgpu'] valid_prog_environs = ['builtin'] num_nodes = variable(int, value=8) @@ -109,6 +110,7 @@ def bandwidth(self): @rfm.simple_test class PyTorchRCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin): + descr = 'All-reduce PyTorch benchmark with CE (RCCL version)' valid_systems = ['+amdgpu'] valid_prog_environs = ['builtin'] num_nodes = variable(int, value=8) From 0ff52ccb257612323408c603af7068810aa36c90 Mon Sep 17 00:00:00 2001 From: Guilherme Date: Wed, 1 Oct 2025 13:13:13 +0200 Subject: [PATCH 02/11] Add descr pytorch_megatronlm.py --- checks/apps/pytorch/pytorch_megatronlm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/checks/apps/pytorch/pytorch_megatronlm.py b/checks/apps/pytorch/pytorch_megatronlm.py index 23d6ebbbc..b5e285cdf 100644 --- a/checks/apps/pytorch/pytorch_megatronlm.py +++ b/checks/apps/pytorch/pytorch_megatronlm.py @@ -228,6 +228,7 @@ class PyTorchMegatronLM(rfm.RunOnlyRegressionTest): @run_after('setup') def setup_test(self): + descr = 'Megatron tests with synthetic data, with options for large scale and real data tests' model_config = self.configurations[self.model] if self.default_num_nodes is None: self.num_nodes = model_config['num_nodes'] From b7d8e7a6fed1fd81a6abf628ebdd0f7e6ba439c1 Mon Sep 17 00:00:00 2001 From: Guilherme Date: Wed, 1 Oct 2025 13:15:33 +0200 Subject: [PATCH 03/11] Add descr check_cuda_nbody.py --- checks/containers/container_engine/check_cuda_nbody.py | 1 + 1 file changed, 1 insertion(+) diff --git a/checks/containers/container_engine/check_cuda_nbody.py b/checks/containers/container_engine/check_cuda_nbody.py index 040ad9c87..2cef9d665 100644 --- a/checks/containers/container_engine/check_cuda_nbody.py +++ b/checks/containers/container_engine/check_cuda_nbody.py @@ -16,6 +16,7 @@ @rfm.simple_test class CudaNBodyCheckCE(rfm.RunOnlyRegressionTest, ContainerEngineMixin): + descr = 'Single-node N-Body test for GPU (from CUDA samples)' valid_systems = ['+ce +nvgpu'] valid_prog_environs = ['builtin'] sourcesdir = None From a46e8dcd3e78f87ffa109b1affbb813374de8bb7 Mon Sep 17 00:00:00 2001 From: Guilherme Date: Wed, 1 Oct 2025 15:10:34 +0200 Subject: [PATCH 04/11] Adding descriptions --- checks/containers/container_engine/check_cuda_nbody.py | 2 +- checks/containers/container_engine/cuda_mps.py | 1 + checks/containers/container_engine/omb.py | 2 ++ checks/containers/container_engine/ssh.py | 1 + checks/containers/container_engine/xccl_tests.py | 2 ++ checks/microbenchmarks/cpu_gpu/node_burn/node-burn-ce.py | 4 ++++ checks/system/ce/ce_import_run_image.py | 2 ++ checks/system/slurm/gres_gpu.py | 2 +- 8 files changed, 14 insertions(+), 2 deletions(-) diff --git a/checks/containers/container_engine/check_cuda_nbody.py b/checks/containers/container_engine/check_cuda_nbody.py index 2cef9d665..a0aaea2ad 100644 --- a/checks/containers/container_engine/check_cuda_nbody.py 
+++ b/checks/containers/container_engine/check_cuda_nbody.py @@ -16,7 +16,7 @@ @rfm.simple_test class CudaNBodyCheckCE(rfm.RunOnlyRegressionTest, ContainerEngineMixin): - descr = 'Single-node N-Body test for GPU (from CUDA samples)' + descr = 'Single-node N-Body test for GPUs using CE (from CUDA samples)' valid_systems = ['+ce +nvgpu'] valid_prog_environs = ['builtin'] sourcesdir = None diff --git a/checks/containers/container_engine/cuda_mps.py b/checks/containers/container_engine/cuda_mps.py index 9cc5e13cd..54aa2f9ff 100644 --- a/checks/containers/container_engine/cuda_mps.py +++ b/checks/containers/container_engine/cuda_mps.py @@ -17,6 +17,7 @@ @rfm.simple_test class CUDA_MPS_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin): + descr = 'Check for CUDA MPS with CE' valid_prog_environs = ['builtin'] valid_systems = ['+ce +nvgpu'] test_name = 'cuda_mps' diff --git a/checks/containers/container_engine/omb.py b/checks/containers/container_engine/omb.py index 5fa5a447d..f1fe8738a 100644 --- a/checks/containers/container_engine/omb.py +++ b/checks/containers/container_engine/omb.py @@ -75,6 +75,7 @@ def set_perf(self): @rfm.simple_test class OMB_MPICH_CE(OMB_Base_CE): + descr = 'OSU Micro-benchmarks for MPICH/CE (Point-to-Point and All-to-All)' container_image = ( 'jfrog.svc.cscs.ch#reframe-oci/osu-mb:7.5-mpich4.3.0-ofi1.15-cuda12.8' ) @@ -99,6 +100,7 @@ def set_pmi2(self): @rfm.simple_test class OMB_OMPI_CE(OMB_Base_CE): + descr = 'OSU Micro-benchmarks for OpenMPI/CE (Point-to-Point and All-to-All)' container_image = (f'jfrog.svc.cscs.ch#reframe-oci/osu-mb:7.5-ompi5.0.7-ofi1.15-cuda12.8') valid_systems = ['+ce +nvgpu'] reference_per_test = { diff --git a/checks/containers/container_engine/ssh.py b/checks/containers/container_engine/ssh.py index e499224ff..167b53444 100644 --- a/checks/containers/container_engine/ssh.py +++ b/checks/containers/container_engine/ssh.py @@ -17,6 +17,7 @@ @rfm.simple_test class SSH_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin): + descr = 'Checks if SSH is available with CE' valid_prog_environs = ['builtin'] valid_systems = ['+ce'] test_name = 'ssh' diff --git a/checks/containers/container_engine/xccl_tests.py b/checks/containers/container_engine/xccl_tests.py index acc186a94..5bce0209c 100644 --- a/checks/containers/container_engine/xccl_tests.py +++ b/checks/containers/container_engine/xccl_tests.py @@ -87,6 +87,7 @@ def set_perf(self): @rfm.simple_test class NCCLTestsCE(XCCLTestBase): + descr = 'Point-to-Point and All-Reduce NCCL tests with CE' valid_systems = ['+ce +nvgpu'] image_tag = parameter(['cuda12.9.1']) @@ -126,6 +127,7 @@ def setup_ce(self): @rfm.simple_test class RCCLTestCE(XCCLTestBase): + descr = 'Point-to-Point and All-Reduce RCCL tests with CE' valid_systems = ['+ce +amdgpu'] image_tag = parameter(['rocm6.3.4']) min_bytes = '4096M' diff --git a/checks/microbenchmarks/cpu_gpu/node_burn/node-burn-ce.py b/checks/microbenchmarks/cpu_gpu/node_burn/node-burn-ce.py index a96b3aa5f..8fa53c63a 100644 --- a/checks/microbenchmarks/cpu_gpu/node_burn/node-burn-ce.py +++ b/checks/microbenchmarks/cpu_gpu/node_burn/node-burn-ce.py @@ -108,6 +108,7 @@ def nb_gbps(self): @rfm.simple_test class CudaNodeBurnGemmCE(NodeBurnGemmCE): + descr = 'GPU Node burn GEMM test for A100/GH200 using CE' executable = 'burn-f64' ref_nb_gflops = { 'a100': {'nb_gflops': (9746*2*0.85, -0.1, None, 'GFlops')}, @@ -134,6 +135,7 @@ def setup_job(self): @rfm.simple_test class CPUNodeBurnGemmCE(NodeBurnGemmCE): + descr = 'CPU Node burn GEMM test for A100/GH200-nodes using CE' 
executable = 'burn-f64-cpu' ref_nb_gflops = { 'gh200': {'nb_gflops': (3150, -0.1, None, 'GFlops')}, @@ -178,6 +180,7 @@ def setup_job(self): @rfm.simple_test class CudaNodeBurnStreamCE(NodeBurnStreamCE): + descr = 'GPU Node burn Stream test for A100/GH200 using CE' executable = 'burn-f64' ref_nb_gbps = { 'a100': {'nb_gbps': (2 * 1000 * 0.95, -0.1, None, 'GB/s')}, @@ -207,6 +210,7 @@ def setup_job(self): @rfm.simple_test class CPUNodeBurnStreamCE(NodeBurnStreamCE): + descr = 'CPU Node burn Stream test for A100/GH200-nodes using CE' executable = 'burn-f64-cpu' ref_nb_gbps = { 'gh200': {'nb_gbps': (450.0, -0.1, None, 'GB/s')}, diff --git a/checks/system/ce/ce_import_run_image.py b/checks/system/ce/ce_import_run_image.py index 6501b4e58..8262a1788 100644 --- a/checks/system/ce/ce_import_run_image.py +++ b/checks/system/ce/ce_import_run_image.py @@ -42,6 +42,7 @@ class enroot_import_image_ngc(enroot_import_image): @rfm.simple_test class RunJobCE(rfm.RunOnlyRegressionTest, ContainerEngineMixin): + descr = 'CE check with Dockerhub import and simple image run (ubuntu)' valid_systems = ['+ce'] valid_prog_environs = ['builtin'] container_image = '' # Defined after setup @@ -62,6 +63,7 @@ def assert_found_found_ubuntu(self): @rfm.simple_test class RunNVGPUJobCE(rfm.RunOnlyRegressionTest, ContainerEngineMixin): + descr = 'CE check with NGC import and Stream job on GPU' valid_systems = ['+ce +nvgpu'] valid_prog_environs = ['builtin'] container_image = '' # Defined after setup diff --git a/checks/system/slurm/gres_gpu.py b/checks/system/slurm/gres_gpu.py index dbeaf6c88..d52494c99 100644 --- a/checks/system/slurm/gres_gpu.py +++ b/checks/system/slurm/gres_gpu.py @@ -9,7 +9,7 @@ @rfm.simple_test class SlurmGPUGresTest(rfm.RunOnlyRegressionTest): - '''Ensure that the Slurm GRES (Gereric REsource Scheduling) of the number + descr = '''Ensure that the Slurm GRES (Gereric REsource Scheduling) of the number of gpus is correctly set on all the nodes of each partition. 
For the current partition, the test performs the following steps: From 4e9f530ef45abc08d740de08a2b8315d3fe4a03e Mon Sep 17 00:00:00 2001 From: Guilherme Date: Thu, 2 Oct 2025 15:58:00 +0200 Subject: [PATCH 05/11] Adding descr to slurm tests --- checks/system/slurm/slurm.py | 199 +++++++++++++++++++++-------------- 1 file changed, 121 insertions(+), 78 deletions(-) diff --git a/checks/system/slurm/slurm.py b/checks/system/slurm/slurm.py index d2649403d..7694831d3 100644 --- a/checks/system/slurm/slurm.py +++ b/checks/system/slurm/slurm.py @@ -14,51 +14,44 @@ class SlurmSimpleBaseCheck(rfm.RunOnlyRegressionTest): '''Base class for Slurm simple binary tests''' - valid_systems = ['daint:normal', 'eiger:mc', 'pilatus:mc'] - valid_prog_environs = ['PrgEnv-cray'] + valid_systems = ['+remote'] + valid_prog_environs = ['+prgenv'] tags = {'slurm', 'maintenance', 'ops', 'production', 'single-node'} num_tasks_per_node = 1 - - @run_after('init') - def customize_systems(self): - if self.current_system.name in ['arolla', 'tsa']: - self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-pgi'] - self.exclusive_access = True + # TODO: maintainers class SlurmCompiledBaseCheck(rfm.RegressionTest): '''Base class for Slurm tests that require compiling some code''' - - valid_systems = [] - valid_prog_environs = ['PrgEnv-cray'] - tags = {'slurm', 'maintenance', 'ops', - 'production', 'single-node'} + valid_systems = ['+remote'] + valid_prog_environs = ['+prgenv'] + build_locally = False + tags = {'slurm', 'maintenance', 'ops', 'production', 'single-node'} num_tasks_per_node = 1 @rfm.simple_test class HostnameCheck(SlurmSimpleBaseCheck): + descr = 'Check hostname pattern nidXXXXXX on the CN' + sourcesdir = None + time_limit = '1m' executable = '/bin/hostname' valid_prog_environs = ['builtin'] - hostname_patt = { - 'daint:normal': r'^nid\d{6}$', - 'eiger:mc': r'^nid\d{6}$', - 'pilatus:mc': r'^nid\d{6}$' - } + tags.add('flexible') @run_before('sanity') def set_sanity_patterns(self): - partname = self.current_partition.fullname - num_matches = sn.count( - sn.findall(self.hostname_patt[partname], self.stdout) - ) + num_matches = sn.count(sn.findall(r'^nid\d{6}$', self.stdout)) self.sanity_patterns = sn.assert_eq(self.num_tasks, num_matches) @rfm.simple_test class EnvironmentVariableCheck(SlurmSimpleBaseCheck): + descr = 'Test if user env variables are propagated to CN' + sourcesdir = None + time_limit = '1m' num_tasks = 2 - valid_systems = ['daint:normal', 'eiger:mc', 'pilatus:mc'] + valid_prog_environs = ['builtin'] executable = '/bin/echo' executable_opts = ['$MY_VAR'] env_vars = {'MY_VAR': 'TEST123456!'} @@ -72,7 +65,11 @@ def assert_num_tasks(self): @rfm.simple_test class RequiredConstraintCheck(SlurmSimpleBaseCheck): - valid_systems = [] + descr = 'Test if -C constraint is required (deprecated)' + sourcesdir = None + time_limit = '1m' + valid_prog_environs = ['builtin'] + valid_systems = [] # will never run, we use slurm partitions now executable = 'srun' executable_opts = ['-A', osext.osgroup(), 'hostname'] @@ -86,24 +83,32 @@ def assert_found_missing_constraint(self): @rfm.simple_test class RequestLargeMemoryNodeCheck(SlurmSimpleBaseCheck): - valid_systems = [] + descr = 'Check if slurm memory flag works (deprecated, replaced by MemoryOverconsumptionCheck)' + sourcesdir = None + time_limit = '1m' + valid_systems = [] # use MemoryOverconsumptionCheck instead + valid_prog_environs = ['builtin'] executable = '/usr/bin/free' executable_opts = ['-h'] + @run_before('run') + def set_memory_limit(self): + self.job.options = 
['--mem=120000'] + @sanity_function def assert_memory_is_bounded(self): mem_obtained = sn.extractsingle(r'Mem:\s+(?P<mem>\S+)G', self.stdout, 'mem', float) - return sn.assert_bounded(mem_obtained, 122.0, 128.0) - - @run_before('run') - def set_memory_limit(self): - self.job.options = ['--mem=120000'] + return sn.assert_bounded(mem_obtained, 122.0, None) @rfm.simple_test -class DefaultRequestGPU(SlurmSimpleBaseCheck): - valid_systems = ['daint:normal'] +class NvidiaSmiDriverVersion(SlurmSimpleBaseCheck): + descr = 'Nvidia-smi sanity check (output driver version)' + sourcesdir = None + time_limit = '1m' + valid_prog_environs = ['builtin'] + valid_systems = ['+nvgpu'] executable = 'nvidia-smi' @sanity_function def asser_found_nvidia_driver_version(self): @@ -114,17 +119,24 @@ def asser_found_nvidia_driver_version(self): @rfm.simple_test class DefaultRequestGPUSetsGRES(SlurmSimpleBaseCheck): - valid_systems = [] + descr = 'Checks slurm config for 4 GPUs per node' + sourcesdir = None + time_limit = '1m' + valid_prog_environs = ['builtin'] + valid_systems = ['+gpu'] executable = 'scontrol show job ${SLURM_JOB_ID}' + tags.add('flexible') @sanity_function def assert_found_resources(self): - return sn.assert_found(r'.*(TresPerNode|Gres)=.*gpu=4.*', self.stdout) + return sn.assert_found(r'.*(AllocTRES|Gres)=.*gres/gpu=4.*', + self.stdout) @rfm.simple_test class DefaultRequest(SlurmSimpleBaseCheck): - valid_systems = ['daint:normal'] + descr = 'Sanity check for core count (needs to be updated)' + valid_systems = [] # will never run, TODO: use .reframe/topology/ # This is a basic test that should return the number of CPUs on the # system which, on a MC node should be 72 executable = 'lscpu -p |grep -v "^#" -c' @@ -136,66 +148,58 @@ def assert_found_num_cpus(self): @rfm.simple_test class ConstraintRequestCabinetGrouping(SlurmSimpleBaseCheck): - valid_systems = [] + descr = 'Checks if constraint works for requesting specific cabinets (deprecated, needs attention)' + valid_systems = [] # will never run, TODO: update executable = 'cat /proc/cray_xt/cname' cabinets = { 'daint:gpu': 'c0-1', 'daint:mc': 'c1-0', } - @sanity_function - def assert_found_cabinet(self): - # We choose a default pattern that will cause assert_found() to fail - cabinet = self.cabinets.get(self.current_system.name, r'$^') - return sn.assert_found(fr'{cabinet}.*', self.stdout) - @run_before('run') def set_slurm_constraint(self): cabinet = self.cabinets.get(self.current_partition.fullname) if cabinet: self.job.options = [f'--constraint={cabinet}'] + @sanity_function + def assert_found_cabinet(self): + # We choose a default pattern that will cause assert_found() to fail + cabinet = self.cabinets.get(self.current_system.name, r'$^') + return sn.assert_found(fr'{cabinet}.*', self.stdout) @rfm.simple_test class MemoryOverconsumptionCheck(SlurmCompiledBaseCheck): - time_limit = '1m' - valid_systems = ['daint:normal', 'eiger:mc', 'pilatus:mc'] + descr = 'Tests if requested memory limit works' + time_limit = '2m' tags.add('mem') - sourcepath = 'eatmemory.c' + build_system = 'SingleSource' + sourcepath = 'eatmem/eatmemory.c' executable_opts = ['4000M'] + @run_before('run') + def set_memory_limit(self): + self.job.options = ['--mem=2000'] + @sanity_function def assert_found_exceeded_memory(self): return sn.assert_found(r'(exceeded memory limit)|(Out Of Memory)', self.stderr) - @run_before('run') - def set_memory_limit(self): - self.job.options = ['--mem=2000'] @rfm.simple_test class MemoryOverconsumptionMpiCheck(SlurmCompiledBaseCheck): - maintainers = ['@jgphpc', '@ekouts'] + descr = 
'Tests for max allocatable memory' + # TODO: maintainers = ['@jgphpc', '@ekouts'] valid_systems = ['+remote'] + valid_prog_environs = ['+prgenv +mpi'] time_limit = '5m' build_system = 'SingleSource' - sourcepath = 'eatmemory_mpi.c' - env_vars = {'MPICH_GPU_SUPPORT_ENABLED': 0} + sourcepath = 'eatmem/eatmemory_mpi.c' + # env_vars = {'MPICH_GPU_SUPPORT_ENABLED': 0} tags.add('mem') - @run_before('compile') - def unset_ldflags(self): - if 'alps' in self.current_partition.features: - self.build_system.ldflags = ['-L.'] - - @run_before('run') - def set_job_parameters(self): - # fix for "MPIR_pmi_init(83)....: PMI2_Job_GetId returned 14" - self.job.launcher.options += ( - self.current_environ.extras.get('launcher_options', []) - ) - @run_before('run') def set_num_tasks(self): self.skip_if_no_procinfo() @@ -212,17 +216,19 @@ def assert_found_oom(self): @performance_function('GB') def cn_avail_memory_from_sysconf(self): regex = r'memory from sysconf: total: \S+ \S+ avail: (?P\S+) GB' - return sn.extractsingle(regex, self.stdout, 'mem', int) + # return float to avoid truncation in Elastic + return sn.extractsingle(regex, self.stdout, 'mem', float) @performance_function('GB') def cn_max_allocated_memory(self): regex = (r'^Eating \d+ MB\/mpi \*\d+mpi = -\d+ MB memory from \/proc\/' r'meminfo: total: \d+ GB, free: \d+ GB, avail: \d+ GB, using:' r' (\d+) GB') - return sn.max(sn.extractall(regex, self.stdout, 1, int)) + # return float to avoid truncation in Elastic + return sn.max(sn.extractall(regex, self.stdout, 1, float)) @run_before('performance') - def set_references(self): + def set_reference_from_config_systems_file(self): reference_mem = self.current_partition.extras['cn_memory'] - 3 self.reference = { '*': { @@ -233,8 +239,9 @@ def set_references(self): @rfm.simple_test class slurm_response_check(rfm.RunOnlyRegressionTest): + descr = 'Slurm basic commands test (squeue, sacct)' command = parameter(['squeue', 'sacct']) - descr = 'Slurm command test' + sourcesdir = None valid_systems = ['-remote'] valid_prog_environs = ['builtin'] num_tasks = 1 @@ -249,7 +256,7 @@ class slurm_response_check(rfm.RunOnlyRegressionTest): } executable = 'time -p' tags = {'diagnostic', 'health'} - maintainers = ['CB', 'VH'] + # TODO: maintainers = ['CB', 'VH'] @run_before('run') def set_exec_opts(self): @@ -283,8 +290,7 @@ def get_system_partitions(): @rfm.simple_test class SlurmQueueStatusCheck(rfm.RunOnlyRegressionTest): - '''check system queue status''' - + descr = 'check system queue status (# of nodes)' valid_systems = ['-remote'] valid_prog_environs = ['builtin'] tags = {'slurm', 'ops', 'production', 'single-node'} @@ -330,12 +336,12 @@ def assert_percentage_nodes(self): matches = sn.extractall( fr'^{re.escape(self.slurm_partition)},up,' fr'(?P\d+),(allocated|reserved|idle|mixed)', - self.stdout, 'nodes', int + self.stdout, 'nodes', float ) num_matches = sn.sum(matches) all_matches = sn.extractall(fr'^{re.escape(self.slurm_partition)},up,' fr'(?P\d+),.*', self.stdout, - 'nodes', int) + 'nodes', float) self.num_all_matches = sn.sum(all_matches) diff_matches = self.num_all_matches - num_matches return sn.assert_le(diff_matches, @@ -365,7 +371,7 @@ def idle_nodes(self): sn.extractall( fr'^{re.escape(self.slurm_partition)},up,' fr'(?P\d+),idle', - self.stdout, 'nodes', int + self.stdout, 'nodes', float ) ) @@ -375,7 +381,7 @@ def allocated_nodes(self): sn.extractall( fr'^{re.escape(self.slurm_partition)},up,' fr'(?P\d+),allocated', - self.stdout, 'nodes', int + self.stdout, 'nodes', float ) ) @@ -385,7 +391,7 @@ def 
mixed_nodes(self): sn.extractall( fr'^{re.escape(self.slurm_partition)},up,' fr'(?P<nodes>\d+),mixed', - self.stdout, 'nodes', int + self.stdout, 'nodes', float ) ) @@ -395,7 +401,7 @@ def reserved_nodes(self): sn.extractall( fr'^{re.escape(self.slurm_partition)},up,' fr'(?P<nodes>\d+),reserved', - self.stdout, 'nodes', int + self.stdout, 'nodes', float ) ) @@ -410,6 +416,7 @@ def available_nodes_percentage(self): @rfm.simple_test class SlurmPrologEpilogCheck(rfm.RunOnlyRegressionTest): + descr = 'Runs Prolog and Epilog tests' valid_systems = ['*'] valid_prog_environs = ['builtin'] time_limit = '2m' @@ -446,7 +453,7 @@ def validate(self): @rfm.simple_test class SlurmTransparentHugepagesCheck(rfm.RunOnlyRegressionTest): - '''Check Slurm transparent hugepages configuration''' + descr = 'Checks if Slurm transparent hugepages constraint works' hugepages_options = parameter(['default', 'always', 'madvise', 'never']) valid_systems = ['+hugepages_slurm'] @@ -494,3 +501,39 @@ class SlurmParanoidCheck(rfm.RunOnlyRegressionTest): @sanity_function def validate(self): return sn.assert_found(r'0', self.stdout) + + +@rfm.simple_test +class SlurmGPUGresTest(SlurmSimpleBaseCheck): + descr = '''Ensure that the Slurm GRES (Generic REsource Scheduling) of the number + of gpus is correctly set on all the nodes of each partition.''' + + ''' For the current partition, the test performs the following steps: + 1) count the number of nodes (node_count) + 2) count the number of nodes having Gres=gpu:N (gres_count) where + N=num_devices from the configuration + 3) ensure that 1) and 2) match + ''' + valid_systems = ['+scontrol +gpu'] + valid_prog_environs = ['builtin'] + sourcesdir = None + time_limit = '1m' + num_tasks_per_node = 1 + executable = 'scontrol' + executable_opts = ['show', 'nodes', '--oneliner'] + tags = {'production', 'maintenance'} + + @sanity_function + def assert_gres_valid(self): + partition_name = self.current_partition.name + gpu_count = self.current_partition.select_devices('gpu')[0].num_devices + part_re = rf'Partitions=\S*{partition_name}' + gres_re = rf'gres/gpu={gpu_count} ' + node_count = sn.count(sn.extractall(part_re, self.stdout)) + gres_count = sn.count( + sn.extractall(rf'{part_re}.*{gres_re}', self.stdout)) + return sn.assert_eq( + node_count, gres_count, + f'{gres_count}/{node_count} of ' + f'{partition_name} nodes satisfy {gres_re}' + ) \ No newline at end of file From a6967982f1d0565fddc14928f6a71f405f67a9ca Mon Sep 17 00:00:00 2001 From: Guilherme Date: Thu, 2 Oct 2025 16:02:12 +0200 Subject: [PATCH 06/11] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- checks/system/slurm/gres_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/checks/system/slurm/gres_gpu.py b/checks/system/slurm/gres_gpu.py index d52494c99..798ea5125 100644 --- a/checks/system/slurm/gres_gpu.py +++ b/checks/system/slurm/gres_gpu.py @@ -9,7 +9,7 @@ @rfm.simple_test class SlurmGPUGresTest(rfm.RunOnlyRegressionTest): - descr = '''Ensure that the Slurm GRES (Gereric REsource Scheduling) of the number + descr = '''Ensure that the Slurm GRES (Generic REsource Scheduling) of the number of gpus is correctly set on all the nodes of each partition. 
For the current partition, the test performs the following steps: From e0af08ec9be986f38450347ce7332873b0e43380 Mon Sep 17 00:00:00 2001 From: Guilherme Date: Thu, 2 Oct 2025 16:02:46 +0200 Subject: [PATCH 07/11] Update pytorch_allreduce.py --- checks/apps/pytorch/pytorch_allreduce.py | 1 - 1 file changed, 1 deletion(-) diff --git a/checks/apps/pytorch/pytorch_allreduce.py b/checks/apps/pytorch/pytorch_allreduce.py index 40c900c97..44826eacd 100644 --- a/checks/apps/pytorch/pytorch_allreduce.py +++ b/checks/apps/pytorch/pytorch_allreduce.py @@ -32,7 +32,6 @@ class PyTorchNCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin): # NOTE: only the "-py3" image is supported by the test supported_flavors = ["-py3"] - pytorch_tags = nvidia_image_tags('pytorch') latest_tags = [] From 2b4dcb6df2e930f84194a8bab15fd9e425459f85 Mon Sep 17 00:00:00 2001 From: Guilherme Date: Thu, 2 Oct 2025 16:15:20 +0200 Subject: [PATCH 08/11] Fixing syntax remarks --- checks/apps/pytorch/pytorch_allreduce.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/checks/apps/pytorch/pytorch_allreduce.py b/checks/apps/pytorch/pytorch_allreduce.py index 44826eacd..e8c4aa15b 100644 --- a/checks/apps/pytorch/pytorch_allreduce.py +++ b/checks/apps/pytorch/pytorch_allreduce.py @@ -68,7 +68,7 @@ def set_image(self): 'aws_ofi_nccl.enabled': 'true', 'aws_ofi_nccl.variant': 'cuda12', }, - } + } @run_after('setup') def setup_test(self): @@ -82,7 +82,7 @@ def setup_test(self): @run_after('setup') def set_executable_opts(self): - self.prerun_cmds = ['wget https://jfrog.svc.cscs.ch/artifactory/cscs-reframe-tests/PyTorch/all_reduce_bench.py'] # noqa: E501 + self.prerun_cmds = ['wget https://jfrog.svc.cscs.ch/artifactory/cscs-reframe-tests/PyTorch/all_reduce_bench.py'] # noqa: E501 headnode_cmd = ( '$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)' ) @@ -103,8 +103,7 @@ def assert_sanity(self): @performance_function('GB/s') def bandwidth(self): return sn.extractsingle(r'\|\s*16GiB\s*\|\s*(?P\S+)GBps\s*\|', - self.stdout, tag='busbw', conv=float - ) + self.stdout, tag='busbw', conv=float) @rfm.simple_test @@ -117,7 +116,6 @@ class PyTorchRCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin): curated_images = [ 'rocm/pytorch:rocm6.3.3_ubuntu24.04_py3.12_pytorch_release_2.4.0' ] - image = parameter(curated_images) #+ latest_images) executable = 'torchrun' num_tasks_per_node = 1 @@ -166,7 +164,6 @@ def set_executable_opts(self): @run_after('setup') def set_nccl_min_nchannels(self): gpu_devices = self.current_partition.select_devices('gpu')[0] - # https://rocm.docs.amd.com/projects/rccl/en/latest/how-to/rccl-usage-tips.html#improving-performance-on-the-mi300x-accelerator-when-using-fewer-than-8-gpus noqa: E501 if gpu_devices.num_devices < 8 and gpu_devices.arch == 'gfx942': self.env_vars['NCCL_MIN_NCHANNELS'] = 32 From 17945e4c1510e04a3b5626605064d9b0d19d7f98 Mon Sep 17 00:00:00 2001 From: Guilherme Date: Thu, 2 Oct 2025 16:24:26 +0200 Subject: [PATCH 09/11] removed long line --- checks/apps/pytorch/pytorch_megatronlm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/checks/apps/pytorch/pytorch_megatronlm.py b/checks/apps/pytorch/pytorch_megatronlm.py index b5e285cdf..4708d32ae 100644 --- a/checks/apps/pytorch/pytorch_megatronlm.py +++ b/checks/apps/pytorch/pytorch_megatronlm.py @@ -228,7 +228,9 @@ class PyTorchMegatronLM(rfm.RunOnlyRegressionTest): @run_after('setup') def setup_test(self): - descr = 'Megatron tests with synthetic data, with options for large scale and 
real data tests' + self.descr = ( + 'Megatron tests with synthetic data, with options for large scale ' + 'and real data tests') model_config = self.configurations[self.model] if self.default_num_nodes is None: self.num_nodes = model_config['num_nodes'] From 01fd3285c8beeadbc9c31b2255592b7cbdf8a141 Mon Sep 17 00:00:00 2001 From: Guilherme Date: Thu, 2 Oct 2025 16:25:42 +0200 Subject: [PATCH 10/11] Update cuda_mps.py --- checks/containers/container_engine/cuda_mps.py | 1 - 1 file changed, 1 deletion(-) diff --git a/checks/containers/container_engine/cuda_mps.py b/checks/containers/container_engine/cuda_mps.py index 54aa2f9ff..8c8b349e6 100644 --- a/checks/containers/container_engine/cuda_mps.py +++ b/checks/containers/container_engine/cuda_mps.py @@ -37,4 +37,3 @@ class CUDA_MPS_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin): @sanity_function def assert_sanity(self): return sn.assert_found(r'^\d+ nvidia-cuda-mps-control -d$', self.stdout) - From b33242803283787a72da7a136b71cbc6dfaed3f0 Mon Sep 17 00:00:00 2001 From: Guilherme Date: Thu, 2 Oct 2025 16:27:07 +0200 Subject: [PATCH 11/11] Update ssh.py --- checks/containers/container_engine/ssh.py | 1 - 1 file changed, 1 deletion(-) diff --git a/checks/containers/container_engine/ssh.py b/checks/containers/container_engine/ssh.py index 167b53444..e8602dd7a 100644 --- a/checks/containers/container_engine/ssh.py +++ b/checks/containers/container_engine/ssh.py @@ -37,4 +37,3 @@ class SSH_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin): @sanity_function def assert_sanity(self): return sn.assert_found(r'^\d+ /opt/oci-hooks/ssh/dropbear/bin/dropbear.*-p 15263.*$', self.stdout) -