eth-cscs · jgphpc · Oct 2, 2025 · Sep 23, 2025 · Oct 1, 2025 · Oct 1, 2025
diff --git a/checks/apps/pytorch/pytorch_allreduce.py b/checks/apps/pytorch/pytorch_allreduce.py
@@ -22,6 +22,7 @@
 
 @rfm.simple_test
 class PyTorchNCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'All-reduce PyTorch benchmark with CE (NCCL version)'
     valid_systems = ['+nvgpu']
     valid_prog_environs = ['builtin']
     num_nodes = variable(int, value=8)
@@ -31,7 +32,6 @@ class PyTorchNCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
     # NOTE: only the "-py3" image is supported by the test
     supported_flavors = ["-py3"]
 
-
     pytorch_tags = nvidia_image_tags('pytorch')
     latest_tags = []
 
@@ -68,7 +68,7 @@ def set_image(self):
                     'aws_ofi_nccl.enabled': 'true',
                     'aws_ofi_nccl.variant': 'cuda12',
             },
-       }
+        }
 
     @run_after('setup')
     def setup_test(self):
@@ -82,7 +82,7 @@ def setup_test(self):
 
     @run_after('setup')
     def set_executable_opts(self):
-        self.prerun_cmds = ['wget https://jfrog.svc.cscs.ch/artifactory/cscs-reframe-tests/PyTorch/all_reduce_bench.py'] # noqa: E501
+        self.prerun_cmds = ['wget https://jfrog.svc.cscs.ch/artifactory/cscs-reframe-tests/PyTorch/all_reduce_bench.py']  # noqa: E501
         headnode_cmd = (
             '$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)'
         )
@@ -103,20 +103,19 @@ def assert_sanity(self):
     @performance_function('GB/s')
     def bandwidth(self):
         return sn.extractsingle(r'\|\s*16GiB\s*\|\s*(?P<busbw>\S+)GBps\s*\|',
-                                self.stdout, tag='busbw', conv=float
-        )
+                                self.stdout, tag='busbw', conv=float)
 
 
 @rfm.simple_test
 class PyTorchRCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'All-reduce PyTorch benchmark with CE (RCCL version)'
     valid_systems = ['+amdgpu']
     valid_prog_environs = ['builtin']
     num_nodes = variable(int, value=8)
     sourcesdir = None
     curated_images = [
         'rocm/pytorch:rocm6.3.3_ubuntu24.04_py3.12_pytorch_release_2.4.0'
     ]
-
     image = parameter(curated_images) #+ latest_images)
     executable = 'torchrun'
     num_tasks_per_node = 1
@@ -165,7 +164,6 @@ def set_executable_opts(self):
     @run_after('setup')
     def set_nccl_min_nchannels(self):
         gpu_devices = self.current_partition.select_devices('gpu')[0]
-
         # https://rocm.docs.amd.com/projects/rccl/en/latest/how-to/rccl-usage-tips.html#improving-performance-on-the-mi300x-accelerator-when-using-fewer-than-8-gpus noqa: E501
         if gpu_devices.num_devices < 8 and gpu_devices.arch == 'gfx942':
             self.env_vars['NCCL_MIN_NCHANNELS'] = 32

diff --git a/checks/apps/pytorch/pytorch_megatronlm.py b/checks/apps/pytorch/pytorch_megatronlm.py
@@ -228,6 +228,9 @@ class PyTorchMegatronLM(rfm.RunOnlyRegressionTest):
 
     @run_after('setup')
     def setup_test(self):
+        self.descr = (
+            'Megatron tests with synthetic data, with options for large scale '
+            'and real data tests')
         model_config = self.configurations[self.model]
         if self.default_num_nodes is None:
             self.num_nodes = model_config['num_nodes']

diff --git a/checks/containers/container_engine/check_cuda_nbody.py b/checks/containers/container_engine/check_cuda_nbody.py
@@ -16,6 +16,7 @@
 
 @rfm.simple_test
 class CudaNBodyCheckCE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'Single-node N-Body test for GPUs using CE (from CUDA samples)'
     valid_systems = ['+ce +nvgpu']
     valid_prog_environs = ['builtin']
     sourcesdir = None

diff --git a/checks/containers/container_engine/cuda_mps.py b/checks/containers/container_engine/cuda_mps.py
@@ -17,6 +17,7 @@
 
 @rfm.simple_test
 class CUDA_MPS_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'Check for CUDA MPS with CE'
     valid_prog_environs = ['builtin']
     valid_systems = ['+ce +nvgpu']
     test_name = 'cuda_mps'
@@ -36,4 +37,3 @@ class CUDA_MPS_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
     @sanity_function
     def assert_sanity(self):
         return sn.assert_found(r'^\d+ nvidia-cuda-mps-control -d$', self.stdout)
-
diff --git a/checks/containers/container_engine/omb.py b/checks/containers/container_engine/omb.py
@@ -75,6 +75,7 @@ def set_perf(self):
 
 @rfm.simple_test
 class OMB_MPICH_CE(OMB_Base_CE):
+    descr = 'OSU Micro-benchmarks for MPICH/CE (Point-to-Point and All-to-All)'
     container_image = (
         'jfrog.svc.cscs.ch#reframe-oci/osu-mb:7.5-mpich4.3.0-ofi1.15-cuda12.8'
     )
@@ -99,6 +100,7 @@ def set_pmi2(self):
 
 @rfm.simple_test
 class OMB_OMPI_CE(OMB_Base_CE):
+    descr = 'OSU Micro-benchmarks for OpenMPI/CE (Point-to-Point and All-to-All)'
     container_image = (f'jfrog.svc.cscs.ch#reframe-oci/osu-mb:7.5-ompi5.0.7-ofi1.15-cuda12.8')
     valid_systems = ['+ce +nvgpu']
     reference_per_test = {

diff --git a/checks/containers/container_engine/ssh.py b/checks/containers/container_engine/ssh.py
@@ -17,6 +17,7 @@
 
 @rfm.simple_test
 class SSH_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'Checks if SSH is available with CE'
     valid_prog_environs = ['builtin']
     valid_systems = ['+ce']
     test_name = 'ssh'
@@ -36,4 +37,3 @@ class SSH_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
     @sanity_function
     def assert_sanity(self):
         return sn.assert_found(r'^\d+ /opt/oci-hooks/ssh/dropbear/bin/dropbear.*-p 15263.*$', self.stdout)
-
diff --git a/checks/containers/container_engine/xccl_tests.py b/checks/containers/container_engine/xccl_tests.py
@@ -87,6 +87,7 @@ def set_perf(self):
 
 @rfm.simple_test
 class NCCLTestsCE(XCCLTestBase):
+    descr = 'Point-to-Point and All-Reduce NCCL tests with CE'
     valid_systems = ['+ce +nvgpu']
     image_tag = parameter(['cuda12.9.1'])
 
@@ -126,6 +127,7 @@ def setup_ce(self):
 
 @rfm.simple_test
 class RCCLTestCE(XCCLTestBase):
+    descr = 'Point-to-Point and All-Reduce RCCL tests with CE'
     valid_systems = ['+ce +amdgpu']
     image_tag = parameter(['rocm6.3.4'])
     min_bytes = '4096M'

diff --git a/checks/microbenchmarks/cpu_gpu/node_burn/node-burn-ce.py b/checks/microbenchmarks/cpu_gpu/node_burn/node-burn-ce.py
@@ -108,6 +108,7 @@ def nb_gbps(self):
 
 @rfm.simple_test
 class CudaNodeBurnGemmCE(NodeBurnGemmCE):
+    descr = 'GPU Node burn GEMM test for A100/GH200 using CE'
     executable = 'burn-f64'
     ref_nb_gflops = {
         'a100': {'nb_gflops': (9746*2*0.85, -0.1, None, 'GFlops')},
@@ -134,6 +135,7 @@ def setup_job(self):
 
 @rfm.simple_test
 class CPUNodeBurnGemmCE(NodeBurnGemmCE):
+    descr = 'CPU Node burn GEMM test for A100/GH200-nodes using CE'
     executable = 'burn-f64-cpu'
     ref_nb_gflops = {
         'gh200': {'nb_gflops': (3150, -0.1, None, 'GFlops')},
@@ -178,6 +180,7 @@ def setup_job(self):
 
 @rfm.simple_test
 class CudaNodeBurnStreamCE(NodeBurnStreamCE):
+    descr = 'GPU Node burn Stream test for A100/GH200 using CE'
     executable = 'burn-f64'
     ref_nb_gbps = {
         'a100': {'nb_gbps': (2 * 1000 * 0.95, -0.1, None, 'GB/s')},
@@ -207,6 +210,7 @@ def setup_job(self):
 
 @rfm.simple_test
 class CPUNodeBurnStreamCE(NodeBurnStreamCE):
+    descr = 'CPU Node burn Stream test for A100/GH200-nodes using CE'
     executable = 'burn-f64-cpu'
     ref_nb_gbps = {
         'gh200': {'nb_gbps': (450.0, -0.1, None, 'GB/s')},

diff --git a/checks/system/ce/ce_import_run_image.py b/checks/system/ce/ce_import_run_image.py
@@ -42,6 +42,7 @@ class enroot_import_image_ngc(enroot_import_image):
 
 @rfm.simple_test
 class RunJobCE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'CE check with Dockerhub import and simple image run (ubuntu)'
     valid_systems = ['+ce']
     valid_prog_environs = ['builtin']
     container_image = ''  # Defined after setup
@@ -62,6 +63,7 @@ def assert_found_found_ubuntu(self):
 
 @rfm.simple_test
 class RunNVGPUJobCE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
+    descr = 'CE check with NGC import and Stream job on GPU'
     valid_systems = ['+ce +nvgpu']
     valid_prog_environs = ['builtin']
     container_image = ''  # Defined after setup

diff --git a/checks/system/slurm/gres_gpu.py b/checks/system/slurm/gres_gpu.py
@@ -9,7 +9,7 @@
 
 @rfm.simple_test
 class SlurmGPUGresTest(rfm.RunOnlyRegressionTest):
-    '''Ensure that the Slurm GRES (Gereric REsource Scheduling) of the number
+    descr = '''Ensure that the Slurm GRES (Generic REsource Scheduling) of the number
        of gpus is correctly set on all the nodes of each partition.
 
        For the current partition, the test performs the following steps: