Skip to content
Merged
12 changes: 5 additions & 7 deletions checks/apps/pytorch/pytorch_allreduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

@rfm.simple_test
class PyTorchNCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
descr = 'All-reduce PyTorch benchmark with CE (NCCL version)'
valid_systems = ['+nvgpu']
valid_prog_environs = ['builtin']
num_nodes = variable(int, value=8)
Expand All @@ -31,7 +32,6 @@ class PyTorchNCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
# NOTE: only the "-py3" image is supported by the test
supported_flavors = ["-py3"]


pytorch_tags = nvidia_image_tags('pytorch')
latest_tags = []

Expand Down Expand Up @@ -68,7 +68,7 @@ def set_image(self):
'aws_ofi_nccl.enabled': 'true',
'aws_ofi_nccl.variant': 'cuda12',
},
}
}

@run_after('setup')
def setup_test(self):
Expand All @@ -82,7 +82,7 @@ def setup_test(self):

@run_after('setup')
def set_executable_opts(self):
self.prerun_cmds = ['wget https://jfrog.svc.cscs.ch/artifactory/cscs-reframe-tests/PyTorch/all_reduce_bench.py'] # noqa: E501
self.prerun_cmds = ['wget https://jfrog.svc.cscs.ch/artifactory/cscs-reframe-tests/PyTorch/all_reduce_bench.py'] # noqa: E501
headnode_cmd = (
'$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)'
)
Expand All @@ -103,20 +103,19 @@ def assert_sanity(self):
@performance_function('GB/s')
def bandwidth(self):
return sn.extractsingle(r'\|\s*16GiB\s*\|\s*(?P<busbw>\S+)GBps\s*\|',
self.stdout, tag='busbw', conv=float
)
self.stdout, tag='busbw', conv=float)


@rfm.simple_test
class PyTorchRCCLAllReduce(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
descr = 'All-reduce PyTorch benchmark with CE (RCCL version)'
valid_systems = ['+amdgpu']
valid_prog_environs = ['builtin']
num_nodes = variable(int, value=8)
sourcesdir = None
curated_images = [
'rocm/pytorch:rocm6.3.3_ubuntu24.04_py3.12_pytorch_release_2.4.0'
]

image = parameter(curated_images) #+ latest_images)
executable = 'torchrun'
num_tasks_per_node = 1
Expand Down Expand Up @@ -165,7 +164,6 @@ def set_executable_opts(self):
# Hook registered to run after the 'setup' stage: sets NCCL_MIN_NCHANNELS
# to 32 in the test environment when the partition's GPU device entry reports
# fewer than 8 devices and its architecture is 'gfx942'.
@run_after('setup')
def set_nccl_min_nchannels(self):
# Only the first entry returned for the 'gpu' device type is inspected.
gpu_devices = self.current_partition.select_devices('gpu')[0]

# https://rocm.docs.amd.com/projects/rccl/en/latest/how-to/rccl-usage-tips.html#improving-performance-on-the-mi300x-accelerator-when-using-fewer-than-8-gpus noqa: E501
if gpu_devices.num_devices < 8 and gpu_devices.arch == 'gfx942':
self.env_vars['NCCL_MIN_NCHANNELS'] = 32
Expand Down
3 changes: 3 additions & 0 deletions checks/apps/pytorch/pytorch_megatronlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,9 @@ class PyTorchMegatronLM(rfm.RunOnlyRegressionTest):

@run_after('setup')
def setup_test(self):
    # Assign through `self`: a bare `descr = ...` inside this hook would
    # only bind an unused local name and the test description would never
    # be set.
    self.descr = (
        'Megatron tests with synthetic data, with options for large scale '
        'and real data tests')
    # Per-model settings are looked up by the selected model name.
    model_config = self.configurations[self.model]
    # Fall back to the model's own node count when no default is given.
    if self.default_num_nodes is None:
        self.num_nodes = model_config['num_nodes']
Expand Down
1 change: 1 addition & 0 deletions checks/containers/container_engine/check_cuda_nbody.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

@rfm.simple_test
class CudaNBodyCheckCE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
descr = 'Single-node N-Body test for GPUs using CE (from CUDA samples)'
valid_systems = ['+ce +nvgpu']
valid_prog_environs = ['builtin']
sourcesdir = None
Expand Down
2 changes: 1 addition & 1 deletion checks/containers/container_engine/cuda_mps.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

@rfm.simple_test
class CUDA_MPS_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
descr = 'Check for CUDA MPS with CE'
valid_prog_environs = ['builtin']
valid_systems = ['+ce +nvgpu']
test_name = 'cuda_mps'
Expand All @@ -36,4 +37,3 @@ class CUDA_MPS_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
@sanity_function
def assert_sanity(self):
    """Sanity check: the job's stdout must contain a match for the
    ``<digits> nvidia-cuda-mps-control -d`` pattern.

    NOTE(review): presumably this is a process-listing line showing the
    MPS control daemon with its PID -- confirm against the test command.
    """
    mps_daemon_patt = r'^\d+ nvidia-cuda-mps-control -d$'
    return sn.assert_found(mps_daemon_patt, self.stdout)

2 changes: 2 additions & 0 deletions checks/containers/container_engine/omb.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def set_perf(self):

@rfm.simple_test
class OMB_MPICH_CE(OMB_Base_CE):
descr = 'OSU Micro-benchmarks for MPICH/CE (Point-to-Point and All-to-All)'
container_image = (
'jfrog.svc.cscs.ch#reframe-oci/osu-mb:7.5-mpich4.3.0-ofi1.15-cuda12.8'
)
Expand All @@ -99,6 +100,7 @@ def set_pmi2(self):

@rfm.simple_test
class OMB_OMPI_CE(OMB_Base_CE):
descr = 'OSU Micro-benchmarks for OpenMPI/CE (Point-to-Point and All-to-All)'
container_image = (f'jfrog.svc.cscs.ch#reframe-oci/osu-mb:7.5-ompi5.0.7-ofi1.15-cuda12.8')
valid_systems = ['+ce +nvgpu']
reference_per_test = {
Expand Down
2 changes: 1 addition & 1 deletion checks/containers/container_engine/ssh.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

@rfm.simple_test
class SSH_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
descr = 'Checks if SSH is available with CE'
valid_prog_environs = ['builtin']
valid_systems = ['+ce']
test_name = 'ssh'
Expand All @@ -36,4 +37,3 @@ class SSH_CE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
@sanity_function
def assert_sanity(self):
    """Sanity check: the job's stdout must contain a match for a
    ``<digits> /opt/oci-hooks/ssh/dropbear/bin/dropbear ... -p 15263``
    pattern.

    NOTE(review): presumably this is a process-listing line for the
    dropbear SSH server started by the SSH hook -- confirm against the
    test command.
    """
    dropbear_patt = (
        r'^\d+ /opt/oci-hooks/ssh/dropbear/bin/dropbear.*-p 15263.*$'
    )
    return sn.assert_found(dropbear_patt, self.stdout)

2 changes: 2 additions & 0 deletions checks/containers/container_engine/xccl_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def set_perf(self):

@rfm.simple_test
class NCCLTestsCE(XCCLTestBase):
descr = 'Point-to-Point and All-Reduce NCCL tests with CE'
valid_systems = ['+ce +nvgpu']
image_tag = parameter(['cuda12.9.1'])

Expand Down Expand Up @@ -126,6 +127,7 @@ def setup_ce(self):

@rfm.simple_test
class RCCLTestCE(XCCLTestBase):
descr = 'Point-to-Point and All-Reduce RCCL tests with CE'
valid_systems = ['+ce +amdgpu']
image_tag = parameter(['rocm6.3.4'])
min_bytes = '4096M'
Expand Down
4 changes: 4 additions & 0 deletions checks/microbenchmarks/cpu_gpu/node_burn/node-burn-ce.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def nb_gbps(self):

@rfm.simple_test
class CudaNodeBurnGemmCE(NodeBurnGemmCE):
descr = 'GPU Node burn GEMM test for A100/GH200 using CE'
executable = 'burn-f64'
ref_nb_gflops = {
'a100': {'nb_gflops': (9746*2*0.85, -0.1, None, 'GFlops')},
Expand All @@ -134,6 +135,7 @@ def setup_job(self):

@rfm.simple_test
class CPUNodeBurnGemmCE(NodeBurnGemmCE):
descr = 'CPU Node burn GEMM test for A100/GH200-nodes using CE'
executable = 'burn-f64-cpu'
ref_nb_gflops = {
'gh200': {'nb_gflops': (3150, -0.1, None, 'GFlops')},
Expand Down Expand Up @@ -178,6 +180,7 @@ def setup_job(self):

@rfm.simple_test
class CudaNodeBurnStreamCE(NodeBurnStreamCE):
descr = 'GPU Node burn Stream test for A100/GH200 using CE'
executable = 'burn-f64'
ref_nb_gbps = {
'a100': {'nb_gbps': (2 * 1000 * 0.95, -0.1, None, 'GB/s')},
Expand Down Expand Up @@ -207,6 +210,7 @@ def setup_job(self):

@rfm.simple_test
class CPUNodeBurnStreamCE(NodeBurnStreamCE):
descr = 'CPU Node burn Stream test for A100/GH200-nodes using CE'
executable = 'burn-f64-cpu'
ref_nb_gbps = {
'gh200': {'nb_gbps': (450.0, -0.1, None, 'GB/s')},
Expand Down
2 changes: 2 additions & 0 deletions checks/system/ce/ce_import_run_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class enroot_import_image_ngc(enroot_import_image):

@rfm.simple_test
class RunJobCE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
descr = 'CE check with Dockerhub import and simple image run (ubuntu)'
valid_systems = ['+ce']
valid_prog_environs = ['builtin']
container_image = '' # Defined after setup
Expand All @@ -62,6 +63,7 @@ def assert_found_found_ubuntu(self):

@rfm.simple_test
class RunNVGPUJobCE(rfm.RunOnlyRegressionTest, ContainerEngineMixin):
descr = 'CE check with NGC import and Stream job on GPU'
valid_systems = ['+ce +nvgpu']
valid_prog_environs = ['builtin']
container_image = '' # Defined after setup
Expand Down
2 changes: 1 addition & 1 deletion checks/system/slurm/gres_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

@rfm.simple_test
class SlurmGPUGresTest(rfm.RunOnlyRegressionTest):
'''Ensure that the Slurm GRES (Gereric REsource Scheduling) of the number
descr = '''Ensure that the Slurm GRES (Generic REsource Scheduling) of the number
of gpus is correctly set on all the nodes of each partition.
For the current partition, the test performs the following steps:
Expand Down
Loading