aws · hanwen-cluster · Sep 22, 2025 · Sep 19, 2025 · Sep 19, 2025 · Sep 19, 2025
@@ -5,8 +5,8 @@ set -xe
 rm -rf -- "/shared/${1:?}"  # ${1:?} aborts when no module name is passed, instead of expanding to "/shared/" and wiping it
 
 module load ${1}
-NCCL_BENCHMARKS_VERSION='2.16.7'
-NCCL_VERSION='2.27.7-1'
+NCCL_BENCHMARKS_VERSION='2.17.1'
+NCCL_VERSION='2.28.3-1'
 MPI_HOME=$(which mpirun | awk -F '/bin' '{print $1}')
 NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90" # Arch for NVIDIA A100 and H100, ref https://docs.nvidia.com/cuda/ada-compatibility-guide/index.html
 

@@ -3,8 +3,8 @@
 #SBATCH --exclusive
 
 module load openmpi
-NCCL_VERSION='2.27.7-1'
-NCCL_BENCHMARKS_VERSION='2.16.7'
+NCCL_VERSION='2.28.3-1'
+NCCL_BENCHMARKS_VERSION='2.17.1'
 
 . /etc/os-release
 if [[ "$ID" == "rhel" || "$ID" == "rocky" ]]; then  # "==" needs surrounding spaces; "$ID==rhel" is a single non-empty word and therefore always true

@@ -69,8 +69,8 @@ def install_and_run_nccl_benchmarks(remote_command_executor, mpi_module, schedul
         "p4d.24xlarge": 26.0,
         # p5.48xlarge - Expected "in-place busbw" bandwidth with 2 nodes, 8 tasks per node is about 250GB/s
         "p5.48xlarge": 250.0,
-        "p6-b200.48xlarge": 300,
-        "p6e-gb200.36xlarge": 500,
+        "p6-b200.48xlarge": 570,  # Initial testing performance 631.17
+        "p6e-gb200.36xlarge": 650,  # Initial testing performance 719.17
     }
 
     expected_bandwidth = instance_bandwidth_dict.get(instance)