diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy index 7426d35..fe58a3e 100644 --- a/.jenkins/common.groovy +++ b/.jenkins/common.groovy @@ -14,7 +14,7 @@ def runCompileCommand(platform, project, jobName) ${auxiliary.exitIfNotSuccess()} cd ${project.paths.project_build_prefix} cmake \ - -DCMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc \ + -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -S . -B build make -C build -j\$(nproc) ${auxiliary.exitIfNotSuccess()} @@ -30,9 +30,9 @@ def runTestCommand (platform, project) def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} - python3 -m pip install --upgrade pytest - python3 -m pytest --version - python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml + python3 -m pip install --upgrade pytest + python3 -m pytest --version + python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml """ platform.runCommand(this, command) diff --git a/CMakeLists.txt b/CMakeLists.txt index 539a1ea..5577fb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,29 @@ # ######################################################################## # Copyright 2022 Advanced Micro Devices, Inc. 
# ######################################################################## +#Adding pthread flag for linking +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") +macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so mpi_bin_dir mpi_base_lib_dir mpi_inc_dir) + find_program(MPI_MPICXX ${mpi_compiler} PATHS ${mpi_bin_dir} NO_DEFAULT_PATH) + if (MPI_MPICXX) + message ("-- ${mpi_compiler} found @ ${MPI_MPICXX}") + find_file(MPI_H mpi.h PATHS ${mpi_inc_dir} NO_DEFAULT_PATH) + message ("-- mpi.h is in ${MPI_H}") + find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_lib_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu NO_DEFAULT_PATH) + message ("-- libmpi is ${MPI_LIB}") + if (NOT MPI_H OR NOT MPI_LIB) + set (MPI_MPICXX "MPI_MPICXX-NOTFOUND") + set (MPI_H "MPI_H-NOTFOUND") + set (MPI_LIB "MPI_LIB-NOTFOUND") + else() + add_definitions(-DMPI_SUPPORT) + include_directories(${mpi_inc_dir}) + link_libraries(${MPI_LIB}) + endif() + else() + message ("-- ${mpi_compiler} not found") + endif() +endmacro() cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR) @@ -30,8 +53,8 @@ include(ROCMCheckTargetIds) include(ROCMClients) # Build variables -option(USE_MPI "Build RCCL-tests with MPI support. Requires the MPI path to be set.") -set(MPI_PATH "" CACHE PATH "Path to MPI installation") +option(NO_MPI "Build RCCL-tests without MPI support.") +option(MPI_PATH "Use MPI in the specified directory.") ## Get default GPU targets using rocm_check_target_ids rocm_check_target_ids( DEFAULT_AMDGPU_TARGETS @@ -39,13 +62,43 @@ rocm_check_target_ids( ) set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for these tests to target.") -# Find the MPI package if we're using MPI -if (USE_MPI) - if(NOT MPI_PATH STREQUAL "") - set(MPI_HOME "${MPI_PATH}") +if (NOT NO_MPI) + # CHECK for MPI Path first. 
User requested this directory explicitly
+  if (MPI_PATH)
+    set(mpi_spec_bin_dir "${MPI_PATH}/bin")
+    set(mpi_spec_inc_dir "${MPI_PATH}/include")
+    check_mpi(mpicxx libmpi.a libmpi.so ${mpi_spec_bin_dir} ${MPI_PATH} ${mpi_spec_inc_dir})
+    if (NOT MPI_MPICXX)
+      # Since the user explicitly requested this directory, abort if something went wrong.
+      MESSAGE(FATAL_ERROR "Could not find MPI in ${MPI_PATH}")
+    endif()
+  endif()
+
+  # Check for MPICH Ubuntu installation
+  if (NOT MPI_MPICXX)
+    check_mpi(mpicxx.mpich libmpich.a libmpich.so /usr/bin /usr /usr/include/x86_64-linux-gnu/mpich)
+  endif()
+
+  # Check for Open MPI Ubuntu installation
+  if (NOT MPI_MPICXX)
+    check_mpi(mpicxx.openmpi libmpi.a libmpi.so /usr/bin /usr/lib/x86_64-linux-gnu/openmpi /usr/lib/x86_64-linux-gnu/openmpi/include)
+  endif()
+
+  # Check for MPICH RHEL installation
+  if (NOT MPI_MPICXX)
+    check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpich/bin /usr/lib64/mpich /usr/include/mpich-x86_64)
+  endif()
+
+  # Check for Open MPI RHEL installation
+  if (NOT MPI_MPICXX)
+    check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/openmpi/bin /usr/lib64/openmpi /usr/include/openmpi-x86_64)
   endif()
-  find_package(MPI REQUIRED MODULE)
-  add_definitions(-DOMPI_SKIP_MPICXX -DMPI_SUPPORT)
+
+  if (NOT MPI_MPICXX)
+    message ("-- no MPI library found")
+  endif()
+else()
+  message ("-- MPI support explicitely disabled")
 endif()
 
 set(ROCM_USE_DEV_COMPONENT OFF) # This repo doesn't have a dev component
@@ -55,7 +108,7 @@ add_subdirectory(src)
 
 # Create ROCm standard packages
 rocm_create_package(
-  NAME rccl-separate-tests
+  NAME rccl-tests
   DESCRIPTION "Tests for the ROCm Communication Collectives Library"
   MAINTAINER "RCCL Maintainer "
 )
diff --git a/Makefile b/Makefile
index 4025f10..8e0154a 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,9 @@
 # See LICENCE.txt for license information
 #
+BUILDDIR ?= build
+override BUILDDIR := $(abspath $(BUILDDIR))
+
 .PHONY : all clean
 
 default : src.build
@@ -14,7 +17,7 @@ all: ${TARGETS:%=%.build}
 clean: ${TARGETS:%=%.clean}
 
 %.build:
-	${MAKE} -C $* build
+	${MAKE} -C $* build BUILDDIR=${BUILDDIR}
 
 %.clean:
-	${MAKE} -C $* clean
+	${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
diff --git a/README.md b/README.md
index c284723..74f1551 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,23 @@ RCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If y
 $ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip RCCL_HOME=/path/to/rccl
 ```
 
+RCCL tests can also be built using cmake. A typical sequence will be:
+
+```shell
+$ mkdir build
+$ cd build
+$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl ..
+$ make
+```
+
+When using the cmake build procedure, please make sure that RCCL has also been built using cmake (i.e. not using the install.sh script), since cmake will check
+for cmake target and config files that are created during the RCCL build.
+
+Using the cmake method also has the advantage that the build is automatically checking for MPI installations, i.e. it is not necessary to explicitly request
+MPI builds. A user can request to use a particular MPI library by using the MPI_PATH variable. MPI support can be explicitly disabled by adding the -DNO_MPI=1
+flag to the cmake command line.
+
+
 ## Usage
 
 RCCL tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=HIP devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
diff --git a/scripts/rccl_tests_build_run_PTS.sh b/scripts/rccl_tests_build_run_PTS.sh
new file mode 100755
index 0000000..a919739
--- /dev/null
+++ b/scripts/rccl_tests_build_run_PTS.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+echo "This script is for building and running the rccl-tests as well as Unit tests"
+echo "Please ensure that the following environment variables are pointing to correct directions!"
+ +########## Set the appropriate directories ########## +export _HIP_HOME=/opt/rocm/hip +export _MPI_HOME=/path/to/mpi/build +export _RCCL_HOME=/opt/rocm/rccl/build + +export LD_LIBRARY_PATH=$_MPI_HOME/lib:$LD_LIBRARY_PATH +export PATH=$_MPI_HOME/bin/:$PATH +echo "HIP_HOME=$_HIP_HOME" +echo "MPI_HOME=$_MPI_HOME" +echo "RCCL_HOME=$_RCCL_HOME" + +echo "########## Print the system information ##########" +sudo dmidecode | grep "Product Name" +rocm-smi --showtopo + +########## Set the number of GPUs ########## +ngpus=8 +set -x +########## Build the RCCL-tests benchmark ########## +echo "Do you want to run tests on multiple nodes?" +read -p '(y/n) ' RESPONSE +if [ "$RESPONSE" = "y" ]; then + + ########## MPI Installation check ########## + MPI_Installed=$(which mpicc) + + if [ -z "$MPI_Installed" ]; then + echo "MPI is not installed! Install MPI and set the PATH environment variable to include PATH=/path/to/MPI-install/bin/:$PATH"; + exit + else + cd .. + rm -rf rccl-tests + git clone https://github.com/ROCmSoftwarePlatform/rccl-tests.git + cd rccl-tests + make MPI=1 MPI_HOME=$_MPI_HOME HIP_HOME=$_HIP_HOME NCCL_HOME=$_RCCL_HOME + fi +else + cd .. 
+ rm -rf rccl-tests + git clone https://github.com/ROCmSoftwarePlatform/rccl-tests.git + cd rccl-tests + make HIP_HOME=$_HIP_HOME NCCL_HOME=$_RCCL_HOME +fi + +########## Run the RCCL-tests benchmark ########## +cd build +echo "Allreduce Test" +./all_reduce_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Broadcast Test" +./broadcast_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Reduce Test" +./reduce_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Reduce_scatter Test" +./reduce_scatter_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Allgather Test" +./all_gather_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Send_Recv Test" +./sendrecv_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Scatter Test" +./scatter_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Gather Test" +./gather_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Alltoall Test" +./alltoall_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Alltoallv Test" +./alltoallv_perf -b 8 -e 1G -f 2 -g $ngpus + + + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b5a40ae..6511a41 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,8 +3,8 @@ # ######################################################################## # Compile common object library -set_property(SOURCE common.cu PROPERTY LANGUAGE CXX) -add_library(rccl_common OBJECT common.cu) +set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX) +add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu) if(USE_MPI) target_link_libraries(rccl_common roc::rccl MPI::MPI_CXX) else() diff --git a/src/Makefile b/src/Makefile index 3dbd41f..e07f12d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,6 @@ # -# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. -# Modifications are Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. +# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
# # See LICENSE.txt for license information # @@ -12,7 +12,7 @@ VERBOSE ?= 0 DEBUG ?= 0 NCCL_HOME ?= "" -HIPCC = $(ROCM_PATH)/hip/bin/hipcc +HIPCC = $(ROCM_PATH)/bin/hipcc CXX = $(HIPCC) HIPCUFLAGS := -std=c++14 @@ -20,14 +20,13 @@ LDFLAGS := HIPLDFLAGS := ifneq ($(NCCL_HOME), "") -HIPCUFLAGS += -I$(NCCL_HOME) -I$(NCCL_HOME)/rccl/include -HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) +HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include +HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib endif HIPCUFLAGS += -I$(ROCM_PATH)/include -HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl -HIPCUFLAGS += -I$(ROCM_PATH)/hip/include/hip +HIPCUFLAGS += -I$(ROCM_PATH)/include/hip LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt +HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread ifeq ($(DEBUG), 0) HIPCUFLAGS += -O3 @@ -65,15 +64,23 @@ build: ${BIN_FILES} clean: rm -rf ${DST_DIR} -${DST_DIR}/%.o: %.cu common.h +TEST_VERIFIABLE_SRCDIR := ../verifiable +TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable +include ../verifiable/verifiable.mk + +${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS) @printf "Compiling %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<" $(HIPCC) -o $@ $(HIPCUFLAGS) -c $< -${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o +${DST_DIR}/timer.o: timer.cc timer.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(CXX) $(CXXFLAGS) -o $@ -c timer.cc + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) @printf "Linking %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} echo "$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}" $(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS} - diff --git a/src/all_gather.cu b/src/all_gather.cu index bc1c599..759f347 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -1,6 +1,6 @@ 
/************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,24 +8,15 @@ #include #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", ""); - PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s", size, count, typeName); -} +#define ALIGN 4 void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = count/nranks; - *recvcount = (count/nranks)*nranks; - *sendInplaceOffset = count/nranks; + size_t base = (count/(ALIGN*nranks))*ALIGN; + *sendcount = base; + *recvcount = base*nranks; + *sendInplaceOffset = base; *recvInplaceOffset = 0; - *paramcount = *sendcount; + *paramcount = base; } testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { @@ -35,18 +26,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc int k=0; for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if 
(args->enable_multiranks) - gpuid = gpuid % args->localNumDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int l=0; lnRanks; l++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); for (int j=0; jexpected[k])+args->sendBytes*j, sendcount, type, rep, j)); + TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); } k++; } @@ -98,7 +86,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t } for (int i=0; i #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s %6s", size, count, typeName, opName); -} - void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = count; *recvcount = count; @@ -35,16 +23,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc int k = 0; for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if (args->enable_multiranks) - gpuid = 
gpuid % args->localNumDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int l=0; lnRanks; l++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks)); k++; } diff --git a/src/alltoall.cu b/src/alltoall.cu index 48020e4..77546f4 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -8,18 +8,6 @@ #include #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s %6s", size, count, typeName, opName); -} - void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = (count/nranks)*nranks; *recvcount = (count/nranks)*nranks; @@ -35,19 +23,16 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl int k=0; for (int i=0; inGpus; i++) { - char* str = getenv("NCCL_TESTS_DEVICE"); - int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if (args->enable_multiranks) - gpuid = gpuid % args->localNumDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int l=0; lnRanks; l++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[k] : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); for (int j=0; jexpected[k])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j)); + size_t partcount = sendcount/nranks; + TESTCHECK(InitData(((char*)args->expected[k])+ j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); } k++; } @@ -101,7 +86,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t } for (int i=0; inGpus; i++) { - char* str = getenv("NCCL_TESTS_DEVICE"); - int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if (args->enable_multiranks) - gpuid = gpuid % args->localNumDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int l=0; lnRanks; l++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[k] : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0)); + #if 0 int *dataHost = (int *)malloc(args->sendBytes); hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost); @@ -66,24 +51,25 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc printf("\n"); free(dataHost); #endif + size_t rdisp = 0; size_t data_count = sendcount*2/nranks; size_t chunksize = data_count/nranks; for (int j=0; jexpected[k])+rdisp*wordSize(type), rcount, type, rep+sdisp, j)); - rdisp += rcount; + size_t sdisp = 0; + for (int kk=0; kkexpected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0)); + rdisp += rcount; } k++; } diff --git a/src/broadcast.cu b/src/broadcast.cu index dffb6b6..3797a84 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -8,18 +8,6 @@ #include #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s %6i", size, count, typeName, root); -} - void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = count; *recvcount = count; @@ -34,17 +22,14 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc int k=0; for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if (args->enable_multiranks) - gpuid = gpuid % args->localNumDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int l=0; lnRanks; l++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[k] : args->sendbuffs[k]; - if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); - TESTCHECK(InitData(args->expected[k], recvcount, type, rep, root)); + if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0)); + TESTCHECK(InitData(args->expected[k], recvcount, 0, type, ncclSum, rep, 1, 0)); k++; } HIPCHECK(hipDeviceSynchronize()); @@ -114,7 +99,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t for (int i=0; i #include +#include #include #include //#define DEBUG_PRINT +#include "../verifiable/verifiable.h" + int test_ncclVersion = 0; // init'd with ncclGetVersion() #if NCCL_MAJOR >= 2 @@ -54,6 +57,12 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() const char *test_memorytypes[nccl_NUM_MTYPES] = {"coarse", "fine", "host", "managed"}; +// For libnccl's < 2.13 +extern "C" __attribute__((weak)) char const* ncclGetLastError(ncclComm_t comm) { + return ""; +} + +int is_main_proc = 0; thread_local int is_main_thread = 0; // Command line parameter defaults @@ -75,12 +84,16 @@ static int blocking_coll = 0; static int memorytype = 0; static int stress_cycles = 1; static uint32_t cumask[4]; +static int streamnull = 0; +static int timeout = 0; static int cudaGraphLaunches = 0; +static int report_cputime = 0; // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) static int average = 1; static int numDevices = 1; static int ranksPerGpu = 1; static int enable_multiranks = 0; +static int delay_inout_place = 0; #define NUM_BLOCKS 32 @@ -152,374 +165,164 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch) return true; } -double DeltaMaxValue(ncclDataType_t type) { - switch(type) { - case ncclHalf: return 1e-2; -#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 - case ncclBfloat16: return 1e-2; -#endif - case ncclFloat: return 1e-5; - case ncclDouble: return 1e-12; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint8: - //case ncclInt32: - case ncclUint32: 
-#endif - case ncclInt64: - case ncclUint64: return 1e-200; - } - return 1e-200; -} - -template __device__ -double absDiff(T a, T b) { - return fabs((double)(b - a)); -} - -template<> __device__ -double absDiff(half a, half b) { - float x = __half2float(a); - float y = __half2float(b); - return fabs((double)(y-x)); -} - -template __device__ -float toFloat(T a) { - return (float)a; -} -template<> __device__ -float toFloat(half a) { - return __half2float(a); -} -#if defined(RCCL_BFLOAT16) -template<> __device__ -float toFloat(rccl_bfloat16 a) { - return (float)(a); -} -#endif - -template __global__ -void deltaKern(void* A_, void* B_, size_t count, double* max) { - const T* A = (const T*)A_; - const T* B = (const T*)B_; - __shared__ double temp[BSIZE]; - int tid = blockIdx.x*blockDim.x + threadIdx.x; - double locmax = 0.0; - for(size_t i=tid; i locmax ) { - locmax = delta; -#ifdef DEBUG_PRINT - if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); -#endif - } - } - - tid = threadIdx.x; - temp[tid] = locmax; - for(int stride = BSIZE/2; stride > 1; stride>>=1) { - __syncthreads(); - if( tid < stride ) - temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; - } - __syncthreads(); - if( threadIdx.x == 0) - max[blockIdx.x] = temp[0] > temp[1] ? 
temp[0] : temp[1]; -} - -testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { - switch (type) { -#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 - case ncclBfloat16: - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; -#endif - case ncclHalf: - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; - case ncclFloat: - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; - case ncclDouble: - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; - case ncclChar: -#if NCCL_MAJOR >= 2 - case ncclUint8: -#endif - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint32: -#endif - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; - case ncclInt64: - case ncclUint64: - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; - } +testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) { + ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, hipStreamDefault); HIPCHECK(hipDeviceSynchronize()); - for (int i=1; i -__device__ T testValue(const size_t offset, const int rep, const int rank) { - uint8_t v = (rep+rank+offset) % 256; - return (T)v; -} - -// For floating point datatype, we use values between 0 and 1 otherwise the -// Product operation will produce NaNs. 
-template<> -__device__ double testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(double)testValue(offset, rep, rank)); -} -template<> -__device__ float testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(float)testValue(offset, rep, rank)); -} -template<> -__device__ half testValue(const size_t offset, const int rep, const int rank) { - return __float2half(testValue(offset, rep, rank)); -} -#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 -template<> -__device__ rccl_bfloat16 testValue(const size_t offset, const int rep, const int rank) { - return rccl_bfloat16(testValue(offset, rep, rank)); -} -#endif - -// Operations -template -__device__ T ncclOpSum(T a, T b) { return a+b; } -template -__device__ T ncclOpProd(T a, T b) { return a*b; } -template -__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } -template -__device__ T ncclOpMin(T a, T b) { return a -__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } -template<> -__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } -template<> -__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } -template<> -__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } - -template -__device__ T ncclPPOpIdent(T x, int arg) { return x; } -template -__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } -template -__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } -template<> -__device__ half ncclPPOpMul(half x, int arg) { - return __float2half(__half2float(x)*float(arg)); -} -template<> -__device__ half ncclPPOpDiv(half x, int n) { - return __float2half(__half2float(x)/n); -} -#if RCCL_BFLOAT16 == 1 -template<> -__device__ rccl_bfloat16 ncclPPOpMul(rccl_bfloat16 x, int arg) { - return (rccl_bfloat16)((float)(x)*float(arg)); -} -template<> -__device__ rccl_bfloat16 ncclPPOpDiv(rccl_bfloat16 x, int n) { - return (rccl_bfloat16)((float)(x)/(float)(n));; -} -#endif - -__host__ __device__ int preMulScalar(int rank) { - return 1 + rank%2; -} - -template -__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); - val = PreOp(val, preMulScalar(0)); - for (int i=1; i(o+offset, rep, i); - val1 = PreOp(val1, preMulScalar(i)); - val = Op(val, val1); - } - data[o] = PostOp(val, nranks); - } -} - -#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ - KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - 
KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) -#else - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) -#endif - -static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { - OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), -#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 - OPS(rccl_bfloat16) -#endif -}; - -testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; - HIPCHECK(hipLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, hipStreamDefault)); +testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) { + ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, hipStreamDefault); return testSuccess; } -template -__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); -} - -static void* const initDataKerns[ncclNumTypes] = { - (void*)InitDataKernel< int8_t>, - (void*)InitDataKernel< uint8_t>, - (void*)InitDataKernel< int32_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< int64_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< half>, - (void*)InitDataKernel< float>, - (void*)InitDataKernel< double>, -#if RCCL_BFLOAT16 == 1 && NCCL_MAJOR >= 2 - (void*)InitDataKernel -#endif -}; - -template -testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { - T* ptr = (T*)dest; 
- hipLaunchKernelGGL((InitDataKernel), dim3(16), dim3(512), 0, 0, ptr, N, rep, rank); +testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) { + ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, hipStreamDefault); return testSuccess; } -testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; - HIPCHECK(hipLaunchKernel(initDataKerns[type], grid, block, args, 0, hipStreamDefault)); - return testSuccess; +void Barrier(struct threadArgs *args) { + thread_local int epoch = 0; + static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}; + static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER}; + static int counter[2] = {0, 0}; + + pthread_mutex_lock(&lock[epoch]); + if(++counter[epoch] == args->nThreads) + pthread_cond_broadcast(&cond[epoch]); + + if(args->thread+1 == args->nThreads) { + while(counter[epoch] != args->nThreads) + pthread_cond_wait(&cond[epoch], &lock[epoch]); + #ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); + #endif + counter[epoch] = 0; + pthread_cond_broadcast(&cond[epoch]); + } + else { + while(counter[epoch] != 0) + pthread_cond_wait(&cond[epoch], &lock[epoch]); + } + pthread_mutex_unlock(&lock[epoch]); + epoch ^= 1; } -void Barrier(struct threadArgs* args) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - MPI_Barrier(MPI_COMM_WORLD); -#endif - args->barrier[args->barrier_idx] = 0; +// Inter-thread/process barrier+allreduce. The quality of the return value +// for average=0 (which means broadcast from rank=0) is dubious. 
The returned +// value will actually be the result of process-local broadcast from the local thread=0. +template +void Allreduce(struct threadArgs* args, T* value, int average) { + thread_local int epoch = 0; + static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}; + static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER}; + static T accumulator[2]; + static int counter[2] = {0, 0}; + + pthread_mutex_lock(&lock[epoch]); + if(counter[epoch] == 0) { + if(average != 0 || args->thread == 0) accumulator[epoch] = *value; } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); + switch(average) { + case /*r0*/ 0: if(args->thread == 0) accumulator[epoch] = *value; break; + case /*avg*/1: accumulator[epoch] += *value; break; + case /*min*/2: accumulator[epoch] = std::min(accumulator[epoch], *value); break; + case /*max*/3: accumulator[epoch] = std::max(accumulator[epoch], *value); break; + case /*sum*/4: accumulator[epoch] += *value; break; + } } - args->barrier_idx=!args->barrier_idx; -} -// Inter-thread/process barrier+allreduce -void Allreduce(struct threadArgs* args, double* value, int average) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - double val = *value; - if (args->thread > 0) { - double val2 = args->reduce[args->barrier_idx]; - if (average == 1) val += val2; - if (average == 2) val = std::min(val, val2); - if (average == 3) val = std::max(val, val2); - } - if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - if (average != 0) { - MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? 
MPI_MIN : MPI_MAX; - MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + if(++counter[epoch] == args->nThreads) + pthread_cond_broadcast(&cond[epoch]); + + if(args->thread+1 == args->nThreads) { + while(counter[epoch] != args->nThreads) + pthread_cond_wait(&cond[epoch], &lock[epoch]); + + #ifdef MPI_SUPPORT + if(average != 0) { + static_assert(std::is_same::value || std::is_same::value, "Allreduce only for T in {long long, double}"); + MPI_Datatype ty = std::is_same::value ? MPI_LONG_LONG : + std::is_same::value ? MPI_DOUBLE : + MPI_Datatype(); + MPI_Op op = average == 1 ? MPI_SUM : + average == 2 ? MPI_MIN : + average == 3 ? MPI_MAX : + average == 4 ? MPI_SUM : MPI_Op(); + MPI_Allreduce(MPI_IN_PLACE, (void*)&accumulator[epoch], 1, ty, op, MPI_COMM_WORLD); } -#endif - if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; - args->reduce[1-args->barrier_idx] = 0; - args->barrier[args->barrier_idx] = 0; - } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); + #endif + + if(average == 1) accumulator[epoch] /= args->totalProcs*args->nThreads; + counter[epoch] = 0; + pthread_cond_broadcast(&cond[epoch]); + } + else { + while(counter[epoch] != 0) + pthread_cond_wait(&cond[epoch], &lock[epoch]); } - *value = args->reduce[args->barrier_idx]; - args->barrier_idx=!args->barrier_idx; + pthread_mutex_unlock(&lock[epoch]); + + *value = accumulator[epoch]; + epoch ^= 1; } -testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int64_t *wrongElts) { + int nranks = args->nProcs*args->nGpus*args->nThreads; size_t count = args->expectedBytes/wordSize(type); - double maxDelta = 0.0; + + int64_t *wrongPerGpu = nullptr; + HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), 
hipHostMallocMapped)); + for (int i=0; inGpus*args->nRanks; i++) { int device; int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i); NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); HIPCHECK(hipSetDevice(device)); void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; - TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); - maxDelta = std::max(*(args->deltaHost), maxDelta); -#ifdef DEBUG_PRINT - //if (rank == 0) { - int *expectedHost = (int *)malloc(args->expectedBytes); - int *dataHost = (int *)malloc(args->expectedBytes); + TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i)); - hipMemcpy(expectedHost, args->expected[rank], args->expectedBytes, hipMemcpyDeviceToHost); +#if 1 && DEBUG_PRINT + if (args->reportErrors && wrongPerGpu[i] != 0) { + printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]); + char *expectedHost = (char*)malloc(args->expectedBytes); + char *dataHost = (char*)malloc(args->expectedBytes); + int eltsz = wordSize(type); + hipMemcpy(expectedHost, args->expected[i], args->expectedBytes, hipMemcpyDeviceToHost); hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost); - int j, k, l; - for (j=0; jexpectedBytes/sizeof(int); j++) - if (expectedHost[j] != dataHost[j]) break; - k = j; - for (; jexpectedBytes/sizeof(int); j++) - if (expectedHost[j] == dataHost[j]) break; - l = j; - printf("\n Rank [%d] Expected: ", rank); - for (j=k; jexpectedBytes/sizeof(int) && jexpectedBytes/sizeof(int) && jexpectedBytes/eltsz; j++) { + unsigned long long want, got; + want = 0; + memcpy(&want, expectedHost + j*eltsz, eltsz); + got = 0; + memcpy(&got, dataHost + j*eltsz, eltsz); + if(want != got) { + printf(" rank=%d elt[%d]: want=0x%llx got=0x%llx\n", rank, j, want, got); + } } - printf("\n"); free(expectedHost); free(dataHost); - //} + } #endif } - double nranks = 
args->nProcs*args->nThreads*args->nGpus*args->nRanks; - if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; - *delta = maxDelta; + + *wrongElts = 0; + for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i]; + hipFree(wrongPerGpu); + + if (args->reportErrors && *wrongElts) args->errors[0]++; return testSuccess; } - + testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_t* comms) { hipError_t hipErr; int remaining = nStreams; int* done = (int*)malloc(sizeof(int)*nStreams); memset(done, 0, sizeof(int)*nStreams); + timer tim; + while (remaining) { int idle = 1; for (int i=0; i timeout && timeout > 0) { + for (int i=0; i(rank); break; + case ncclUint8: u8 = ncclVerifiablePremulScalar(rank); break; + case ncclInt32: i32 = ncclVerifiablePremulScalar(rank); break; + case ncclUint32: u32 = ncclVerifiablePremulScalar(rank); break; + case ncclInt64: i64 = ncclVerifiablePremulScalar(rank); break; + case ncclUint64: u64 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat16: f16 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat32: f32 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat64: f64 = ncclVerifiablePremulScalar(rank); break; #if defined(RCCL_BFLOAT16) - case ncclBfloat16: bf16 = (rccl_bfloat16)(float(scalar)); break; + case ncclBfloat16: bf16 = ncclVerifiablePremulScalar(rank); break; #endif } NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); @@ -637,7 +452,7 @@ testResult_t completeColl(struct threadArgs* args) { return testSuccess; } -//EDGAR: Revisit because of cudaGraphLaunches +//RCCL: Revisit because of cudaGraphLaunches testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); if (datacheck) { @@ -645,9 +460,11 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t 
TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); } - // Sync - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - TESTCHECK(completeColl(args)); + if (warmup_iters) { + // Sync + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); + } Barrier(args); @@ -657,16 +474,17 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) { // Begin cuda graph capture for (int i=0; inGpus*args->nRanks; i++) { - // Thread local mode is needed for: - // - Multi-thread mode - // - P2P pre-connect + // Thread local mdoe is needed for: + // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads + // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture. + // Since pre-connect calls cudaMalloc, we cannot use global capture mode HIPCHECK(hipStreamBeginCapture(args->streams[i], hipStreamCaptureModeThreadLocal)); } } #endif // Performance Benchmark - auto start = std::chrono::high_resolution_clock::now(); + timer tim; for (int iter = 0; iter < iters; iter++) { if (agg_iters>1) NCCLCHECK(ncclGroupStart()); for (int aiter = 0; aiter < agg_iters; aiter++) { @@ -687,7 +505,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } // Resync CPU, restart timing, launch cuda graph Barrier(args); - start = std::chrono::high_resolution_clock::now(); + tim.reset(); for (int l=0; lnGpus*args->nRanks; i++) { HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i])); @@ -696,10 +514,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } #endif + double cputimeSec = tim.elapsed()/(iters*agg_iters); TESTCHECK(completeColl(args)); - auto delta = std::chrono::high_resolution_clock::now() - start; - double deltaSec = std::chrono::duration_cast>(delta).count(); + double deltaSec = tim.elapsed(); deltaSec = deltaSec/(iters*agg_iters); if 
(cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; Allreduce(args, &deltaSec, average); @@ -719,8 +537,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t Barrier(args); - double maxDelta = 0; - bool error = false; + int64_t wrongElts = 0; static __thread int rep = 0; rep++; if (datacheck) { @@ -768,13 +585,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } #endif - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + TESTCHECK(CheckData(args, type, op, root, in_place, &wrongElts)); //aggregate delta from all threads and procs - Allreduce(args, &maxDelta, 3); + long long wrongElts1 = wrongElts; + Allreduce(args, &wrongElts1, /*sum*/4); + wrongElts = wrongElts1; } - double timeUsec = deltaSec*1.0E6; + double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6; char timeStr[100]; if (timeUsec >= 10000.0) { sprintf(timeStr, "%7.0f", timeUsec); @@ -783,10 +602,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } else { sprintf(timeStr, "%7.2f", timeUsec); } - if (datacheck) { - PRINT(" %7s %6.2f %6.2f %5.0le%s", timeStr, algBw, busBw, maxDelta, error ? 
"*" : ""); + if (args->reportErrors) { + PRINT(" %7s %6.2f %6.2f %5g", timeStr, algBw, busBw, (double)wrongElts); } else { - PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); } args->bw[0] += busBw; @@ -809,6 +628,9 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { } testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + // Sync to avoid first-call timeout + Barrier(args); + // Warm-up for large size setupArgs(args->maxbytes, type, args); for (int iter = 0; iter < warmup_iters; iter++) { @@ -828,8 +650,11 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* // Benchmark for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); - print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + char rootName[100]; + sprintf(rootName, "%6i", root); + PRINT("%12li %12li %8s %6s %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName); TESTCHECK(BenchTime(args, type, op, root, 0)); + usleep(delay_inout_place); TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); } @@ -841,10 +666,7 @@ testResult_t threadRunTests(struct threadArgs* args) { // Set device to the first of our GPUs. If we don't do that, some operations // will be done on the current GPU (by default : 0) and if the GPUs are in // exclusive mode those operations will fail. 
- int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; - if (enable_multiranks) - gpuid = gpuid % numDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[0])); TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); return testSuccess; } @@ -855,14 +677,11 @@ testResult_t threadInit(struct threadArgs* args) { int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; //set main thread again - is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; + is_main_thread = (is_main_proc && args->thread == 0) ? 1 : 0; NCCLCHECK(ncclGroupStart()); for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if (enable_multiranks) - gpuid = gpuid % numDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int j=0; jnRanks; j++) { int rank = (args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + j; @@ -894,7 +713,7 @@ testResult_t threadLaunch(struct testThread* thread) { return testSuccess; } -testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) { if (memorytype == ncclFine) { HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained)); HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained)); @@ -968,10 +787,13 @@ int main(int argc, char* argv[]) { {"datatype", required_argument, 0, 'd'}, {"root", required_argument, 0, 'r'}, {"blocking", required_argument, 0, 'z'}, - {"memory_type", required_argument, 0, 'y'}, - {"stress_cycles", required_argument, 0, 's'}, - {"cumask", required_argument, 0, 'u'}, + {"memory_type", required_argument, 0, 
'y'}, //RCCL + {"stress_cycles", required_argument, 0, 's'}, //RCCL + {"cumask", required_argument, 0, 'u'}, //RCCL + {"stream_null", required_argument, 0, 'y'}, //NCCL + {"timeout", required_argument, 0, 'T'}, //NCCL {"cudagraph", required_argument, 0, 'G'}, + {"report_cputime", required_argument, 0, 'C'}, {"average", required_argument, 0, 'a'}, #ifdef RCCL_MULTIRANKPERGPU {"enable_multiranks", required_argument, 0, 'x'}, @@ -983,10 +805,11 @@ int main(int argc, char* argv[]) { while(1) { int c; -#ifdef RCCL_MULTIRANKPERGPU - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:R:x:", longopts, &longindex); + +#ifdef RCCL_MULTIRANKPERGPU + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:q:", longopts, &longindex); #else - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:q:", longopts, &longindex); #endif if (c == -1) @@ -1052,7 +875,7 @@ int main(int argc, char* argv[]) { case 'z': blocking_coll = strtol(optarg, NULL, 0); break; - case 'y': + case 'Y': memorytype = ncclstringtomtype(optarg); break; case 's': @@ -1067,6 +890,12 @@ int main(int argc, char* argv[]) { mask = strtok(NULL, ","); }; } + break; + case 'y': + streamnull = strtol(optarg, NULL, 0); + break; + case 'T': + timeout = strtol(optarg, NULL, 0); break; case 'G': #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && HIP_VERSION >= 50221310 @@ -1075,6 +904,9 @@ int main(int argc, char* argv[]) { printf("Option -G (HIP graph) not supported before NCCL 2.9 + ROCm 5.2 Ignoring\n"); #endif break; + case 'C': + report_cputime = strtol(optarg, NULL, 0); + break; case 'a': average = (int)strtol(optarg, NULL, 0); break; @@ -1086,6 +918,9 @@ int main(int argc, char* argv[]) { ranksPerGpu = (int)strtol(optarg, NULL, 0); break; #endif + case 'q': + delay_inout_place = (int)strtol(optarg, NULL, 10); + break; case 'h': default: 
if (c != 'h') printf("invalid option '%c'\n", c); @@ -1111,18 +946,22 @@ int main(int argc, char* argv[]) { "[-d,--datatype ] \n\t" "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" - "[-y,--memory_type ] \n\t" + "[-Y,--memory_type ] \n\t" "[-s,--stress_cycles ] \n\t" "[-u,--cumask ] \n\t" + "[-y,--stream_null <0/1>] \n\t" + "[-T,--timeout