diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy index 7426d35..fe58a3e 100644 --- a/.jenkins/common.groovy +++ b/.jenkins/common.groovy @@ -14,7 +14,7 @@ def runCompileCommand(platform, project, jobName) ${auxiliary.exitIfNotSuccess()} cd ${project.paths.project_build_prefix} cmake \ - -DCMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc \ + -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -S . -B build make -C build -j\$(nproc) ${auxiliary.exitIfNotSuccess()} @@ -30,9 +30,9 @@ def runTestCommand (platform, project) def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} - python3 -m pip install --upgrade pytest - python3 -m pytest --version - python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml + python3 -m pip install --upgrade pytest + python3 -m pytest --version + python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml """ platform.runCommand(this, command) diff --git a/CMakeLists.txt b/CMakeLists.txt index 539a1ea..5577fb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,29 @@ # ######################################################################## # Copyright 2022 Advanced Micro Devices, Inc. 
# ######################################################################## +#Adding pthread flag for linking +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") +macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so mpi_bin_dir mpi_base_lib_dir mpi_inc_dir) + find_program(MPI_MPICXX ${mpi_compiler} PATHS ${mpi_bin_dir} NO_DEFAULT_PATH) + if (MPI_MPICXX) + message ("-- ${mpi_compiler} found @ ${MPI_MPICXX}") + find_file(MPI_H mpi.h PATHS ${mpi_inc_dir} NO_DEFAULT_PATH) + message ("-- mpi.h is in ${MPI_H}") + find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_lib_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu NO_DEFAULT_PATH) + message ("-- libmpi is ${MPI_LIB}") + if (NOT MPI_H OR NOT MPI_LIB) + set (MPI_MPICXX "MPI_MPICXX-NOTFOUND") + set (MPI_H "MPI_H-NOTFOUND") + set (MPI_LIB "MPI_LIB-NOTFOUND") + else() + add_definitions(-DMPI_SUPPORT) + include_directories(${mpi_inc_dir}) + link_libraries(${MPI_LIB}) + endif() + else() + message ("-- ${mpi_compiler} not found") + endif() +endmacro() cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR) @@ -30,8 +53,8 @@ include(ROCMCheckTargetIds) include(ROCMClients) # Build variables -option(USE_MPI "Build RCCL-tests with MPI support. Requires the MPI path to be set.") -set(MPI_PATH "" CACHE PATH "Path to MPI installation") +option(NO_MPI "Build RCCL-tests without MPI support.") +option(MPI_PATH "Use MPI in the specified directory.") ## Get default GPU targets using rocm_check_target_ids rocm_check_target_ids( DEFAULT_AMDGPU_TARGETS @@ -39,13 +62,43 @@ rocm_check_target_ids( ) set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for these tests to target.") -# Find the MPI package if we're using MPI -if (USE_MPI) - if(NOT MPI_PATH STREQUAL "") - set(MPI_HOME "${MPI_PATH}") +if (NOT NO_MPI) + # CHECK for MPI Path first. 
User requested this directory explicitly
+  if (MPI_PATH)
+    set(mpi_spec_bin_dir "${MPI_PATH}/bin")
+    set(mpi_spec_inc_dir "${MPI_PATH}/include")
+    check_mpi(mpicxx libmpi.a libmpi.so ${mpi_spec_bin_dir} ${MPI_PATH} ${mpi_spec_inc_dir})
+    if (NOT MPI_MPICXX)
+      # Since the user explicitly requested this directory, abort if something went wrong.
+      MESSAGE(FATAL_ERROR "Could not find MPI in ${MPI_PATH}")
+    endif()
+  endif()
+
+  # Check for MPICH Ubuntu installation
+  if (NOT MPI_MPICXX)
+    check_mpi(mpicxx.mpich libmpich.a libmpich.so /usr/bin /usr /usr/include/x86_64-linux-gnu/mpich)
+  endif()
+
+  # Check for Open MPI Ubuntu installation
+  if (NOT MPI_MPICXX)
+    check_mpi(mpicxx.openmpi libmpi.a libmpi.so /usr/bin /usr/lib/x86_64-linux-gnu/openmpi /usr/lib/x86_64-linux-gnu/openmpi/include)
+  endif()
+
+  # Check for MPICH RHEL installation
+  if (NOT MPI_MPICXX)
+    check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpich/bin /usr/lib64/mpich /usr/include/mpich-x86_64)
+  endif()
+
+  # Check for Open MPI RHEL installation
+  if (NOT MPI_MPICXX)
+    check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/openmpi/bin /usr/lib64/openmpi /usr/include/openmpi-x86_64)
   endif()
-  find_package(MPI REQUIRED MODULE)
-  add_definitions(-DOMPI_SKIP_MPICXX -DMPI_SUPPORT)
+
+  if (NOT MPI_MPICXX)
+    message ("-- no MPI library found")
+  endif()
+else()
+  message ("-- MPI support explicitely disabled")
 endif()
 
 set(ROCM_USE_DEV_COMPONENT OFF) # This repo doesn't have a dev component
@@ -55,7 +108,7 @@ add_subdirectory(src)
 
 # Create ROCm standard packages
 rocm_create_package(
-  NAME rccl-separate-tests
+  NAME rccl-tests
   DESCRIPTION "Tests for the ROCm Communication Collectives Library"
   MAINTAINER "RCCL Maintainer "
 )
diff --git a/Makefile b/Makefile
index 4025f10..8e0154a 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,9 @@
 # See LICENCE.txt for license information
 #
+BUILDDIR ?= build
+override BUILDDIR := $(abspath $(BUILDDIR))
+
 .PHONY : all clean
 
 default : src.build
@@ -14,7 +17,7 @@ all: ${TARGETS:%=%.build}
 clean: ${TARGETS:%=%.clean}
 
 %.build:
-	${MAKE} -C $* build
+	${MAKE} -C $* build BUILDDIR=${BUILDDIR}
 
 %.clean:
-	${MAKE} -C $* clean
+	${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
diff --git a/README.md b/README.md
index c284723..74f1551 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,23 @@ RCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If y
 $ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip RCCL_HOME=/path/to/rccl
 ```
 
+RCCL tests can also be built using cmake. A typical sequence will be:
+
+```shell
+$ mkdir build
+$ cd build
+$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl ..
+$ make
+```
+
+When using the cmake build procedure, please make sure that RCCL has also been built using cmake (i.e. not using the install.sh script), since cmake will check
+for cmake target and config files that are created during the RCCL build.
+
+Using the cmake method also has the advantage that the build is automatically checking for MPI installations, i.e. it is not necessary to explicitly request
+MPI builds. A user can request to use a particular MPI library by using the MPI_PATH variable. MPI support can be explicitly disabled by adding the -DNO_MPI=1
+flag to the cmake command line.
+
+
 ## Usage
 
 RCCL tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=HIP devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
diff --git a/scripts/rccl_tests_build_run_PTS.sh b/scripts/rccl_tests_build_run_PTS.sh
new file mode 100755
index 0000000..a919739
--- /dev/null
+++ b/scripts/rccl_tests_build_run_PTS.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+echo "This script is for building and running the rccl-tests as well as Unit tests"
+echo "Please ensure that the following environment variables are pointing to correct directions!"
+ +########## Set the appropriate directories ########## +export _HIP_HOME=/opt/rocm/hip +export _MPI_HOME=/path/to/mpi/build +export _RCCL_HOME=/opt/rocm/rccl/build + +export LD_LIBRARY_PATH=$_MPI_HOME/lib:$LD_LIBRARY_PATH +export PATH=$_MPI_HOME/bin/:$PATH +echo "HIP_HOME=$_HIP_HOME" +echo "MPI_HOME=$_MPI_HOME" +echo "RCCL_HOME=$_RCCL_HOME" + +echo "########## Print the system information ##########" +sudo dmidecode | grep "Product Name" +rocm-smi --showtopo + +########## Set the number of GPUs ########## +ngpus=8 +set -x +########## Build the RCCL-tests benchmark ########## +echo "Do you want to run tests on multiple nodes?" +read -p '(y/n) ' RESPONSE +if [ "$RESPONSE" = "y" ]; then + + ########## MPI Installation check ########## + MPI_Installed=$(which mpicc) + + if [ -z "$MPI_Installed" ]; then + echo "MPI is not installed! Install MPI and set the PATH environment variable to include PATH=/path/to/MPI-install/bin/:$PATH"; + exit + else + cd .. + rm -rf rccl-tests + git clone https://github.com/ROCmSoftwarePlatform/rccl-tests.git + cd rccl-tests + make MPI=1 MPI_HOME=$_MPI_HOME HIP_HOME=$_HIP_HOME NCCL_HOME=$_RCCL_HOME + fi +else + cd .. 
+ rm -rf rccl-tests + git clone https://github.com/ROCmSoftwarePlatform/rccl-tests.git + cd rccl-tests + make HIP_HOME=$_HIP_HOME NCCL_HOME=$_RCCL_HOME +fi + +########## Run the RCCL-tests benchmark ########## +cd build +echo "Allreduce Test" +./all_reduce_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Broadcast Test" +./broadcast_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Reduce Test" +./reduce_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Reduce_scatter Test" +./reduce_scatter_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Allgather Test" +./all_gather_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Send_Recv Test" +./sendrecv_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Scatter Test" +./scatter_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Gather Test" +./gather_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Alltoall Test" +./alltoall_perf -b 8 -e 1G -f 2 -g $ngpus +echo "Alltoallv Test" +./alltoallv_perf -b 8 -e 1G -f 2 -g $ngpus + + + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b5a40ae..6511a41 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,8 +3,8 @@ # ######################################################################## # Compile common object library -set_property(SOURCE common.cu PROPERTY LANGUAGE CXX) -add_library(rccl_common OBJECT common.cu) +set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX) +add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu) if(USE_MPI) target_link_libraries(rccl_common roc::rccl MPI::MPI_CXX) else() diff --git a/src/Makefile b/src/Makefile index 3dbd41f..e07f12d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,6 @@ # -# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. -# Modifications are Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. +# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
# # See LICENSE.txt for license information # @@ -12,7 +12,7 @@ VERBOSE ?= 0 DEBUG ?= 0 NCCL_HOME ?= "" -HIPCC = $(ROCM_PATH)/hip/bin/hipcc +HIPCC = $(ROCM_PATH)/bin/hipcc CXX = $(HIPCC) HIPCUFLAGS := -std=c++14 @@ -20,14 +20,13 @@ LDFLAGS := HIPLDFLAGS := ifneq ($(NCCL_HOME), "") -HIPCUFLAGS += -I$(NCCL_HOME) -I$(NCCL_HOME)/rccl/include -HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) +HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include +HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib endif HIPCUFLAGS += -I$(ROCM_PATH)/include -HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl -HIPCUFLAGS += -I$(ROCM_PATH)/hip/include/hip +HIPCUFLAGS += -I$(ROCM_PATH)/include/hip LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt +HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread ifeq ($(DEBUG), 0) HIPCUFLAGS += -O3 @@ -65,15 +64,23 @@ build: ${BIN_FILES} clean: rm -rf ${DST_DIR} -${DST_DIR}/%.o: %.cu common.h +TEST_VERIFIABLE_SRCDIR := ../verifiable +TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable +include ../verifiable/verifiable.mk + +${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS) @printf "Compiling %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<" $(HIPCC) -o $@ $(HIPCUFLAGS) -c $< -${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o +${DST_DIR}/timer.o: timer.cc timer.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(CXX) $(CXXFLAGS) -o $@ -c timer.cc + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) @printf "Linking %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} echo "$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}" $(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS} - diff --git a/src/all_gather.cu b/src/all_gather.cu index bc1c599..759f347 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -1,6 +1,6 @@ 
/************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,24 +8,15 @@ #include #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", ""); - PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s", size, count, typeName); -} +#define ALIGN 4 void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = count/nranks; - *recvcount = (count/nranks)*nranks; - *sendInplaceOffset = count/nranks; + size_t base = (count/(ALIGN*nranks))*ALIGN; + *sendcount = base; + *recvcount = base*nranks; + *sendInplaceOffset = base; *recvInplaceOffset = 0; - *paramcount = *sendcount; + *paramcount = base; } testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { @@ -35,18 +26,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc int k=0; for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if 
(args->enable_multiranks) - gpuid = gpuid % args->localNumDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int l=0; lnRanks; l++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); for (int j=0; jexpected[k])+args->sendBytes*j, sendcount, type, rep, j)); + TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); } k++; } @@ -98,7 +86,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t } for (int i=0; i #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s %6s", size, count, typeName, opName); -} - void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = count; *recvcount = count; @@ -35,16 +23,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc int k = 0; for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if (args->enable_multiranks) - gpuid = 
gpuid % args->localNumDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int l=0; lnRanks; l++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks)); k++; } diff --git a/src/alltoall.cu b/src/alltoall.cu index 48020e4..77546f4 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -8,18 +8,6 @@ #include #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s %6s", size, count, typeName, opName); -} - void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = (count/nranks)*nranks; *recvcount = (count/nranks)*nranks; @@ -35,19 +23,16 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl int k=0; for (int i=0; inGpus; i++) { - char* str = getenv("NCCL_TESTS_DEVICE"); - int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if (args->enable_multiranks) - gpuid = gpuid % args->localNumDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int l=0; lnRanks; l++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[k] : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); for (int j=0; jexpected[k])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j)); + size_t partcount = sendcount/nranks; + TESTCHECK(InitData(((char*)args->expected[k])+ j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); } k++; } @@ -101,7 +86,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t } for (int i=0; inGpus; i++) { - char* str = getenv("NCCL_TESTS_DEVICE"); - int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if (args->enable_multiranks) - gpuid = gpuid % args->localNumDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int l=0; lnRanks; l++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[k] : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0)); + #if 0 int *dataHost = (int *)malloc(args->sendBytes); hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost); @@ -66,24 +51,25 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc printf("\n"); free(dataHost); #endif + size_t rdisp = 0; size_t data_count = sendcount*2/nranks; size_t chunksize = data_count/nranks; for (int j=0; jexpected[k])+rdisp*wordSize(type), rcount, type, rep+sdisp, j)); - rdisp += rcount; + size_t sdisp = 0; + for (int kk=0; kkexpected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0)); + rdisp += rcount; } k++; } diff --git a/src/broadcast.cu b/src/broadcast.cu index dffb6b6..3797a84 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -8,18 +8,6 @@ #include #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s %6i", size, count, typeName, root); -} - void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = count; *recvcount = count; @@ -34,17 +22,14 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc int k=0; for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if (args->enable_multiranks) - gpuid = gpuid % args->localNumDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int l=0; lnRanks; l++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[k] : args->sendbuffs[k]; - if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); - TESTCHECK(InitData(args->expected[k], recvcount, type, rep, root)); + if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0)); + TESTCHECK(InitData(args->expected[k], recvcount, 0, type, ncclSum, rep, 1, 0)); k++; } HIPCHECK(hipDeviceSynchronize()); @@ -114,7 +99,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t for (int i=0; i #include +#include #include #include //#define DEBUG_PRINT +#include "../verifiable/verifiable.h" + int test_ncclVersion = 0; // init'd with ncclGetVersion() #if NCCL_MAJOR >= 2 @@ -54,6 +57,12 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() const char *test_memorytypes[nccl_NUM_MTYPES] = {"coarse", "fine", "host", "managed"}; +// For libnccl's < 2.13 +extern "C" __attribute__((weak)) char const* ncclGetLastError(ncclComm_t comm) { + return ""; +} + +int is_main_proc = 0; thread_local int is_main_thread = 0; // Command line parameter defaults @@ -75,12 +84,16 @@ static int blocking_coll = 0; static int memorytype = 0; static int stress_cycles = 1; static uint32_t cumask[4]; +static int streamnull = 0; +static int timeout = 0; static int cudaGraphLaunches = 0; +static int report_cputime = 0; // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) static int average = 1; static int numDevices = 1; static int ranksPerGpu = 1; static int enable_multiranks = 0; +static int delay_inout_place = 0; #define NUM_BLOCKS 32 @@ -152,374 +165,164 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch) return true; } -double DeltaMaxValue(ncclDataType_t type) { - switch(type) { - case ncclHalf: return 1e-2; -#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 - case ncclBfloat16: return 1e-2; -#endif - case ncclFloat: return 1e-5; - case ncclDouble: return 1e-12; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint8: - //case ncclInt32: - case ncclUint32: 
-#endif - case ncclInt64: - case ncclUint64: return 1e-200; - } - return 1e-200; -} - -template __device__ -double absDiff(T a, T b) { - return fabs((double)(b - a)); -} - -template<> __device__ -double absDiff(half a, half b) { - float x = __half2float(a); - float y = __half2float(b); - return fabs((double)(y-x)); -} - -template __device__ -float toFloat(T a) { - return (float)a; -} -template<> __device__ -float toFloat(half a) { - return __half2float(a); -} -#if defined(RCCL_BFLOAT16) -template<> __device__ -float toFloat(rccl_bfloat16 a) { - return (float)(a); -} -#endif - -template __global__ -void deltaKern(void* A_, void* B_, size_t count, double* max) { - const T* A = (const T*)A_; - const T* B = (const T*)B_; - __shared__ double temp[BSIZE]; - int tid = blockIdx.x*blockDim.x + threadIdx.x; - double locmax = 0.0; - for(size_t i=tid; i locmax ) { - locmax = delta; -#ifdef DEBUG_PRINT - if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); -#endif - } - } - - tid = threadIdx.x; - temp[tid] = locmax; - for(int stride = BSIZE/2; stride > 1; stride>>=1) { - __syncthreads(); - if( tid < stride ) - temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; - } - __syncthreads(); - if( threadIdx.x == 0) - max[blockIdx.x] = temp[0] > temp[1] ? 
temp[0] : temp[1]; -} - -testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { - switch (type) { -#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 - case ncclBfloat16: - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; -#endif - case ncclHalf: - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; - case ncclFloat: - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; - case ncclDouble: - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; - case ncclChar: -#if NCCL_MAJOR >= 2 - case ncclUint8: -#endif - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint32: -#endif - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; - case ncclInt64: - case ncclUint64: - hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; - } +testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) { + ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, hipStreamDefault); HIPCHECK(hipDeviceSynchronize()); - for (int i=1; i -__device__ T testValue(const size_t offset, const int rep, const int rank) { - uint8_t v = (rep+rank+offset) % 256; - return (T)v; -} - -// For floating point datatype, we use values between 0 and 1 otherwise the -// Product operation will produce NaNs. 
-template<> -__device__ double testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(double)testValue(offset, rep, rank)); -} -template<> -__device__ float testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(float)testValue(offset, rep, rank)); -} -template<> -__device__ half testValue(const size_t offset, const int rep, const int rank) { - return __float2half(testValue(offset, rep, rank)); -} -#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 -template<> -__device__ rccl_bfloat16 testValue(const size_t offset, const int rep, const int rank) { - return rccl_bfloat16(testValue(offset, rep, rank)); -} -#endif - -// Operations -template -__device__ T ncclOpSum(T a, T b) { return a+b; } -template -__device__ T ncclOpProd(T a, T b) { return a*b; } -template -__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } -template -__device__ T ncclOpMin(T a, T b) { return a -__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } -template<> -__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } -template<> -__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } -template<> -__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } - -template -__device__ T ncclPPOpIdent(T x, int arg) { return x; } -template -__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } -template -__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } -template<> -__device__ half ncclPPOpMul(half x, int arg) { - return __float2half(__half2float(x)*float(arg)); -} -template<> -__device__ half ncclPPOpDiv(half x, int n) { - return __float2half(__half2float(x)/n); -} -#if RCCL_BFLOAT16 == 1 -template<> -__device__ rccl_bfloat16 ncclPPOpMul(rccl_bfloat16 x, int arg) { - return (rccl_bfloat16)((float)(x)*float(arg)); -} -template<> -__device__ rccl_bfloat16 ncclPPOpDiv(rccl_bfloat16 x, int n) { - return (rccl_bfloat16)((float)(x)/(float)(n));; -} -#endif - -__host__ __device__ int preMulScalar(int rank) { - return 1 + rank%2; -} - -template -__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); - val = PreOp(val, preMulScalar(0)); - for (int i=1; i(o+offset, rep, i); - val1 = PreOp(val1, preMulScalar(i)); - val = Op(val, val1); - } - data[o] = PostOp(val, nranks); - } -} - -#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ - KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - 
KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) -#else - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) -#endif - -static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { - OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), -#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 - OPS(rccl_bfloat16) -#endif -}; - -testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; - HIPCHECK(hipLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, hipStreamDefault)); +testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) { + ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, hipStreamDefault); return testSuccess; } -template -__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); -} - -static void* const initDataKerns[ncclNumTypes] = { - (void*)InitDataKernel< int8_t>, - (void*)InitDataKernel< uint8_t>, - (void*)InitDataKernel< int32_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< int64_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< half>, - (void*)InitDataKernel< float>, - (void*)InitDataKernel< double>, -#if RCCL_BFLOAT16 == 1 && NCCL_MAJOR >= 2 - (void*)InitDataKernel -#endif -}; - -template -testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { - T* ptr = (T*)dest; 
- hipLaunchKernelGGL((InitDataKernel), dim3(16), dim3(512), 0, 0, ptr, N, rep, rank); +testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) { + ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, hipStreamDefault); return testSuccess; } -testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; - HIPCHECK(hipLaunchKernel(initDataKerns[type], grid, block, args, 0, hipStreamDefault)); - return testSuccess; +void Barrier(struct threadArgs *args) { + thread_local int epoch = 0; + static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}; + static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER}; + static int counter[2] = {0, 0}; + + pthread_mutex_lock(&lock[epoch]); + if(++counter[epoch] == args->nThreads) + pthread_cond_broadcast(&cond[epoch]); + + if(args->thread+1 == args->nThreads) { + while(counter[epoch] != args->nThreads) + pthread_cond_wait(&cond[epoch], &lock[epoch]); + #ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); + #endif + counter[epoch] = 0; + pthread_cond_broadcast(&cond[epoch]); + } + else { + while(counter[epoch] != 0) + pthread_cond_wait(&cond[epoch], &lock[epoch]); + } + pthread_mutex_unlock(&lock[epoch]); + epoch ^= 1; } -void Barrier(struct threadArgs* args) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - MPI_Barrier(MPI_COMM_WORLD); -#endif - args->barrier[args->barrier_idx] = 0; +// Inter-thread/process barrier+allreduce. The quality of the return value +// for average=0 (which means broadcast from rank=0) is dubious. 
The returned +// value will actually be the result of process-local broadcast from the local thread=0. +template +void Allreduce(struct threadArgs* args, T* value, int average) { + thread_local int epoch = 0; + static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}; + static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER}; + static T accumulator[2]; + static int counter[2] = {0, 0}; + + pthread_mutex_lock(&lock[epoch]); + if(counter[epoch] == 0) { + if(average != 0 || args->thread == 0) accumulator[epoch] = *value; } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); + switch(average) { + case /*r0*/ 0: if(args->thread == 0) accumulator[epoch] = *value; break; + case /*avg*/1: accumulator[epoch] += *value; break; + case /*min*/2: accumulator[epoch] = std::min(accumulator[epoch], *value); break; + case /*max*/3: accumulator[epoch] = std::max(accumulator[epoch], *value); break; + case /*sum*/4: accumulator[epoch] += *value; break; + } } - args->barrier_idx=!args->barrier_idx; -} -// Inter-thread/process barrier+allreduce -void Allreduce(struct threadArgs* args, double* value, int average) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - double val = *value; - if (args->thread > 0) { - double val2 = args->reduce[args->barrier_idx]; - if (average == 1) val += val2; - if (average == 2) val = std::min(val, val2); - if (average == 3) val = std::max(val, val2); - } - if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - if (average != 0) { - MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? 
MPI_MIN : MPI_MAX; - MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + if(++counter[epoch] == args->nThreads) + pthread_cond_broadcast(&cond[epoch]); + + if(args->thread+1 == args->nThreads) { + while(counter[epoch] != args->nThreads) + pthread_cond_wait(&cond[epoch], &lock[epoch]); + + #ifdef MPI_SUPPORT + if(average != 0) { + static_assert(std::is_same::value || std::is_same::value, "Allreduce only for T in {long long, double}"); + MPI_Datatype ty = std::is_same::value ? MPI_LONG_LONG : + std::is_same::value ? MPI_DOUBLE : + MPI_Datatype(); + MPI_Op op = average == 1 ? MPI_SUM : + average == 2 ? MPI_MIN : + average == 3 ? MPI_MAX : + average == 4 ? MPI_SUM : MPI_Op(); + MPI_Allreduce(MPI_IN_PLACE, (void*)&accumulator[epoch], 1, ty, op, MPI_COMM_WORLD); } -#endif - if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; - args->reduce[1-args->barrier_idx] = 0; - args->barrier[args->barrier_idx] = 0; - } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); + #endif + + if(average == 1) accumulator[epoch] /= args->totalProcs*args->nThreads; + counter[epoch] = 0; + pthread_cond_broadcast(&cond[epoch]); + } + else { + while(counter[epoch] != 0) + pthread_cond_wait(&cond[epoch], &lock[epoch]); } - *value = args->reduce[args->barrier_idx]; - args->barrier_idx=!args->barrier_idx; + pthread_mutex_unlock(&lock[epoch]); + + *value = accumulator[epoch]; + epoch ^= 1; } -testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int64_t *wrongElts) { + int nranks = args->nProcs*args->nGpus*args->nThreads; size_t count = args->expectedBytes/wordSize(type); - double maxDelta = 0.0; + + int64_t *wrongPerGpu = nullptr; + HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), 
hipHostMallocMapped)); + for (int i=0; inGpus*args->nRanks; i++) { int device; int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i); NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); HIPCHECK(hipSetDevice(device)); void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; - TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); - maxDelta = std::max(*(args->deltaHost), maxDelta); -#ifdef DEBUG_PRINT - //if (rank == 0) { - int *expectedHost = (int *)malloc(args->expectedBytes); - int *dataHost = (int *)malloc(args->expectedBytes); + TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i)); - hipMemcpy(expectedHost, args->expected[rank], args->expectedBytes, hipMemcpyDeviceToHost); +#if 1 && DEBUG_PRINT + if (args->reportErrors && wrongPerGpu[i] != 0) { + printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]); + char *expectedHost = (char*)malloc(args->expectedBytes); + char *dataHost = (char*)malloc(args->expectedBytes); + int eltsz = wordSize(type); + hipMemcpy(expectedHost, args->expected[i], args->expectedBytes, hipMemcpyDeviceToHost); hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost); - int j, k, l; - for (j=0; jexpectedBytes/sizeof(int); j++) - if (expectedHost[j] != dataHost[j]) break; - k = j; - for (; jexpectedBytes/sizeof(int); j++) - if (expectedHost[j] == dataHost[j]) break; - l = j; - printf("\n Rank [%d] Expected: ", rank); - for (j=k; jexpectedBytes/sizeof(int) && jexpectedBytes/sizeof(int) && jexpectedBytes/eltsz; j++) { + unsigned long long want, got; + want = 0; + memcpy(&want, expectedHost + j*eltsz, eltsz); + got = 0; + memcpy(&got, dataHost + j*eltsz, eltsz); + if(want != got) { + printf(" rank=%d elt[%d]: want=0x%llx got=0x%llx\n", rank, j, want, got); + } } - printf("\n"); free(expectedHost); free(dataHost); - //} + } #endif } - double nranks = 
args->nProcs*args->nThreads*args->nGpus*args->nRanks; - if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; - *delta = maxDelta; + + *wrongElts = 0; + for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i]; + hipFree(wrongPerGpu); + + if (args->reportErrors && *wrongElts) args->errors[0]++; return testSuccess; } - + testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_t* comms) { hipError_t hipErr; int remaining = nStreams; int* done = (int*)malloc(sizeof(int)*nStreams); memset(done, 0, sizeof(int)*nStreams); + timer tim; + while (remaining) { int idle = 1; for (int i=0; i timeout && timeout > 0) { + for (int i=0; i(rank); break; + case ncclUint8: u8 = ncclVerifiablePremulScalar(rank); break; + case ncclInt32: i32 = ncclVerifiablePremulScalar(rank); break; + case ncclUint32: u32 = ncclVerifiablePremulScalar(rank); break; + case ncclInt64: i64 = ncclVerifiablePremulScalar(rank); break; + case ncclUint64: u64 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat16: f16 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat32: f32 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat64: f64 = ncclVerifiablePremulScalar(rank); break; #if defined(RCCL_BFLOAT16) - case ncclBfloat16: bf16 = (rccl_bfloat16)(float(scalar)); break; + case ncclBfloat16: bf16 = ncclVerifiablePremulScalar(rank); break; #endif } NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); @@ -637,7 +452,7 @@ testResult_t completeColl(struct threadArgs* args) { return testSuccess; } -//EDGAR: Revisit because of cudaGraphLaunches +//RCCL: Revisit because of cudaGraphLaunches testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); if (datacheck) { @@ -645,9 +460,11 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t 
TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); } - // Sync - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - TESTCHECK(completeColl(args)); + if (warmup_iters) { + // Sync + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); + } Barrier(args); @@ -657,16 +474,17 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) { // Begin cuda graph capture for (int i=0; inGpus*args->nRanks; i++) { - // Thread local mode is needed for: - // - Multi-thread mode - // - P2P pre-connect + // Thread local mdoe is needed for: + // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads + // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture. + // Since pre-connect calls cudaMalloc, we cannot use global capture mode HIPCHECK(hipStreamBeginCapture(args->streams[i], hipStreamCaptureModeThreadLocal)); } } #endif // Performance Benchmark - auto start = std::chrono::high_resolution_clock::now(); + timer tim; for (int iter = 0; iter < iters; iter++) { if (agg_iters>1) NCCLCHECK(ncclGroupStart()); for (int aiter = 0; aiter < agg_iters; aiter++) { @@ -687,7 +505,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } // Resync CPU, restart timing, launch cuda graph Barrier(args); - start = std::chrono::high_resolution_clock::now(); + tim.reset(); for (int l=0; lnGpus*args->nRanks; i++) { HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i])); @@ -696,10 +514,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } #endif + double cputimeSec = tim.elapsed()/(iters*agg_iters); TESTCHECK(completeColl(args)); - auto delta = std::chrono::high_resolution_clock::now() - start; - double deltaSec = std::chrono::duration_cast>(delta).count(); + double deltaSec = tim.elapsed(); deltaSec = deltaSec/(iters*agg_iters); if 
(cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; Allreduce(args, &deltaSec, average); @@ -719,8 +537,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t Barrier(args); - double maxDelta = 0; - bool error = false; + int64_t wrongElts = 0; static __thread int rep = 0; rep++; if (datacheck) { @@ -768,13 +585,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } #endif - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + TESTCHECK(CheckData(args, type, op, root, in_place, &wrongElts)); //aggregate delta from all threads and procs - Allreduce(args, &maxDelta, 3); + long long wrongElts1 = wrongElts; + Allreduce(args, &wrongElts1, /*sum*/4); + wrongElts = wrongElts1; } - double timeUsec = deltaSec*1.0E6; + double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6; char timeStr[100]; if (timeUsec >= 10000.0) { sprintf(timeStr, "%7.0f", timeUsec); @@ -783,10 +602,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } else { sprintf(timeStr, "%7.2f", timeUsec); } - if (datacheck) { - PRINT(" %7s %6.2f %6.2f %5.0le%s", timeStr, algBw, busBw, maxDelta, error ? 
"*" : ""); + if (args->reportErrors) { + PRINT(" %7s %6.2f %6.2f %5g", timeStr, algBw, busBw, (double)wrongElts); } else { - PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); } args->bw[0] += busBw; @@ -809,6 +628,9 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { } testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + // Sync to avoid first-call timeout + Barrier(args); + // Warm-up for large size setupArgs(args->maxbytes, type, args); for (int iter = 0; iter < warmup_iters; iter++) { @@ -828,8 +650,11 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* // Benchmark for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); - print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + char rootName[100]; + sprintf(rootName, "%6i", root); + PRINT("%12li %12li %8s %6s %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName); TESTCHECK(BenchTime(args, type, op, root, 0)); + usleep(delay_inout_place); TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); } @@ -841,10 +666,7 @@ testResult_t threadRunTests(struct threadArgs* args) { // Set device to the first of our GPUs. If we don't do that, some operations // will be done on the current GPU (by default : 0) and if the GPUs are in // exclusive mode those operations will fail. 
- int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; - if (enable_multiranks) - gpuid = gpuid % numDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[0])); TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); return testSuccess; } @@ -855,14 +677,11 @@ testResult_t threadInit(struct threadArgs* args) { int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; //set main thread again - is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; + is_main_thread = (is_main_proc && args->thread == 0) ? 1 : 0; NCCLCHECK(ncclGroupStart()); for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if (enable_multiranks) - gpuid = gpuid % numDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int j=0; jnRanks; j++) { int rank = (args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + j; @@ -894,7 +713,7 @@ testResult_t threadLaunch(struct testThread* thread) { return testSuccess; } -testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) { if (memorytype == ncclFine) { HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained)); HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained)); @@ -968,10 +787,13 @@ int main(int argc, char* argv[]) { {"datatype", required_argument, 0, 'd'}, {"root", required_argument, 0, 'r'}, {"blocking", required_argument, 0, 'z'}, - {"memory_type", required_argument, 0, 'y'}, - {"stress_cycles", required_argument, 0, 's'}, - {"cumask", required_argument, 0, 'u'}, + {"memory_type", required_argument, 0, 
'y'}, //RCCL + {"stress_cycles", required_argument, 0, 's'}, //RCCL + {"cumask", required_argument, 0, 'u'}, //RCCL + {"stream_null", required_argument, 0, 'y'}, //NCCL + {"timeout", required_argument, 0, 'T'}, //NCCL {"cudagraph", required_argument, 0, 'G'}, + {"report_cputime", required_argument, 0, 'C'}, {"average", required_argument, 0, 'a'}, #ifdef RCCL_MULTIRANKPERGPU {"enable_multiranks", required_argument, 0, 'x'}, @@ -983,10 +805,11 @@ int main(int argc, char* argv[]) { while(1) { int c; -#ifdef RCCL_MULTIRANKPERGPU - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:R:x:", longopts, &longindex); + +#ifdef RCCL_MULTIRANKPERGPU + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:q:", longopts, &longindex); #else - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:q:", longopts, &longindex); #endif if (c == -1) @@ -1052,7 +875,7 @@ int main(int argc, char* argv[]) { case 'z': blocking_coll = strtol(optarg, NULL, 0); break; - case 'y': + case 'Y': memorytype = ncclstringtomtype(optarg); break; case 's': @@ -1067,6 +890,12 @@ int main(int argc, char* argv[]) { mask = strtok(NULL, ","); }; } + break; + case 'y': + streamnull = strtol(optarg, NULL, 0); + break; + case 'T': + timeout = strtol(optarg, NULL, 0); break; case 'G': #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && HIP_VERSION >= 50221310 @@ -1075,6 +904,9 @@ int main(int argc, char* argv[]) { printf("Option -G (HIP graph) not supported before NCCL 2.9 + ROCm 5.2 Ignoring\n"); #endif break; + case 'C': + report_cputime = strtol(optarg, NULL, 0); + break; case 'a': average = (int)strtol(optarg, NULL, 0); break; @@ -1086,6 +918,9 @@ int main(int argc, char* argv[]) { ranksPerGpu = (int)strtol(optarg, NULL, 0); break; #endif + case 'q': + delay_inout_place = (int)strtol(optarg, NULL, 10); + break; case 'h': default: 
if (c != 'h') printf("invalid option '%c'\n", c); @@ -1111,18 +946,22 @@ int main(int argc, char* argv[]) { "[-d,--datatype ] \n\t" "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" - "[-y,--memory_type ] \n\t" + "[-Y,--memory_type ] \n\t" "[-s,--stress_cycles ] \n\t" "[-u,--cumask ] \n\t" + "[-y,--stream_null <0/1>] \n\t" + "[-T,--timeout