diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index 4d96581ad..ff86a3e32 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -13,9 +13,9 @@ jobs: strategy: matrix: cuda11: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 pool: name: mscclpp @@ -30,10 +30,8 @@ jobs: inputs: targetType: 'inline' script: | - curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp mkdir build && cd build - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release .. + cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. make -j workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -112,3 +110,15 @@ jobs: set -e python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl workingDirectory: '$(System.DefaultWorkingDirectory)' + + - task: Bash@3 + name: PythonAllReduceBenchmark + displayName: Python Allreduce Benchmark + inputs: + targetType: 'inline' + script: | + set -e + export PATH=/usr/local/mpi/bin:$PATH + python3 -m pip install . + mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py + workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index bb158646e..7c9d35094 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -10,9 +10,9 @@ jobs: strategy: matrix: cuda11: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 pool: name: mscclpp-it container: @@ -25,12 +25,9 @@ jobs: inputs: targetType: 'inline' script: | - curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp mkdir build && cd build - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON .. + cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. make -j - make pylib-copy workingDirectory: '$(System.DefaultWorkingDirectory)' - task: DownloadSecureFile@1 @@ -83,7 +80,7 @@ jobs: tail -f output/mscclit-000000 & CHILD_PID=$! parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mscclpp-test' + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' kill $CHILD_PID - task: Bash@3 @@ -102,7 +99,7 @@ jobs: tail -f output/mscclit-000000 & CHILD_PID=$! 
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mp-ut' + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' kill $CHILD_PID - task: Bash@3 @@ -121,7 +118,26 @@ jobs: tail -f output/mscclit-000000 & CHILD_PID=$! parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh pytests' + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' + kill $CHILD_PID + + - task: Bash@3 + name: RunMultiNodePythonBenchmark + displayName: Run multi-nodes python benchmark + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-000000 + tail -f output/mscclit-000000 & + CHILD_PID=$! + parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' kill $CHILD_PID - task: AzureCLI@2 diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index 31b8091cd..78b679e8d 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -15,9 +15,9 @@ jobs: strategy: matrix: cuda11: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 container: image: $[ variables['containerImage'] ] @@ -30,10 +30,8 @@ jobs: inputs: targetType: 'inline' script: | - curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp mkdir build && cd build - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release .. + cmake -DCMAKE_BUILD_TYPE=Release .. 
make -j workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -79,11 +77,5 @@ jobs: script: | set -e export PATH=/usr/local/mpi/bin:$PATH - cd build && make pylib-copy - if [[ '$(containerImage)' == *'cuda11'* ]]; then - pip3 install -r ../python/test/requirements_cu11.txt - else - pip3 install -r ../python/test/requirements_cu12.txt - fi - mpirun -tag-output -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x + mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.github/ISSUE_TEMPLATE/documentation-improvement.md b/.github/ISSUE_TEMPLATE/documentation-improvement.md new file mode 100644 index 000000000..e552d4db8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation-improvement.md @@ -0,0 +1,10 @@ +--- +name: Documentation improvement +about: Enhance or fix documentation +title: "[Doc]" +labels: '' +assignees: '' + +--- + + diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 2db0a91fb..7295171e9 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -13,7 +13,7 @@ jobs: name: Analyze runs-on: 'ubuntu-latest' container: - image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }} + image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda-version }} permissions: actions: read @@ -24,7 +24,7 @@ jobs: fail-fast: false matrix: language: [ 'cpp', 'python' ] - cuda-version: [ 'cuda11.8', 'cuda12.1' ] + cuda-version: [ 'cuda11.8', 'cuda12.2' ] steps: - name: Checkout repository @@ -45,7 +45,7 @@ jobs: - name: Build run: | - MPI_HOME=/usr/local/mpi cmake -DBYPASS_PEERMEM_CHECK=ON . + cmake -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON . make -j - name: Perform CodeQL Analysis diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml index 24dacf9ec..476ae8f76 100644 --- a/.github/workflows/integration-test-backup.yml +++ b/.github/workflows/integration-test-backup.yml @@ -10,10 +10,10 @@ jobs: shell: bash strategy: matrix: - cuda: [ cuda11.8, cuda12.1 ] + cuda: [ cuda11.8, cuda12.2 ] container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}" + image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 steps: @@ -23,7 +23,7 @@ jobs: - name: Build run: | mkdir build && cd build - MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release .. + cmake -DCMAKE_BUILD_TYPE=Release .. 
make -j - name: Lock GPU clock frequency diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index aaffe9578..0c1babcdd 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -20,7 +20,7 @@ jobs: - name: Run cpplint run: | - CPPSOURCES=$(find ./ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*") + CPPSOURCES=$(find ./src ./include ./python ./test -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)') clang-format -style=file --verbose --Werror --dry-run ${CPPSOURCES} pylint: diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml index 620fe46c6..696266c49 100644 --- a/.github/workflows/ut-backup.yml +++ b/.github/workflows/ut-backup.yml @@ -11,10 +11,10 @@ jobs: timeout-minutes: 30 strategy: matrix: - cuda: [ cuda11.8, cuda12.1 ] + cuda: [ cuda11.8, cuda12.2 ] container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}" + image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 steps: @@ -29,7 +29,7 @@ jobs: - name: Build run: | mkdir build && cd build - MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release .. + cmake -DCMAKE_BUILD_TYPE=Release .. make -j working-directory: ${{ github.workspace }} @@ -54,11 +54,11 @@ jobs: - name: PyTests run: | set -e - cd build && make pylib-copy - mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x + mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x - name: ReportCoverage run: | + set -e cd build lcov --capture --directory . --output-file coverage.info lcov --remove coverage.info \ @@ -68,4 +68,4 @@ jobs: '*/test/*' \ '*/tools/*' \ --output-file coverage.info - lcov --list coverage.info + lcov --list coverage.info \ No newline at end of file diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 000000000..1b8c52dad --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,47 @@ +cff-version: 1.2.0 +title: "MSCCL++: A GPU-driven communication stack for scalable AI applications" +version: 0.4.2 +message: >- + If you use this project in your research, please cite it as below. +authors: + - given-names: Peng + family-names: Cheng + affiliation: Microsoft Research + - given-names: Changho + family-names: Hwang + affiliation: Microsoft Research + - given-names: Abhinav + family-names: Jangda + affiliation: Microsoft Research + - given-names: Suriya + family-names: Kalivardhan + affiliation: Microsoft Azure + - given-names: Binyang + family-names: Li + affiliation: Microsoft Azure + - given-names: Shuguang + family-names: Liu + affiliation: Microsoft Azure + - given-names: Saeed + family-names: Maleki + affiliation: Microsoft Research + - given-names: Madan + family-names: Musuvathi + affiliation: Microsoft Research + - given-names: Olli + family-names: Saarikivi + affiliation: Microsoft Research + - given-names: Wei + family-names: Tsui + affiliation: Microsoft Research + - given-names: Ziyue + family-names: Yang + affiliation: Microsoft Research + +repository-code: 'https://github.com/microsoft/mscclpp' +abstract: >- + MSCCL++ redefines the interface for inter-GPU communication, thereby + delivering a highly efficient and customizable communication stack + tailored for distributed GPU applications. 
+license: MIT +license-url: https://github.com/microsoft/mscclpp/blob/main/LICENSE diff --git a/CMakeLists.txt b/CMakeLists.txt index 982f9d568..6b90cbd86 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,18 +2,14 @@ # Licensed under the MIT license. set(MSCCLPP_MAJOR "0") -set(MSCCLPP_MINOR "3") -set(MSCCLPP_PATCH "0") +set(MSCCLPP_MINOR "4") +set(MSCCLPP_PATCH "2") set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR}) set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}") cmake_minimum_required(VERSION 3.25) -project(mscclpp LANGUAGES CUDA CXX) -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CUDA_STANDARD 17) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra") +enable_language(CXX) # Code coverage from https://github.com/codecov/example-cpp11-cmake add_library(coverage_config INTERFACE) @@ -27,75 +23,114 @@ if(CMAKE_BUILD_TYPE MATCHES "Debug" AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang target_link_options(coverage_config INTERFACE --coverage) endif() -list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) - -# Format targets -include(${PROJECT_SOURCE_DIR}/cmake/AddFormatTargets.cmake) +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) # Options option(ENABLE_TRACE "Enable tracing" OFF) option(USE_NPKIT "Use NPKIT" ON) option(BUILD_TESTS "Build tests" ON) option(BUILD_PYTHON_BINDINGS "Build Python bindings" ON) -option(ALLOW_GDRCOPY "Use GDRCopy, if available" OFF) -option(BYPASS_PEERMEM_CHECK "Bypass checking nvidia_peermem" OFF) +option(USE_CUDA "Use NVIDIA/CUDA." OFF) +option(USE_ROCM "Use AMD/ROCm." OFF) +option(BYPASS_GPU_CHECK "Bypass GPU check." OFF) + +if(BYPASS_GPU_CHECK) + if(USE_CUDA) + message("Bypassing GPU check: using NVIDIA/CUDA.") + find_package(CUDAToolkit REQUIRED) + elseif(USE_ROCM) + message("Bypassing GPU check: using AMD/ROCm.") + # Temporal fix for rocm5.6 + set(CMAKE_PREFIX_PATH "/opt/rocm;${CMAKE_PREFIX_PATH}") + find_package(hip REQUIRED) + else() + message(FATAL_ERROR "Bypassing GPU check: neither NVIDIA/CUDA nor AMD/ROCm is specified.") + endif() +else() + # Detect GPUs + include(CheckNvidiaGpu) + include(CheckAmdGpu) + if(NVIDIA_FOUND AND AMD_FOUND) + message("Detected NVIDIA/CUDA and AMD/ROCm: prioritizing NVIDIA/CUDA.") + set(USE_CUDA ON) + set(USE_ROCM OFF) + elseif(NVIDIA_FOUND) + message("Detected NVIDIA/CUDA.") + set(USE_CUDA ON) + set(USE_ROCM OFF) + elseif(AMD_FOUND) + message("Detected AMD/ROCm.") + set(USE_CUDA OFF) + set(USE_ROCM ON) + else() + message(FATAL_ERROR "Neither NVIDIA/CUDA nor AMD/ROCm is found.") + endif() +endif() + +# Declare project +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") +if(USE_CUDA) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra") + project(mscclpp LANGUAGES CXX CUDA) -# Find CUDAToolkit. 
Set CUDA flags based on the detected CUDA version -find_package(CUDAToolkit REQUIRED) -if(CUDAToolkit_FOUND) + # CUDA 11 or higher is required if(CUDAToolkit_VERSION_MAJOR LESS 11) message(FATAL_ERROR "CUDA 11 or higher is required but detected ${CUDAToolkit_VERSION}") endif() + # Set CUDA architectures if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 11) set(CMAKE_CUDA_ARCHITECTURES 80) endif() + # Hopper architecture if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12) set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 90) endif() + + set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver) + set(GPU_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) +else() + set(CMAKE_HIP_STANDARD 17) + set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra") + project(mscclpp LANGUAGES CXX) + + set(CMAKE_HIP_ARCHITECTURES gfx90a gfx941 gfx942) + + set(GPU_LIBRARIES hip::device) + set(GPU_INCLUDE_DIRS ${hip_INCLUDE_DIRS}) endif() -set(CUDA_LIBRARIES CUDA::cudart CUDA::cuda_driver) - -# Find if nvidia_peermem is installed and loaded -if(NOT BYPASS_PEERMEM_CHECK) - execute_process(COMMAND sh -c "lsmod | grep nvidia_peermem" - RESULT_VARIABLE lsmod_result - OUTPUT_VARIABLE lsmod_output) - if(NOT lsmod_result EQUAL 0) - message(FATAL_ERROR "nvidia_peermem is not installed or not loaded.") - endif() -endif() + +# Format targets +include(${PROJECT_SOURCE_DIR}/cmake/AddFormatTargets.cmake) # Find ibverbs and libnuma find_package(IBVerbs REQUIRED) find_package(NUMA REQUIRED) - -# Find optional packages -if(ALLOW_GDRCOPY) - find_package(GDRCopy) -endif() +find_package(Threads REQUIRED) add_library(mscclpp_obj OBJECT) target_include_directories(mscclpp_obj - PRIVATE - ${CUDAToolkit_INCLUDE_DIRS} + SYSTEM PRIVATE + ${GPU_INCLUDE_DIRS} ${IBVERBS_INCLUDE_DIRS} - ${NUMA_INCLUDE_DIRS} - ${GDRCOPY_INCLUDE_DIRS}) -target_link_libraries(mscclpp_obj PRIVATE ${CUDA_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} ${GDRCOPY_LIBRARIES}) + ${NUMA_INCLUDE_DIRS}) +target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} Threads::Threads) target_link_libraries(mscclpp_obj PUBLIC coverage_config) set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) +if(USE_CUDA) + target_compile_definitions(mscclpp_obj PRIVATE USE_CUDA) +elseif(USE_ROCM) + target_compile_definitions(mscclpp_obj PRIVATE USE_ROCM) +endif() if(ENABLE_TRACE) target_compile_definitions(mscclpp_obj PRIVATE ENABLE_TRACE) endif() if(USE_NPKIT) target_compile_definitions(mscclpp_obj PRIVATE ENABLE_NPKIT) endif() -if(ALLOW_GDRCOPY AND GDRCOPY_FOUND) - target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_GDRCOPY) - target_link_libraries(mscclpp_obj PRIVATE MSCCLPP::gdrcopy) -endif() # libmscclpp add_library(mscclpp SHARED) @@ -108,15 +143,19 @@ set_target_properties(mscclpp_static PROPERTIES VERSION ${MSCCLPP_VERSION} SOVER add_subdirectory(include) add_subdirectory(src) +if("${INSTALL_PREFIX}" STREQUAL "") + set(INSTALL_PREFIX "./") +endif() + install(TARGETS mscclpp_obj - FILE_SET HEADERS DESTINATION include) + FILE_SET HEADERS DESTINATION ${INSTALL_PREFIX}/include) install(TARGETS mscclpp - LIBRARY DESTINATION lib) + LIBRARY DESTINATION ${INSTALL_PREFIX}/lib) install(TARGETS mscclpp_static - ARCHIVE DESTINATION lib) + ARCHIVE DESTINATION ${INSTALL_PREFIX}/lib) # Tests -if (BUILD_TESTS) +if(BUILD_TESTS) enable_testing() # Called here to allow ctest from the build directory add_subdirectory(test) endif() diff --git a/README.md b/README.md index 
7f0112ec1..9796179d3 100644
--- a/README.md
+++ b/README.md
@@ -1,32 +1,56 @@
# MSCCL++

-GPU-driven computation & communication stack.
+[![Latest Release](https://img.shields.io/github/release/microsoft/mscclpp.svg)](https://github.com/microsoft/mscclpp/releases/latest)
+[![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE)
+[![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml)

-See [Quick Start](docs/quickstart.md) to quickly get started.
+| Pipelines                | Build Status      |
+|--------------------------|-------------------|
+| Unit Tests (CUDA)        | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-ut?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=4&branchName=main) |
+| Integration Tests (CUDA) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-test?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=3&branchName=main) |
+
+*NOTE (Nov 2023): Azure pipelines for ROCm will be added soon.*

-See the latest performance evaluation on Azure [NDmv4](docs/performance-ndmv4.md).
+A GPU-driven communication stack for scalable AI applications.

-Build our Doxygen document by running `doxygen` in [`docs/`](docs/) directory. Run `python3 -m http.server ` in `docs/doxygen/html/` directory to serve the generated HTML files.
+See [Quick Start](docs/quickstart.md) to quickly get started.

## Overview

-MSCCL++ is a development kit for implementing highly optimized distributed GPU applications, in terms of both inter-GPU communication and GPU computation. MSCCL++ is specially designed for developers who want to fine-tune inter-GPU communication of their applications at the GPU kernel level, without awareness of detailed communication mechanisms. The key underlying concept of MSCCL++ is GPU-driven execution, where both communication and computation tasks are initiated by GPU not by CPU. That is, the communication and computation interfaces of MSCCL++ are provided as device-side APIs (called inside a GPU kernel), while the host-side APIs of MSCCL++ are for bootstrapping, initial connection setups, or background host threads for inter-GPU DMA and RDMA (called proxies). By using MSCCL++, we expect:
+MSCCL++ redefines inter-GPU communication interfaces, thereby delivering a highly efficient and customizable communication stack for distributed GPU applications. Its design is specifically tailored to accommodate diverse performance optimization scenarios often encountered in state-of-the-art AI applications. The figure below provides a high-level overview of MSCCL++ abstractions in CUDA, C, and Python.
+
+| MSCCL++ Abstractions Overview |
+|-------------------------------|
+| ![MSCCL++ Abstractions](docs/figs/abstractions.png) |
+
+The following highlights the key features of MSCCL++.
+
+* **Light-weight and multi-layer abstractions.** MSCCL++ provides communication abstractions at the lowest level, close to the hardware, and at the highest level, close to the application API. The lowest level of abstraction is ultra-lightweight, which enables a user to implement the data-movement logic of a collective operation such as AllReduce inside a GPU kernel extremely efficiently, without worrying about the memory ordering of different ops. The modularity of MSCCL++ enables a user to construct the building blocks of MSCCL++ at a high level of abstraction in Python and feed them to a CUDA kernel, which improves the user's productivity.
+
+* **1-sided 0-copy synchronous and asynchronous abstractions.** MSCCL++ provides fine-grained synchronous and asynchronous 0-copy 1-sided abstractions for communication primitives such as `put()`, `get()`, `signal()`, `flush()`, and `wait()` (see the sketch after this section). The 1-sided abstractions allow a user to asynchronously `put()` their data on the remote GPU as soon as it is ready, without requiring the remote side to issue any receive instruction. This enables users to easily implement flexible communication logic, such as overlapping communication with computation, or implementing customized collective communication algorithms, without worrying about potential deadlocks. Additionally, the 0-copy capability enables MSCCL++ to transfer data directly between users' buffers without intermediate internal buffers, which saves GPU bandwidth and memory capacity.

-* **Holistic Optimization for High GPU Utilization.** As both communication and computation are scheduled inside a GPU kernel at the same time, we can optimize end-to-end performance of distributed GPU applications from a global view. For example, we can minimize the GPU resource contention between communication and computation, which is known to often substantially degrade throughput of distributed deep learning applications.
+* **Unified abstractions for different interconnection hardware.** MSCCL++ provides consistent abstractions regardless of the location of the remote GPU (either on the local node or on a remote node) or the underlying link (either NVLink/xGMI or InfiniBand). This simplifies the code for inter-GPU communication, which is often complex due to the memory ordering of GPU/CPU reads/writes and is therefore error-prone.

-* **Fully Pipelined System to Reduce Overhead from the Control Plane.** We can eliminate control overhead from CPU by allowing GPU to autonomously schedule both communication and computation. This significantly reduces GPU scheduling overhead and CPU-GPU synchronization overhead. For example, this allows us to implement a highly fine-grained system pipelining (i.e., hiding communication delays by overlapping with computation), which has been difficult for CPU-controlled applications due to the large control/scheduling overhead.

-* **Runtime Performance Optimization for Dynamic Workload.** As we can easily implement flexible communication logics, we can optimize communication performance even during runtime. For example, we can implement the system to automatically choose different communication paths or different collective communication algorithms depending on the dynamic workload at runtime.
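> **Editor's note (not part of the patch):** to make the 1-sided, 0-copy primitives listed above concrete, here is a minimal device-side sketch. It assumes a host-initialized `SmChannel`; the method names follow `include/mscclpp/sm_channel_device.hpp`, but treat the exact signatures as assumptions rather than as project API documentation.

```cpp
// Sketch: exchange 1KB with a peer GPU using 1-sided, 0-copy primitives.
// Assumes the host has created the channel and copied this handle to the GPU.
#include <mscclpp/sm_channel_device.hpp>

__constant__ mscclpp::DeviceHandle<mscclpp::SmChannel> chan;

__global__ void exchange() {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    // 1-sided write directly into the peer's registered buffer (0-copy).
    chan.put(/*dstOffset*/ 0, /*srcOffset*/ 0, /*bytes*/ 1024,
             /*threadId*/ 0, /*numThreads*/ 1);
    chan.signal();  // make the write observable to the peer
    chan.wait();    // wait for the peer's matching signal()
  }
}
```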
+## Performance
+
+While the power of MSCCL++ is fully realized with application-specific optimization, it still delivers performance benefits even for collective communication operations. The following figures provide a comparison of the AllReduce throughput of MSCCL++ against NCCL 2.19.3. This benchmark was tested over two [Azure NDmv4 SKUs](https://learn.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series) (8 A100-80G GPUs per node).

-## Key Features (v0.3)
+The key motivation behind these results is scaling inference of LLMs using tensor parallelism. LLM requests are usually executed in two phases: prompt processing and token sampling. Prompt processing uses a large batch size that is usually equal to the request context length, and the corresponding AllReduce size is `len_context*dim_hidden*sizeof(fp16)`. For a context length of 2048 with a hidden dimension of 12288 (GPT-3 size), the AllReduce size is 48MB. Token sampling uses a smaller batch size, which corresponds to the number of concurrent user requests in the system, and the corresponding AllReduce size is `batch_size*dim_hidden*sizeof(fp16)`. For a concurrency of 16 users, the AllReduce size is 384KB. As the figures below demonstrate, MSCCL++ provides a significant speedup over NCCL, which is crucial for serving LLMs efficiently at large scale.

-MSCCL++ v0.3 supports the following features.
+
+| Single-node AllReduce | Two-node AllReduce |
+|-------------------------------|----------------------------|
+| ![MSCCL++ vs NCCL AllReduce (Single-node)](docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg) | ![MSCCL++ vs NCCL AllReduce (Two-node)](docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg) |

-### In-Kernel Communication Interfaces
## Key Concepts
-MSCCL++ provides inter-GPU communication interfaces to be called by a GPU thread. For example, the `put()` method in the following example copies 1KB data from the local GPU to a remote GPU. `channel` is a peer-to-peer communication channel between two GPUs, which consists of information on send/receive buffers. `channel` is initialized from the host side before the kernel execution.
+The following highlights key concepts of MSCCL++.
+
+### On-GPU Communication Interfaces: Channels
+
+MSCCL++ provides peer-to-peer communication methods between GPUs. A peer-to-peer connection between two GPUs is called a *Channel*. Channels are constructed by MSCCL++ host-side interfaces and copied to GPUs during initialization. Channels provide *GPU-side interfaces*, which means that all communication methods are defined as device functions to be called from GPU kernel code. For example, the `put()` method in the following example copies 1KB of data from the local GPU to a remote GPU.

```cpp
+// `ProxyChannel` will be explained in the following section.
__device__ mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel> channel;
__global__ void gpuKernel() {
  ...
@@ -53,11 +77,17 @@ __device__ void barrier() {
}
```

-MSCCL++ provides consistent in-kernel interfaces, i.e., the above interfaces are used regardless of the location of the remote GPU (either on the local node or on a remote node) or the underlying link (either NVLink or InfiniBand).
+MSCCL++ provides consistent interfaces, i.e., the above interfaces are used regardless of the location of the remote GPU (either on the local node or on a remote node) or the underlying link (either NVLink/xGMI or InfiniBand).
+
+### ProxyChannel and SmChannel
+
+MSCCL++ delivers two types of channels, **ProxyChannel** and **SmChannel** (a short device-side sketch follows this section). `ProxyChannel` provides (R)DMA-based data copy and synchronization methods. When called, these methods send/receive a signal to/from a host-side proxy (hence the name `ProxyChannel`), which will trigger (R)DMA (such as `cudaMemcpy*` or `ibv_post_send`) or issue synchronization methods (such as `cudaStreamSynchronize` or `ibv_poll_cq`). Since the key functionalities are run by the proxy, `ProxyChannel` requires only a single GPU thread to call its methods. See all `ProxyChannel` methods [here](./include/mscclpp/proxy_channel_device.hpp).
+
+On the other hand, `SmChannel` provides memory-mapping-based copy and synchronization methods. When called, these methods directly use GPU threads to read/write from/to the remote GPU's memory space. Compared with `ProxyChannel`, `SmChannel` is especially performant for low-latency scenarios, though it may need many GPU threads calling its copy methods concurrently to achieve high copying bandwidth. See all `SmChannel` methods [here](./include/mscclpp/sm_channel_device.hpp).
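> **Editor's note (not part of the patch):** the device-side sketch referenced in the `ProxyChannel`/`SmChannel` section above. Method names follow `include/mscclpp/proxy_channel_device.hpp`; treat the exact signatures as assumptions.

```cpp
// Sketch: a ProxyChannel transfer needs only one GPU thread; the host-side
// proxy performs the actual (R)DMA once it receives the trigger.
#include <mscclpp/proxy_channel_device.hpp>

__constant__ mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan;

__global__ void putViaProxy() {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    proxyChan.putWithSignal(/*offset*/ 0, /*bytes*/ 1 << 20);  // 1MB copy
    proxyChan.flush();  // block until the proxy has completed the transfer
  }
}
```

An `SmChannel` performing the same copy would instead typically be called by many threads of a block so that they move the data cooperatively, which is where its bandwidth comes from.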
### Host-Side Communication Proxy

-Some in-kernel communication interfaces of MSCCL++ send requests (called triggers) to a GPU-external helper that conducts key functionalities such as DMA or RDMA. This helper is called a proxy service or a proxy in short. MSCCL++ provides a default implementation of a proxy, which is a background host thread that busy polls triggers from GPUs and conducts functionalities accordingly. For example, the following is a typical host-side code for MSCCL++.
+MSCCL++ provides a default implementation of a host-side proxy for ProxyChannels, which is a background host thread that busy-polls triggers from GPUs and carries out the requested functionality accordingly. For example, the following is typical host-side code for MSCCL++.

```cpp
// Bootstrap: initialize control-plane connections between all ranks
@@ -120,19 +150,9 @@ public:

Customized proxies can be used for conducting a series of pre-defined data transfers within only a single trigger from GPU at runtime. This would be more efficient than sending a trigger for each data transfer one by one.

-### Flexible Customization
-
-Most of key components of MSCCL++ are designed to be easily customized. This enables MSCCL++ to easily adopt a new software / hardware technology and lets users implement algorithms optimized for their own use cases.
-
-### New in MSCCL++ v0.3 (Latest Release)
-* Updated interfaces
-* Add Python bindings and interfaces
-* Add Python unit tests
-* Add more configurable parameters
-* Add a new single-node AllReduce kernel
-* Fix bugs
+### Python Interfaces

-See details from https://github.com/microsoft/mscclpp/issues/89.
+MSCCL++ provides Python bindings and interfaces, which simplify integration with Python applications.

## Contributing

diff --git a/cmake/AddFormatTargets.cmake b/cmake/AddFormatTargets.cmake
index 71c3ef4ab..b95ad447b 100644
--- a/cmake/AddFormatTargets.cmake
+++ b/cmake/AddFormatTargets.cmake
@@ -26,11 +26,11 @@ find_program(BLACK black)
if (BLACK)
    message(STATUS "Found black: ${BLACK}")
    add_custom_target(check-format-py
-        COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml --check ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test
+        COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml --check ${PROJECT_SOURCE_DIR}
    )
    add_dependencies(check-format check-format-py)
    add_custom_target(format-py
-        COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test
+        COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml ${PROJECT_SOURCE_DIR}
    )
    add_dependencies(format format-py)
else()
diff --git a/cmake/CheckAmdGpu.cmake b/cmake/CheckAmdGpu.cmake
new file mode 100644
index 000000000..3b26bfa5e
--- /dev/null
+++ b/cmake/CheckAmdGpu.cmake
@@ -0,0 +1,25 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+set(AMD_FOUND "FALSE")
+
+set(CMAKE_PREFIX_PATH "/opt/rocm;${CMAKE_PREFIX_PATH}")
+# Temporal fix for rocm5.6
+set(ENV{amd_comgr_DIR} "/opt/rocm/lib/cmake/amd_comgr")
+set(ENV{AMDDeviceLibs_DIR} "/opt/rocm/lib/cmake/AMDDeviceLibs")
+
+find_package(hip QUIET)
+
+if(NOT hip_FOUND)
+  return()
+endif()
+
+enable_language(HIP)
+
+set(CHECK_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cmake/check_amd_gpu.hip")
+
+try_run(RUN_RESULT COMPILE_SUCCESS SOURCES ${CHECK_SRC})
+
+if(COMPILE_SUCCESS AND RUN_RESULT EQUAL 0)
+  set(AMD_FOUND "TRUE")
+endif()
diff --git a/cmake/CheckNvidiaGpu.cmake b/cmake/CheckNvidiaGpu.cmake
new file mode 100644
index 000000000..adc42ea00
--- /dev/null
+++ b/cmake/CheckNvidiaGpu.cmake
@@ -0,0 +1,36 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
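> **Editor's note (not part of the patch):** `CheckAmdGpu.cmake` (above) and `CheckNvidiaGpu.cmake` (continuing below) each compile and run a tiny probe via `try_run`, so auto-detection needs a visible GPU at configure time. A sketch of the resulting configure-time options, using only flags introduced in this diff:

```bash
# Sketch: three ways to configure MSCCL++ under the new GPU-check logic.

# 1) Auto-detect: the try_run probes decide USE_CUDA vs. USE_ROCM.
cmake -DCMAKE_BUILD_TYPE=Release ..

# 2) No GPU visible at configure time (e.g., a CI container): skip the probe.
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..

# 3) AMD platform (per docs/quickstart.md): use HIPCC as the C++ compiler.
CXX=/path/to/hipcc cmake -DCMAKE_BUILD_TYPE=Release ..
```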
+ +set(NVIDIA_FOUND "FALSE") + +find_package(CUDAToolkit) + +if(NOT CUDAToolkit_FOUND) + return() +endif() + +set(CMAKE_CUDA_ARCHITECTURES "60") +if(NOT CMAKE_CUDA_COMPILER) + # In case the CUDA Toolkit directory is not in the PATH + find_program(CUDA_COMPILER + NAMES nvcc + PATHS ${CUDAToolkit_BIN_DIR}) + if(NOT CUDA_COMPILER) + message(WARNING "Could not find nvcc in ${CUDAToolkit_BIN_DIR}") + unset(CMAKE_CUDA_ARCHITECTURES) + return() + endif() + set(CMAKE_CUDA_COMPILER "${CUDA_COMPILER}") +endif() +enable_language(CUDA) + +set(CHECK_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cmake/check_nvidia_gpu.cu") + +try_run(RUN_RESULT COMPILE_SUCCESS SOURCES ${CHECK_SRC}) + +if(COMPILE_SUCCESS AND RUN_RESULT EQUAL 0) + set(NVIDIA_FOUND "TRUE") +else() + unset(CMAKE_CUDA_ARCHITECTURES) + unset(CMAKE_CUDA_COMPILER) +endif() diff --git a/cmake/check_amd_gpu.hip b/cmake/check_amd_gpu.hip new file mode 100644 index 000000000..7537f7edc --- /dev/null +++ b/cmake/check_amd_gpu.hip @@ -0,0 +1,15 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include + +__global__ void kernel() {} + +int main() { + int cnt; + hipError_t err = hipGetDeviceCount(&cnt); + if (err != hipSuccess || cnt == 0) { + return 1; + } + return 0; +} diff --git a/cmake/check_nvidia_gpu.cu b/cmake/check_nvidia_gpu.cu new file mode 100644 index 000000000..672e70f28 --- /dev/null +++ b/cmake/check_nvidia_gpu.cu @@ -0,0 +1,15 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include + +__global__ void kernel() {} + +int main() { + int cnt; + cudaError_t err = cudaGetDeviceCount(&cnt); + if (err != cudaSuccess || cnt == 0) { + return 1; + } + return 0; +} diff --git a/docker/base-cuda12.1.dockerfile b/docker/base-cuda12.1.dockerfile deleted file mode 100644 index 5c5bcd602..000000000 --- a/docker/base-cuda12.1.dockerfile +++ /dev/null @@ -1,59 +0,0 @@ -FROM nvidia/cuda:12.1.1-devel-ubuntu20.04 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV DEBIAN_FRONTEND=noninteractive - -RUN rm -rf /opt/nvidia - -RUN apt-get clean && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - curl \ - git \ - libcap2 \ - libnuma-dev \ - openssh-client \ - openssh-server \ - python3-dev \ - python3-pip \ - python3-setuptools \ - python3-wheel \ - sudo \ - wget \ - && \ - apt-get autoremove && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* /tmp/* - -# Install OFED -ENV OFED_VERSION=5.2-2.2.3.0 -RUN cd /tmp && \ - wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ - tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ - MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ - rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* - -# Install OpenMPI -ENV OPENMPI_VERSION=4.1.5 -RUN cd /tmp && \ - export ompi_v_parsed="$(echo ${OPENMPI_VERSION} | sed -E 's/^([0-9]+)\.([0-9]+)\..*/\1.\2/')" && \ - wget -q https://download.open-mpi.org/release/open-mpi/v${ompi_v_parsed}/openmpi-${OPENMPI_VERSION}.tar.gz && \ - tar xzf openmpi-${OPENMPI_VERSION}.tar.gz && \ - cd openmpi-${OPENMPI_VERSION} && \ - ./configure --prefix=/usr/local/mpi && \ - make -j && \ - make install && \ - cd .. 
&& \ - rm -rf /tmp/openmpi-${OPENMPI_VERSION}* - -ENV PATH="/usr/local/mpi/bin:${PATH}" \ - LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}" - -RUN echo PATH="${PATH}" > /etc/environment && \ - echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment - -ENTRYPOINT [] diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile new file mode 100644 index 000000000..87d3f5c0d --- /dev/null +++ b/docker/base-dev-x.dockerfile @@ -0,0 +1,38 @@ +ARG BASE_IMAGE=ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 +FROM ${BASE_IMAGE} + +LABEL maintainer="MSCCL++" +LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + htop \ + lcov \ + vim \ + && \ + apt-get autoremove && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* + +# Install cmake 3.26.4 +ENV CMAKE_VERSION="3.26.4" +ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ + CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" +RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ + tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \ + rm -rf ${CMAKE_HOME}.tar.gz +ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" + +# Install Python dependencies +ADD . /tmp/mscclpp +WORKDIR /tmp/mscclpp +ARG TARGET="cuda12.1" +RUN cuda_major_version=$(echo ${TARGET} | grep -oP 'cuda\K[0-9]+') && \ + python3 -m pip install --no-cache-dir -r python/requirements_cu${cuda_major_version}.txt + +# Set PATH +RUN echo PATH="${PATH}" > /etc/environment + +# Cleanup +RUN rm -rf /tmp/mscclpp +WORKDIR / diff --git a/docker/base-cuda11.8.dockerfile b/docker/base-x.dockerfile similarity index 87% rename from docker/base-cuda11.8.dockerfile rename to docker/base-x.dockerfile index 22e03443b..bf29f718a 100644 --- a/docker/base-cuda11.8.dockerfile +++ b/docker/base-x.dockerfile @@ -1,4 +1,5 @@ -FROM nvidia/cuda:11.8.0-devel-ubuntu20.04 +ARG BASE_IMAGE=nvidia/cuda:12.1.1-devel-ubuntu20.04 +FROM ${BASE_IMAGE} LABEL maintainer="MSCCL++" LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp @@ -7,8 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive RUN rm -rf /opt/nvidia -RUN apt-get clean && \ - apt-get update && \ +RUN apt-get update && \ apt-get install -y --no-install-recommends \ build-essential \ ca-certificates \ @@ -50,10 +50,12 @@ RUN cd /tmp && \ cd .. 
&& \ rm -rf /tmp/openmpi-${OPENMPI_VERSION}* +ARG EXTRA_LD_PATH=/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64 ENV PATH="/usr/local/mpi/bin:${PATH}" \ - LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-11.8/lib64:${LD_LIBRARY_PATH}" + LD_LIBRARY_PATH="/usr/local/mpi/lib:${EXTRA_LD_PATH}:${LD_LIBRARY_PATH}" RUN echo PATH="${PATH}" > /etc/environment && \ echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment ENTRYPOINT [] +WORKDIR / diff --git a/docker/build.sh b/docker/build.sh new file mode 100755 index 000000000..5b14bcc4c --- /dev/null +++ b/docker/build.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +set -e + +declare -A baseImageTable +baseImageTable=( + ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04" + ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04" + ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" +) + +declare -A extraLdPathTable +extraLdPathTable=( + ["cuda11.8"]="/usr/local/cuda-11.8/lib64" + ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64" + ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64" +) + +GHCR="ghcr.io/microsoft/mscclpp/mscclpp" +TARGET=${1} + +print_usage() { + echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2]" +} + +if [[ ! -v "baseImageTable[${TARGET}]" ]]; then + echo "Invalid target: ${TARGET}" + print_usage + exit 1 +fi +echo "Target: ${TARGET}" + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +cd ${SCRIPT_DIR}/.. + +docker build -t ${GHCR}:base-${TARGET} \ + -f docker/base-x.dockerfile \ + --build-arg BASE_IMAGE=${baseImageTable[${TARGET}]} \ + --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \ + --build-arg TARGET=${TARGET} . + +docker build -t ${GHCR}:base-dev-${TARGET} \ + -f docker/base-dev-x.dockerfile \ + --build-arg BASE_IMAGE=${GHCR}:base-${TARGET} \ + --build-arg TARGET=${TARGET} . diff --git a/docker/dev-cuda11.8.dockerfile b/docker/dev-cuda11.8.dockerfile deleted file mode 100644 index 094772b06..000000000 --- a/docker/dev-cuda11.8.dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \ - CMAKE_VERSION="3.26.4" - -ADD . ${MSCCLPP_SRC_DIR} -WORKDIR ${MSCCLPP_SRC_DIR} - -# Install cmake 3.26.4 -ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" -RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ - tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \ - rm -rf ${CMAKE_HOME}.tar.gz -ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" - -# Install pytest & dependencies -RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu11.txt - -# Set PATH -RUN echo PATH="${PATH}" > /etc/environment - -# Cleanup -WORKDIR / -RUN rm -rf ${MSCCLPP_SRC_DIR} diff --git a/docker/dev-cuda12.1.dockerfile b/docker/dev-cuda12.1.dockerfile deleted file mode 100644 index 70fe684c1..000000000 --- a/docker/dev-cuda12.1.dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \ - CMAKE_VERSION="3.26.4" - -ADD . 
${MSCCLPP_SRC_DIR} -WORKDIR ${MSCCLPP_SRC_DIR} - -# Install cmake 3.26.4 -ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" -RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ - tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local -ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" - -# Install pytest & dependencies -RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu12.txt - -# Set PATH -RUN echo PATH="${PATH}" > /etc/environment - -# Cleanup -WORKDIR / -RUN rm -rf ${MSCCLPP_SRC_DIR} diff --git a/docker/release-cuda11.8.dockerfile b/docker/release-cuda11.8.dockerfile deleted file mode 100644 index 67963c583..000000000 --- a/docker/release-cuda11.8.dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV MSCCLPP_HOME="/usr/local/mscclpp" \ - MSCCLPP_SRC_DIR="/tmp/mscclpp" \ - CMAKE_VERSION="3.26.4" - -# Download cmake 3.26.4 -ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" -RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ - tar xzf ${CMAKE_HOME}.tar.gz -C /tmp - -# Install MSCCL++ -ADD . ${MSCCLPP_SRC_DIR} -WORKDIR ${MSCCLPP_SRC_DIR} -RUN rm -rf build && \ - mkdir build && \ - cd build && \ - ${CMAKE_HOME}/bin/cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${MSCCLPP_HOME} .. && \ - make -j mscclpp && \ - make install/fast && \ - strip ${MSCCLPP_HOME}/lib/libmscclpp.so.[0-9]*.[0-9]*.[0-9]* - -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${MSCCLPP_HOME}/lib" -RUN echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment - -# Cleanup -WORKDIR / -RUN rm -rf ${CMAKE_HOME}* ${MSCCLPP_SRC_DIR} diff --git a/docker/release-cuda12.1.dockerfile b/docker/release-cuda12.1.dockerfile deleted file mode 100644 index 7c1961121..000000000 --- a/docker/release-cuda12.1.dockerfile +++ /dev/null @@ -1,36 +0,0 @@ -FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV MSCCLPP_HOME="/usr/local/mscclpp" \ - MSCCLPP_SRC_DIR="/tmp/mscclpp" \ - CMAKE_VERSION="3.26.4" - -# Download cmake 3.26.4 -ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" -RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ - tar xzf ${CMAKE_HOME}.tar.gz -C /tmp - -# Install MSCCL++ -ADD . ${MSCCLPP_SRC_DIR} -WORKDIR ${MSCCLPP_SRC_DIR} -RUN rm -rf build && \ - mkdir build && \ - cd build && \ - ${CMAKE_HOME}/bin/cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${MSCCLPP_HOME} .. && \ - make -j mscclpp mscclpp_static && \ - make install/fast && \ - strip ${MSCCLPP_HOME}/lib/libmscclpp.so.[0-9]*.[0-9]*.[0-9]* - -# Install MSCCL++ Python bindings -WORKDIR ${MSCCLPP_SRC_DIR} -RUN python3.8 -m pip install . 
-
-ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${MSCCLPP_HOME}/lib"
-RUN echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
-
-# Cleanup
-WORKDIR /
-RUN rm -rf ${CMAKE_HOME}* ${MSCCLPP_SRC_DIR}
diff --git a/docs/.gitignore b/docs/.gitignore
index 94f90d1e6..00d9344fb 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1 +1,3 @@
doxygen/
+_build/
+sphinx/
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 0fa68bf2a..b2d5528e7 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -2043,7 +2043,7 @@ MAN_LINKS = NO
# captures the structure of the code including all documentation.
# The default value is: NO.
-GENERATE_XML = NO
+GENERATE_XML = YES
# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 000000000..d4bb2cbb9
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 000000000..2bb9c1efb
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,27 @@
+## How to build docs
+
+1. Install `doxygen`.
+
+   ```bash
+   $ sudo apt-get install doxygen
+   ```
+
+2. Install the Python packages below. If you install them into your user-local directory, you need to add `~/.local/bin` to `$PATH` (to use `sphinx-build`).
+
+   ```bash
+   $ sudo python3 -m pip install sphinx sphinx_rtd_theme breathe
+   ```
+
+3. Create Doxygen documents.
+
+   ```bash
+   $ doxygen
+   ```
+
+4. Create Sphinx documents.
+
+   ```bash
+   $ sphinx-build -b html -Dbreathe_projects.mscclpp=$PWD/doxygen/xml $PWD $PWD/sphinx
+   ```
+
+5. Done. The HTML files will be in the `sphinx/` directory.
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 000000000..2e6544fa1
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,29 @@
+# Configuration file for the Sphinx documentation builder.
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "mscclpp" +copyright = "2023, MSCCL++ Team" +author = "MSCCL++ Team" +release = "v0.4.2" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ["breathe"] + +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# Breathe configuration +breathe_default_project = "mscclpp" + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "sphinx_rtd_theme" +html_static_path = ["_static"] diff --git a/docs/figs/abstractions.png b/docs/figs/abstractions.png new file mode 100644 index 000000000..e6183aa91 Binary files /dev/null and b/docs/figs/abstractions.png differ diff --git a/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg b/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg new file mode 100644 index 000000000..9c483b986 Binary files /dev/null and b/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg differ diff --git a/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg b/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg new file mode 100644 index 000000000..6c8132565 Binary files /dev/null and b/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg differ diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 000000000..ba060047c --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,26 @@ +.. MSCCL++ documentation master file, created by + sphinx-quickstart on Tue Sep 5 13:03:46 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to MSCCL++'s documentation! +=================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + +Docs +==== + +.. doxygennamespace:: mscclpp + :members: diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 000000000..32bb24529 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/performance-ndmv4.md b/docs/performance-ndmv4.md index 28e38b0e7..4187b3b0a 100644 --- a/docs/performance-ndmv4.md +++ b/docs/performance-ndmv4.md @@ -1,50 +1,3 @@ # NDmv4 Performance -All results from NDmv4. 
NCCL version 2.17.1+cuda11.8, reported in-place numbers. - -nccl-tests command example: -```bash -mpirun --bind-to numa -hostfile /mnt/hostfile --tag-output --allow-run-as-root -map-by ppr:8:node --bind-to numa -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x PATH -x LD_PRELOAD=/mnt/nccl/build/lib/libnccl.so -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/mnt/ndv4-topo.xml -x NCCL_DEBUG=WARN ./build/all_gather_perf -b 1K -e 1K -g 1 -c 1 -w 10 -n 10 -G 1 -``` - -mscclpp-tests command example: -```bash -mpirun -allow-run-as-root -map-by ppr:8:node -hostfile /mnt/hostfile ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1K -w 10 -n 10 -G 10 -k 0 -``` - -**NOTE:** NCCL AllGather leverages Ring algorithm instead of all-pairs alike algorithm, which greatly reduces inter-node transmission, causing significant higher performance. MSCCL++ should do something similar in the future - -### 1 node, 8 gpus/node -**Latency (us)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1K | 12.53 | **16.96** | 9.34 | **7.76** / 21.06 / 28.50 | 157.91 / 143.21 / 447.0 | 326.4 | - -**BusBW (GB/s)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:----------------------------:|:-----------------:| -| 1G | 253.59 | **231.45** | 254.69 | 217.05 / 216.98 / 217.15 | 125.06 / **255.64** / 124.89 | 22.55 | - -### 2 nodes, 1 gpu/node -**Latency (us)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:--------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1K | 16.08 | **21.27** | 29.84 | 14.67 / 29.12 / 35.43 | 15.32 / **13.84** / 26.08 | - | - -**BusBW (GB/s)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1G | 15.84 | **18.65** | 15.48 | 13.94 / 13.83 / 14.10 | **23.30** / 23.29 / 21.60 | - | - -### 2 nodes, 8 gpus/node -**Latency (us)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1K | 33.74 | **35.85** | 49.75 | **22.55** / 39.33 / 56.93 | 159.14 / 230.52 / 462.7 | - | - -**BusBW (GB/s)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1G | 177.05 | **183.82** | 37.80 | 40.17 / 40.18 / 40.23 | 44.19 / 9.31 / 
**209.33** | - |
| 4G | 186.01 | **188.18** | 37.81 | - / - / - | 44.60 / - / **234.08** | - |
-
+TBU
diff --git a/docs/quickstart.md b/docs/quickstart.md
index 9ccf1b6f9..af1bbe5f3 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -8,8 +8,10 @@
* ND_H100_v5
* [NC_A100_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/nc-a100-v4-series) (TBD)
* Non-Azure Systems
- * NVIDIA A100 GPUs + CUDA >= 11.1.1
- * NVIDIA H100 GPUs + CUDA >= 12.0.0
+ * NVIDIA A100 GPUs + CUDA >= 11.8
+ * NVIDIA H100 GPUs + CUDA >= 12.0
+ * AMD MI250X GPUs + ROCm >= 5.7
+ * AMD MI300X GPUs + ROCm >= 5.7
* OS: tested over Ubuntu 18.04 and 20.04
* Libraries: [libnuma](https://github.com/numactl/numactl), MPI (optional)
* Others
@@ -25,10 +27,24 @@ CMake 3.25 or later is required.

```bash
$ git clone https://github.com/microsoft/mscclpp.git
$ mkdir -p mscclpp/build && cd mscclpp/build
+```
+
+For NVIDIA platforms, build MSCCL++ as follows.
+
+```bash
+# For NVIDIA platforms
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j
```

+For AMD platforms, use HIPCC instead of the default C++ compiler. Replace `/path/to/hipcc` in the command below with your HIPCC path.
+
+```bash
+# For AMD platforms
+$ CXX=/path/to/hipcc cmake -DCMAKE_BUILD_TYPE=Release ..
+$ make -j
+```
+
## Install from Source (Libraries and Headers)

```bash
@@ -54,6 +70,8 @@ Our base image installs all prerequisites for MSCCL++.

$ docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
```

+See all available images [here](https://github.com/microsoft/mscclpp/pkgs/container/mscclpp%2Fmscclpp).
+
## Unit Tests

`unit_tests` require one GPU on the system. It only tests operation of basic components.

@@ -76,37 +94,53 @@ To run `mp_unit_tests` with more than two nodes, you need to specify the `-ip_port`

$ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.0.0.5:50000
```

-## mscclpp-test
+## Performance Benchmark
+
+### Python Benchmark

-mscclpp-test is a set of performance benchmarks for MSCCL++. It requires MPI to be installed on the system, and the path should be provided via `MPI_HOME` environment variable to the CMake build system.
+[Install the MSCCL++ Python package](#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.
+
+```bash
+# Choose either `requirements_cu11.txt` or `requirements_cu12.txt` according to your CUDA version.
+$ python3 -m pip install -r ./python/requirements_cu12.txt
+$ mpirun -tag-output -np 8 python3 ./python/benchmark/allreduce_bench.py
+```
+
+### C++ Benchmark (mscclpp-test)
+
+*NOTE: mscclpp-test will be retired soon and will be maintained only as an example of a C++ implementation. If you want to get the latest performance numbers, please use the Python benchmark instead.*
+
+mscclpp-test is a set of C++ performance benchmarks. It requires MPI on the system, and the path should be provided via the `MPI_HOME` environment variable to the CMake build system.

```bash
$ MPI_HOME=/path/to/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
-$ make -j sendrecv_test_perf allgather_test_perf allreduce_test_perf alltoall_test_perf
+$ make -j allgather_test_perf allreduce_test_perf
```

-For example, the following command runs the AllReduce benchmark with 8 GPUs starting from 3MB to 48MB messages, by doubling the message size in between.
+For example, the following command runs the `allreduce5` algorithm with 8 GPUs on message sizes from 3MB to 48MB, doubling the message size in between. You can try different algorithms by changing the `-k 5` option to another value (e.g., `-k 3` runs `allreduce3`). Check all algorithms in the code: [allreduce_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allreduce_test.cu) and [allgather_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allgather_test.cu).

```bash
$ mpirun --bind-to numa -np 8 ./test/mscclpp-test/allreduce_test_perf -b 3m -e 48m -G 100 -n 100 -w 20 -f 2 -k 5
```

+*NOTE: a few algorithms set a condition on the total data size, such as requiring it to be a multiple of 3. If the condition is unmet, the command will report a corresponding error.*
+
Check the help message for more details.

```bash
$ ./test/mscclpp-test/allreduce_test_perf --help
USAGE: allreduce_test_perf
    [-b,--minbytes ]
    [-e,--maxbytes ]
    [-i,--stepbytes ]
    [-f,--stepfactor ]
    [-n,--iters ]
    [-w,--warmup_iters ]
    [-c,--check <0/1>]
    [-T,--timeout