From 877c1f3a08eb92b99bb72c6f6eb9e40508197726 Mon Sep 17 00:00:00 2001
From: surya periaswamy <surya.periaswamy@amd.com>
Date: Tue, 23 Jun 2026 18:08:48 +0000
Subject: [PATCH 1/2] [RCCL] [AICOMRCCL-598] Add DeviceApi unit tests under
 rccl-UnitTestsFixtures

Adds DeviceApi.LsaRemoteRead, DeviceApi.CuMemDisabled, and
DeviceApi.WinDisabled in rccl-UnitTestsFixtures, covering:
- LSA cross-rank peer read: each rank's kernel builds an
  ncclLsaBarrierSession<ncclCoopCta>, syncs across ranks, then reads
  its peer's symmetric-window buffer via ncclGetLsaPointer, and
- ncclDevCommCreate / symmetric-window gating under
  NCCL_CUMEM_ENABLE / NCCL_WIN_ENABLE.

Targets the current device API (NCCL 2.30.4 on develop):
- ncclDevCommRequirements_t is initialized via
  NCCL_DEV_COMM_REQUIREMENTS_INITIALIZER (2.30 validates its
  size/magic/version header; zero-init is rejected).
- The negative tests accept the unsupported-config rejection at
  whichever point the runtime raises it: NCCL 2.30 rejects at
  ncclCommWindowRegister, older releases at ncclDevCommCreate.
- Negative configs pin NCCL_IB_DISABLE=1 (single-node 2-GPU tests do
  not need IB; otherwise the rejection path can surface an
  environment-dependent ncclSystemError from ibv_create_qp that masks
  the clean ncclInvalidUsage gating signal).

Device-API helper headers are included directly
(nccl_device/impl/{core,lsa_barrier}__funcs.h); these are HIP-clean
since PR #6259 added the hip_compat.h cuda::memory_order polyfill.

NCCL bootstrap is pinned to loopback (single-process multi-GPU via
ncclCommInitAll). DeviceApiResources use RAII teardown; the prior
AICOMRCCL-835 teardown segfault was fixed by the symMemoryDropRef
drain in the NCCL 2.28.9 sync.
---
 projects/rccl/CHANGELOG.md            |   1 +
 projects/rccl/test/CMakeLists.txt     |   1 +
 projects/rccl/test/DeviceApiTests.cpp | 456 ++++++++++++++++++++++++++
 3 files changed, 458 insertions(+)
 create mode 100644 projects/rccl/test/DeviceApiTests.cpp
diff --git a/projects/rccl/CHANGELOG.md b/projects/rccl/CHANGELOG.md
index a5214eecf94..f67e1310197 100644
--- a/projects/rccl/CHANGELOG.md
+++ b/projects/rccl/CHANGELOG.md
@@ -33,6 +33,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
 * Added `RCCL_IB_P2P_DISABLE_CTS` to disable CTS offload for P2P connections on AINIC. Defaults to 1 (disabled). When `RCCL_CTS_OFFLOAD_ENABLED=1` is explicitly set, it overrides this flag and forces CTS on all connections including P2P.
 * Merged `RCCL_CTS_INLINE_DATA` into `RCCL_CTS_OFFLOAD_ENABLED`. CTS offload and CTS inline data are now controlled by a single tri-state variable: `-1` (default, auto-enable on AINIC), `0` (force disable), `1` (force enable for all connections).
 * Added Pythonic API bindings under `bindings/nccl4py/` (RCCL fork of NVIDIA `nccl4py` v0.2.0). Provides Python access to RCCL collectives via Cython bindings, an on-disk `cuda.core` HIP shim for ROCm hosts without `cuda-bindings` / `cuda-core`, and RCCL-only collective wrappers (`ncclAllReduceWithBias`, `ncclAllToAllv`).
+* Added unit tests for the RCCL Device API in `rccl-UnitTestsFixtures` (`DeviceApi.LsaRemoteRead`, `DeviceApi.CuMemDisabled`, `DeviceApi.WinDisabled`) covering LSA symmetric remote read and `ncclDevCommCreate` gating under `NCCL_CUMEM_ENABLE` / `NCCL_WIN_ENABLE`.
 
 ### Changed
 * Compatibility with NCCL 2.28.3.
diff --git a/projects/rccl/test/CMakeLists.txt b/projects/rccl/test/CMakeLists.txt
index 44b1baac20c..fb3879f942d 100644
--- a/projects/rccl/test/CMakeLists.txt
+++ b/projects/rccl/test/CMakeLists.txt
@@ -227,6 +227,7 @@ if(BUILD_TESTS)
       VersionInfoTests.cpp
       device/TestOp128.cpp
       device/GinDeviceTests.cpp
+      DeviceApiTests.cpp
       common/main_fixtures.cpp
       common/EnvVars.cpp
       common/ProcessIsolatedTestRunner.cpp
diff --git a/projects/rccl/test/DeviceApiTests.cpp b/projects/rccl/test/DeviceApiTests.cpp
new file mode 100644
index 00000000000..9a354110857
--- /dev/null
+++ b/projects/rccl/test/DeviceApiTests.cpp
@@ -0,0 +1,456 @@
+/*************************************************************************
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <gtest/gtest.h>
+#include <hip/hip_runtime.h>
+#include <rccl/rccl.h>
+
+#include <array>
+#include <chrono>
+#include <cstdlib>
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "nccl_device/impl/core__funcs.h"
+#include "nccl_device/impl/lsa_barrier__funcs.h"
+
+#include "common/ProcessIsolatedTestRunner.hpp"
+
+// `cuda::memory_order` and `cuda::atomic_ref` are polyfilled on HIP via
+// nccl_device/hip_compat.h (transitively included by lsa_barrier__funcs.h),
+// so the test kernel can use the same upstream-NCCL signatures on both
+// platforms. See [RCCL] PR #6259 (a572f1aabf).
+
+namespace RcclUnitTesting
+{
+
+namespace
+{
+
+constexpr int    kPositiveRanks    = 2;
+constexpr int    kNegativeRanks    = 1;
+constexpr int    kBlocksPerRank    = 1;
+constexpr int    kThreadsPerBlock  = 64;
+constexpr size_t kBufferBytes      = sizeof(int);
+constexpr int    kNegativeTestSeed = 7;
+
+// Each rank reads one integer from its peer through a symmetric window.
+__global__ void lsaReadPeerValueKernel(
+    ncclWindow_t inputWindow, int* outputValue, ncclDevComm_t devComm
+)
+{
+    ncclLsaBarrierSession<ncclCoopCta> barrier(
+        ncclCoopCta(), devComm, ncclTeamLsa(devComm), devComm.lsaBarrier, blockIdx.x
+    );
+    barrier.sync(ncclCoopCta(), cuda::memory_order_relaxed);
+
+    if(threadIdx.x == 0)
+    {
+        const int peer = (devComm.rank + 1) % devComm.nRanks;
+        int*      peerInput
+            = reinterpret_cast<int*>(ncclGetLsaPointer(inputWindow, 0, peer));
+        outputValue[0] = peerInput[0];
+    }
+
+    barrier.sync(ncclCoopCta(), cuda::memory_order_release);
+}
+
+struct DeviceApiRankResources
+{
+    int            device         = -1;
+    ncclComm_t     comm           = nullptr;
+    hipStream_t    stream         = nullptr;
+    int*           inputBuffer    = nullptr;
+    int*           outputBuffer   = nullptr;
+    ncclWindow_t   inputWindow    = nullptr;
+    ncclDevComm_t  devComm        = {};
+    bool           devCommCreated = false;
+};
+
+struct DeviceApiResources
+{
+    explicit DeviceApiResources(int rankCount)
+        : ranks(static_cast<size_t>(rankCount))
+    {
+        for(int rank = 0; rank < rankCount; ++rank)
+            ranks[rank].device = rank;
+    }
+
+    ~DeviceApiResources()
+    {
+        for(auto& rank : ranks)
+        {
+            if(rank.device >= 0)
+                (void)hipSetDevice(rank.device);
+
+            if(rank.stream != nullptr)
+                (void)hipStreamSynchronize(rank.stream);
+
+            if(rank.devCommCreated && rank.comm != nullptr)
+                (void)ncclDevCommDestroy(rank.comm, &rank.devComm);
+
+            if(rank.inputWindow != nullptr && rank.comm != nullptr)
+                (void)ncclCommWindowDeregister(rank.comm, rank.inputWindow);
+
+            if(rank.outputBuffer != nullptr)
+                (void)hipFree(rank.outputBuffer);
+
+            if(rank.inputBuffer != nullptr)
+                (void)ncclMemFree(rank.inputBuffer);
+
+            if(rank.stream != nullptr)
+                (void)hipStreamDestroy(rank.stream);
+
+            if(rank.comm != nullptr)
+                (void)ncclCommDestroy(rank.comm);
+        }
+    }
+
+    std::vector<DeviceApiRankResources> ranks;
+};
+
+static int getVisibleGpuCount()
+{
+    int gpuCount = 0;
+    return hipGetDeviceCount(&gpuCount) == hipSuccess ? gpuCount : 0;
+}
+
+static bool hasFullDirectP2p(int gpuCount)
+{
+    for(int src = 0; src < gpuCount; ++src)
+    {
+        for(int dst = 0; dst < gpuCount; ++dst)
+        {
+            if(src == dst)
+                continue;
+
+            int canAccessPeer = 0;
+            if(hipDeviceCanAccessPeer(&canAccessPeer, src, dst) != hipSuccess || !canAccessPeer)
+                return false;
+        }
+    }
+
+    return true;
+}
+
+static void initializeCommunicators(DeviceApiResources& resources)
+{
+    std::vector<ncclComm_t> comms(resources.ranks.size(), nullptr);
+
+    ASSERT_EQ(
+        ncclCommInitAll(comms.data(), static_cast<int>(comms.size()), nullptr), ncclSuccess
+    );
+
+    for(size_t rank = 0; rank < resources.ranks.size(); ++rank)
+        resources.ranks[rank].comm = comms[rank];
+}
+
+static void allocateInputBuffer(DeviceApiRankResources& rank, int inputValue)
+{
+    ASSERT_EQ(hipSetDevice(rank.device), hipSuccess);
+
+    void* rawInput = nullptr;
+    ASSERT_EQ(ncclMemAlloc(&rawInput, kBufferBytes), ncclSuccess);
+    rank.inputBuffer = static_cast<int*>(rawInput);
+
+    ASSERT_EQ(
+        hipMemcpy(rank.inputBuffer, &inputValue, kBufferBytes, hipMemcpyHostToDevice),
+        hipSuccess
+    );
+}
+
+static void allocatePositiveBuffers(
+    DeviceApiResources& resources, const std::array<int, kPositiveRanks>& inputValues
+)
+{
+    for(size_t rankIdx = 0; rankIdx < resources.ranks.size(); ++rankIdx)
+    {
+        auto& rank = resources.ranks[rankIdx];
+        ASSERT_EQ(hipSetDevice(rank.device), hipSuccess);
+        ASSERT_EQ(hipStreamCreate(&rank.stream), hipSuccess);
+
+        allocateInputBuffer(rank, inputValues[rankIdx]);
+
+        ASSERT_EQ(hipMalloc(reinterpret_cast<void**>(&rank.outputBuffer), kBufferBytes), hipSuccess);
+        ASSERT_EQ(hipMemset(rank.outputBuffer, 0, kBufferBytes), hipSuccess);
+    }
+}
+
+static void registerInputWindows(DeviceApiResources& resources)
+{
+    ASSERT_EQ(ncclGroupStart(), ncclSuccess);
+
+    std::vector<ncclResult_t> results(resources.ranks.size(), ncclSuccess);
+    for(size_t rankIdx = 0; rankIdx < resources.ranks.size(); ++rankIdx)
+    {
+        auto& rank   = resources.ranks[rankIdx];
+        results[rankIdx] = ncclCommWindowRegister(
+            rank.comm,
+            rank.inputBuffer,
+            kBufferBytes,
+            &rank.inputWindow,
+            NCCL_WIN_COLL_SYMMETRIC
+        );
+    }
+
+    const ncclResult_t groupResult = ncclGroupEnd();
+
+    for(const auto& result : results)
+        ASSERT_EQ(result, ncclSuccess);
+    ASSERT_EQ(groupResult, ncclSuccess);
+}
+
+// Like registerInputWindows() but returns the registration result instead of
+// asserting success. Used by the negative tests: in NCCL 2.30 an unsupported
+// configuration (cuMem / symmetric windows disabled) is rejected at
+// ncclCommWindowRegister, whereas older releases accepted registration and
+// rejected later at ncclDevCommCreate. Returns the first non-success per-rank
+// result, otherwise the ncclGroupEnd() result.
+static ncclResult_t tryRegisterInputWindows(DeviceApiResources& resources)
+{
+    if(ncclGroupStart() != ncclSuccess)
+        return ncclInternalError;
+
+    std::vector<ncclResult_t> results(resources.ranks.size(), ncclSuccess);
+    for(size_t rankIdx = 0; rankIdx < resources.ranks.size(); ++rankIdx)
+    {
+        auto& rank       = resources.ranks[rankIdx];
+        results[rankIdx] = ncclCommWindowRegister(
+            rank.comm,
+            rank.inputBuffer,
+            kBufferBytes,
+            &rank.inputWindow,
+            NCCL_WIN_COLL_SYMMETRIC
+        );
+    }
+
+    const ncclResult_t groupResult = ncclGroupEnd();
+
+    for(const auto& result : results)
+        if(result != ncclSuccess)
+            return result;
+    return groupResult;
+}
+
+static void clearHipErrorState()
+{
+    (void)hipGetLastError();
+}
+
+static void runPositiveLsaRemoteReadTest()
+{
+    if(getVisibleGpuCount() < kPositiveRanks)
+        GTEST_SKIP() << "This test requires at least 2 visible GPUs.";
+
+    if(!hasFullDirectP2p(kPositiveRanks))
+        GTEST_SKIP() << "This test requires direct P2P access between the first 2 GPUs.";
+
+    DeviceApiResources resources(kPositiveRanks);
+    initializeCommunicators(resources);
+
+    const std::array<int, kPositiveRanks> inputValues = {7, 11};
+    allocatePositiveBuffers(resources, inputValues);
+
+    registerInputWindows(resources);
+
+    for(const auto& rank : resources.ranks)
+    {
+        if(rank.inputWindow == nullptr)
+            GTEST_SKIP() << "Symmetric window registration is unavailable on this configuration.";
+    }
+
+    ncclDevCommRequirements_t requirements = NCCL_DEV_COMM_REQUIREMENTS_INITIALIZER;
+    requirements.lsaBarrierCount           = kBlocksPerRank;
+
+    ASSERT_EQ(ncclGroupStart(), ncclSuccess);
+
+    std::vector<ncclResult_t> createResults(resources.ranks.size(), ncclSuccess);
+    for(size_t rankIdx = 0; rankIdx < resources.ranks.size(); ++rankIdx)
+    {
+        auto& rank          = resources.ranks[rankIdx];
+        createResults[rankIdx] = ncclDevCommCreate(rank.comm, &requirements, &rank.devComm);
+    }
+
+    const ncclResult_t groupResult = ncclGroupEnd();
+
+    for(size_t rankIdx = 0; rankIdx < resources.ranks.size(); ++rankIdx)
+    {
+        if(createResults[rankIdx] == ncclSuccess)
+            resources.ranks[rankIdx].devCommCreated = true;
+    }
+
+    bool unsupportedConfiguration = (groupResult == ncclInvalidUsage);
+    for(const auto& result : createResults)
+        unsupportedConfiguration |= (result == ncclInvalidUsage);
+
+    if(unsupportedConfiguration)
+        GTEST_SKIP() << "Symmetric device API is unsupported on this configuration.";
+
+    for(const auto& result : createResults)
+        ASSERT_EQ(result, ncclSuccess);
+    ASSERT_EQ(groupResult, ncclSuccess);
+
+    for(auto& rank : resources.ranks)
+    {
+        ASSERT_EQ(hipSetDevice(rank.device), hipSuccess);
+        clearHipErrorState();
+
+        hipLaunchKernelGGL(
+            lsaReadPeerValueKernel,
+            dim3(kBlocksPerRank),
+            dim3(kThreadsPerBlock),
+            0,
+            rank.stream,
+            rank.inputWindow,
+            rank.outputBuffer,
+            rank.devComm
+        );
+        const hipError_t launchError = hipGetLastError();
+        ASSERT_EQ(launchError, hipSuccess)
+            << "lsaReadPeerValueKernel launch failed on device " << rank.device << ": "
+            << hipGetErrorString(launchError);
+    }
+
+    for(auto& rank : resources.ranks)
+    {
+        ASSERT_EQ(hipSetDevice(rank.device), hipSuccess);
+        ASSERT_EQ(hipStreamSynchronize(rank.stream), hipSuccess);
+    }
+
+    const std::array<int, kPositiveRanks> expectedOutputs = {inputValues[1], inputValues[0]};
+
+    for(size_t rankIdx = 0; rankIdx < resources.ranks.size(); ++rankIdx)
+    {
+        auto& rank = resources.ranks[rankIdx];
+        int   hostOutput = 0;
+
+        ASSERT_EQ(hipSetDevice(rank.device), hipSuccess);
+        ASSERT_EQ(
+            hipMemcpy(&hostOutput, rank.outputBuffer, kBufferBytes, hipMemcpyDeviceToHost),
+            hipSuccess
+        );
+        EXPECT_EQ(hostOutput, expectedOutputs[rankIdx]);
+    }
+}
+
+static void runDevCommCreateFailureTest()
+{
+    if(getVisibleGpuCount() < kNegativeRanks)
+        GTEST_SKIP() << "This test requires at least 1 visible GPU.";
+
+    DeviceApiResources resources(kNegativeRanks);
+    initializeCommunicators(resources);
+    allocateInputBuffer(resources.ranks[0], kNegativeTestSeed);
+
+    // The device API is gated off in these configs (cuMem / symmetric windows
+    // disabled). NCCL 2.30 rejects the unsupported configuration at symmetric-
+    // window registration; older releases accepted registration and rejected
+    // later at ncclDevCommCreate. Accept the rejection at whichever point the
+    // runtime raises it.
+    const ncclResult_t registerResult = tryRegisterInputWindows(resources);
+    if(registerResult != ncclSuccess)
+    {
+        EXPECT_EQ(registerResult, ncclInvalidUsage);
+        return;
+    }
+
+    ncclDevCommRequirements_t requirements = NCCL_DEV_COMM_REQUIREMENTS_INITIALIZER;
+    requirements.lsaBarrierCount           = kBlocksPerRank;
+
+    const ncclResult_t createResult
+        = ncclDevCommCreate(resources.ranks[0].comm, &requirements, &resources.ranks[0].devComm);
+
+    EXPECT_EQ(createResult, ncclInvalidUsage);
+    if(createResult == ncclSuccess)
+        resources.ranks[0].devCommCreated = true;
+}
+
+// Per-test config notes:
+//   - These are single-process multi-GPU tests (ncclCommInitAll). All rank-to-
+//     rank bootstrap traffic is intra-host by construction, so we pin NCCL's
+//     bootstrap socket to loopback. This makes the tests self-contained on
+//     any host network configuration (single-NIC, multi-NIC, containers with
+//     shared host netns, etc.) without relying on the caller to set
+//     NCCL_SOCKET_IFNAME.
+//   - withNumGpus(N) declares how many physical GPUs the test occupies so
+//     ProcessIsolatedTestRunner can schedule it correctly under
+//     maxParallelJobs > 1. The positive test runs 2 ranks (one per GPU); the
+//     negative tests run 1 rank but still hipSetDevice / hipMalloc / call
+//     ncclCommInitAll on a real GPU, so they declare 1 slot too.
+static ProcessIsolatedTestRunner::TestConfig makeDeviceApiEnabledConfig(
+    const std::string& name, std::function<void()> testFn
+)
+{
+    return ProcessIsolatedTestRunner::TestConfig(name, testFn)
+        .withEnvironment({{"NCCL_CUMEM_ENABLE", "1"},
+                          {"NCCL_WIN_ENABLE", "1"},
+                          {"NCCL_SOCKET_IFNAME", "lo"}})
+        .withTimeout(std::chrono::seconds(60))
+        .withNumGpus(kPositiveRanks);
+}
+
+// The negative configs additionally pin NCCL_IB_DISABLE=1: these are single-
+// node 2-GPU tests that don't need IB/RDMA, and on the unsupported-config
+// rejection path NCCL 2.30 otherwise enters the IB transport and can surface
+// an environment-dependent ncclSystemError (ibv_create_qp) that masks the
+// clean ncclInvalidUsage gating signal the test is asserting.
+static ProcessIsolatedTestRunner::TestConfig makeCuMemDisabledConfig(
+    const std::string& name, std::function<void()> testFn
+)
+{
+    return ProcessIsolatedTestRunner::TestConfig(name, testFn)
+        .withEnvironment({{"NCCL_CUMEM_ENABLE", "0"},
+                          {"NCCL_WIN_ENABLE", "1"},
+                          {"NCCL_SOCKET_IFNAME", "lo"},
+                          {"NCCL_IB_DISABLE", "1"}})
+        .withTimeout(std::chrono::seconds(60))
+        .withNumGpus(kNegativeRanks);
+}
+
+static ProcessIsolatedTestRunner::TestConfig makeWinDisabledConfig(
+    const std::string& name, std::function<void()> testFn
+)
+{
+    return ProcessIsolatedTestRunner::TestConfig(name, testFn)
+        .withEnvironment({{"NCCL_CUMEM_ENABLE", "1"},
+                          {"NCCL_WIN_ENABLE", "0"},
+                          {"NCCL_SOCKET_IFNAME", "lo"},
+                          {"NCCL_IB_DISABLE", "1"}})
+        .withTimeout(std::chrono::seconds(60))
+        .withNumGpus(kNegativeRanks);
+}
+
+} // namespace
+
+TEST(DeviceApi, LsaRemoteRead)
+{
+    RUN_ISOLATED_TESTS(
+        makeDeviceApiEnabledConfig(
+            "DeviceApi.LsaRemoteRead", []() { runPositiveLsaRemoteReadTest(); }
+        )
+    );
+}
+
+TEST(DeviceApi, CuMemDisabled)
+{
+    RUN_ISOLATED_TESTS(
+        makeCuMemDisabledConfig(
+            "DeviceApi.CuMemDisabled", []() { runDevCommCreateFailureTest(); }
+        )
+    );
+}
+
+TEST(DeviceApi, WinDisabled)
+{
+    RUN_ISOLATED_TESTS(
+        makeWinDisabledConfig(
+            "DeviceApi.WinDisabled", []() { runDevCommCreateFailureTest(); }
+        )
+    );
+}
+
+} // namespace RcclUnitTesting

From 62c6e68a6d6bdf447b76676beb6630ebf444f72c Mon Sep 17 00:00:00 2001
From: surya periaswamy <surya.periaswamy@amd.com>
Date: Tue, 23 Jun 2026 18:08:48 +0000
Subject: [PATCH 2/2] [RCCL] [AICOMRCCL-598] test_runner: route DeviceApi suite
 to rccl-UnitTestsFixtures

Add a DeviceApi entry under the unit_tests_fixtures block so the
test_runner invokes the binary that contains these tests.
---
 .../scripts/test_runner/configs/mi300x_mellanox_ib.json      | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/projects/rccl/tools/scripts/test_runner/configs/mi300x_mellanox_ib.json b/projects/rccl/tools/scripts/test_runner/configs/mi300x_mellanox_ib.json
index eb05c8a511e..e8301c4fd19 100644
--- a/projects/rccl/tools/scripts/test_runner/configs/mi300x_mellanox_ib.json
+++ b/projects/rccl/tools/scripts/test_runner/configs/mi300x_mellanox_ib.json
@@ -469,6 +469,11 @@
           "description": "loadPack/storePack three internal paths (direct cast, funnel-shift, element-by-element) and boundary",
           "test_filter": "PackRoundtripTest.LoadStorePack_*:PackRoundtripTest.L2_StorePackBoundary:PackRoundtripTest.Store16Load16Global_Direct"
         },
+        {
+          "name": "DeviceApi",
+          "description": "Verify RCCL Device API: symmetric window registration and ncclDevCommCreate gating",
+          "test_filter": "DeviceApi.*"
+        },
         {
           "name": "Fixtures_All",
           "description": "All release fixture tests"