From 877c1f3a08eb92b99bb72c6f6eb9e40508197726 Mon Sep 17 00:00:00 2001 From: surya periaswamy Date: Tue, 23 Jun 2026 18:08:48 +0000 Subject: [PATCH 1/2] [RCCL] [AICOMRCCL-598] Add DeviceApi unit tests under rccl-UnitTestsFixtures Adds DeviceApi.LsaRemoteRead, DeviceApi.CuMemDisabled, and DeviceApi.WinDisabled in rccl-UnitTestsFixtures, covering: - LSA cross-rank peer read: each rank's kernel builds an ncclLsaBarrierSession, syncs across ranks, then reads its peer's symmetric-window buffer via ncclGetLsaPointer, and - ncclDevCommCreate / symmetric-window gating under NCCL_CUMEM_ENABLE / NCCL_WIN_ENABLE. Targets the current device API (NCCL 2.30.4 on develop): - ncclDevCommRequirements_t is initialized via NCCL_DEV_COMM_REQUIREMENTS_INITIALIZER (2.30 validates its size/magic/version header; zero-init is rejected). - The negative tests accept the unsupported-config rejection at whichever point the runtime raises it: NCCL 2.30 rejects at ncclCommWindowRegister, older releases at ncclDevCommCreate. - Negative configs pin NCCL_IB_DISABLE=1 (single-node 2-GPU tests do not need IB; otherwise the rejection path can surface an environment-dependent ncclSystemError from ibv_create_qp that masks the clean ncclInvalidUsage gating signal). Device-API helper headers are included directly (nccl_device/impl/{core,lsa_barrier}__funcs.h); these are HIP-clean since PR #6259 added the hip_compat.h cuda::memory_order polyfill. NCCL bootstrap is pinned to loopback (single-process multi-GPU via ncclCommInitAll). DeviceApiResources use RAII teardown; the prior AICOMRCCL-835 teardown segfault was fixed by the symMemoryDropRef drain in the NCCL 2.28.9 sync. --- projects/rccl/CHANGELOG.md | 1 + projects/rccl/test/CMakeLists.txt | 1 + projects/rccl/test/DeviceApiTests.cpp | 456 ++++++++++++++++++++++++++ 3 files changed, 458 insertions(+) create mode 100644 projects/rccl/test/DeviceApiTests.cpp diff --git a/projects/rccl/CHANGELOG.md b/projects/rccl/CHANGELOG.md index a5214eecf94..f67e1310197 100644 --- a/projects/rccl/CHANGELOG.md +++ b/projects/rccl/CHANGELOG.md @@ -33,6 +33,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: * Added `RCCL_IB_P2P_DISABLE_CTS` to disable CTS offload for P2P connections on AINIC. Defaults to 1 (disabled). When `RCCL_CTS_OFFLOAD_ENABLED=1` is explicitly set, it overrides this flag and forces CTS on all connections including P2P. * Merged `RCCL_CTS_INLINE_DATA` into `RCCL_CTS_OFFLOAD_ENABLED`. CTS offload and CTS inline data are now controlled by a single tri-state variable: `-1` (default, auto-enable on AINIC), `0` (force disable), `1` (force enable for all connections). * Added Pythonic API bindings under `bindings/nccl4py/` (RCCL fork of NVIDIA `nccl4py` v0.2.0). Provides Python access to RCCL collectives via Cython bindings, an on-disk `cuda.core` HIP shim for ROCm hosts without `cuda-bindings` / `cuda-core`, and RCCL-only collective wrappers (`ncclAllReduceWithBias`, `ncclAllToAllv`). +* Added unit tests for the RCCL Device API in `rccl-UnitTestsFixtures` (`DeviceApi.LsaRemoteRead`, `DeviceApi.CuMemDisabled`, `DeviceApi.WinDisabled`) covering LSA symmetric remote read and `ncclDevCommCreate` gating under `NCCL_CUMEM_ENABLE` / `NCCL_WIN_ENABLE`. ### Changed * Compatibility with NCCL 2.28.3. diff --git a/projects/rccl/test/CMakeLists.txt b/projects/rccl/test/CMakeLists.txt index 44b1baac20c..fb3879f942d 100644 --- a/projects/rccl/test/CMakeLists.txt +++ b/projects/rccl/test/CMakeLists.txt @@ -227,6 +227,7 @@ if(BUILD_TESTS) VersionInfoTests.cpp device/TestOp128.cpp device/GinDeviceTests.cpp + DeviceApiTests.cpp common/main_fixtures.cpp common/EnvVars.cpp common/ProcessIsolatedTestRunner.cpp diff --git a/projects/rccl/test/DeviceApiTests.cpp b/projects/rccl/test/DeviceApiTests.cpp new file mode 100644 index 00000000000..9a354110857 --- /dev/null +++ b/projects/rccl/test/DeviceApiTests.cpp @@ -0,0 +1,456 @@ +/************************************************************************* + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "nccl_device/impl/core__funcs.h" +#include "nccl_device/impl/lsa_barrier__funcs.h" + +#include "common/ProcessIsolatedTestRunner.hpp" + +// `cuda::memory_order` and `cuda::atomic_ref` are polyfilled on HIP via +// nccl_device/hip_compat.h (transitively included by lsa_barrier__funcs.h), +// so the test kernel can use the same upstream-NCCL signatures on both +// platforms. See [RCCL] PR #6259 (a572f1aabf). + +namespace RcclUnitTesting +{ + +namespace +{ + +constexpr int kPositiveRanks = 2; +constexpr int kNegativeRanks = 1; +constexpr int kBlocksPerRank = 1; +constexpr int kThreadsPerBlock = 64; +constexpr size_t kBufferBytes = sizeof(int); +constexpr int kNegativeTestSeed = 7; + +// Each rank reads one integer from its peer through a symmetric window. +__global__ void lsaReadPeerValueKernel( + ncclWindow_t inputWindow, int* outputValue, ncclDevComm_t devComm +) +{ + ncclLsaBarrierSession barrier( + ncclCoopCta(), devComm, ncclTeamLsa(devComm), devComm.lsaBarrier, blockIdx.x + ); + barrier.sync(ncclCoopCta(), cuda::memory_order_relaxed); + + if(threadIdx.x == 0) + { + const int peer = (devComm.rank + 1) % devComm.nRanks; + int* peerInput + = reinterpret_cast(ncclGetLsaPointer(inputWindow, 0, peer)); + outputValue[0] = peerInput[0]; + } + + barrier.sync(ncclCoopCta(), cuda::memory_order_release); +} + +struct DeviceApiRankResources +{ + int device = -1; + ncclComm_t comm = nullptr; + hipStream_t stream = nullptr; + int* inputBuffer = nullptr; + int* outputBuffer = nullptr; + ncclWindow_t inputWindow = nullptr; + ncclDevComm_t devComm = {}; + bool devCommCreated = false; +}; + +struct DeviceApiResources +{ + explicit DeviceApiResources(int rankCount) + : ranks(static_cast(rankCount)) + { + for(int rank = 0; rank < rankCount; ++rank) + ranks[rank].device = rank; + } + + ~DeviceApiResources() + { + for(auto& rank : ranks) + { + if(rank.device >= 0) + (void)hipSetDevice(rank.device); + + if(rank.stream != nullptr) + (void)hipStreamSynchronize(rank.stream); + + if(rank.devCommCreated && rank.comm != nullptr) + (void)ncclDevCommDestroy(rank.comm, &rank.devComm); + + if(rank.inputWindow != nullptr && rank.comm != nullptr) + (void)ncclCommWindowDeregister(rank.comm, rank.inputWindow); + + if(rank.outputBuffer != nullptr) + (void)hipFree(rank.outputBuffer); + + if(rank.inputBuffer != nullptr) + (void)ncclMemFree(rank.inputBuffer); + + if(rank.stream != nullptr) + (void)hipStreamDestroy(rank.stream); + + if(rank.comm != nullptr) + (void)ncclCommDestroy(rank.comm); + } + } + + std::vector ranks; +}; + +static int getVisibleGpuCount() +{ + int gpuCount = 0; + return hipGetDeviceCount(&gpuCount) == hipSuccess ? gpuCount : 0; +} + +static bool hasFullDirectP2p(int gpuCount) +{ + for(int src = 0; src < gpuCount; ++src) + { + for(int dst = 0; dst < gpuCount; ++dst) + { + if(src == dst) + continue; + + int canAccessPeer = 0; + if(hipDeviceCanAccessPeer(&canAccessPeer, src, dst) != hipSuccess || !canAccessPeer) + return false; + } + } + + return true; +} + +static void initializeCommunicators(DeviceApiResources& resources) +{ + std::vector comms(resources.ranks.size(), nullptr); + + ASSERT_EQ( + ncclCommInitAll(comms.data(), static_cast(comms.size()), nullptr), ncclSuccess + ); + + for(size_t rank = 0; rank < resources.ranks.size(); ++rank) + resources.ranks[rank].comm = comms[rank]; +} + +static void allocateInputBuffer(DeviceApiRankResources& rank, int inputValue) +{ + ASSERT_EQ(hipSetDevice(rank.device), hipSuccess); + + void* rawInput = nullptr; + ASSERT_EQ(ncclMemAlloc(&rawInput, kBufferBytes), ncclSuccess); + rank.inputBuffer = static_cast(rawInput); + + ASSERT_EQ( + hipMemcpy(rank.inputBuffer, &inputValue, kBufferBytes, hipMemcpyHostToDevice), + hipSuccess + ); +} + +static void allocatePositiveBuffers( + DeviceApiResources& resources, const std::array& inputValues +) +{ + for(size_t rankIdx = 0; rankIdx < resources.ranks.size(); ++rankIdx) + { + auto& rank = resources.ranks[rankIdx]; + ASSERT_EQ(hipSetDevice(rank.device), hipSuccess); + ASSERT_EQ(hipStreamCreate(&rank.stream), hipSuccess); + + allocateInputBuffer(rank, inputValues[rankIdx]); + + ASSERT_EQ(hipMalloc(reinterpret_cast(&rank.outputBuffer), kBufferBytes), hipSuccess); + ASSERT_EQ(hipMemset(rank.outputBuffer, 0, kBufferBytes), hipSuccess); + } +} + +static void registerInputWindows(DeviceApiResources& resources) +{ + ASSERT_EQ(ncclGroupStart(), ncclSuccess); + + std::vector results(resources.ranks.size(), ncclSuccess); + for(size_t rankIdx = 0; rankIdx < resources.ranks.size(); ++rankIdx) + { + auto& rank = resources.ranks[rankIdx]; + results[rankIdx] = ncclCommWindowRegister( + rank.comm, + rank.inputBuffer, + kBufferBytes, + &rank.inputWindow, + NCCL_WIN_COLL_SYMMETRIC + ); + } + + const ncclResult_t groupResult = ncclGroupEnd(); + + for(const auto& result : results) + ASSERT_EQ(result, ncclSuccess); + ASSERT_EQ(groupResult, ncclSuccess); +} + +// Like registerInputWindows() but returns the registration result instead of +// asserting success. Used by the negative tests: in NCCL 2.30 an unsupported +// configuration (cuMem / symmetric windows disabled) is rejected at +// ncclCommWindowRegister, whereas older releases accepted registration and +// rejected later at ncclDevCommCreate. Returns the first non-success per-rank +// result, otherwise the ncclGroupEnd() result. +static ncclResult_t tryRegisterInputWindows(DeviceApiResources& resources) +{ + if(ncclGroupStart() != ncclSuccess) + return ncclInternalError; + + std::vector results(resources.ranks.size(), ncclSuccess); + for(size_t rankIdx = 0; rankIdx < resources.ranks.size(); ++rankIdx) + { + auto& rank = resources.ranks[rankIdx]; + results[rankIdx] = ncclCommWindowRegister( + rank.comm, + rank.inputBuffer, + kBufferBytes, + &rank.inputWindow, + NCCL_WIN_COLL_SYMMETRIC + ); + } + + const ncclResult_t groupResult = ncclGroupEnd(); + + for(const auto& result : results) + if(result != ncclSuccess) + return result; + return groupResult; +} + +static void clearHipErrorState() +{ + (void)hipGetLastError(); +} + +static void runPositiveLsaRemoteReadTest() +{ + if(getVisibleGpuCount() < kPositiveRanks) + GTEST_SKIP() << "This test requires at least 2 visible GPUs."; + + if(!hasFullDirectP2p(kPositiveRanks)) + GTEST_SKIP() << "This test requires direct P2P access between the first 2 GPUs."; + + DeviceApiResources resources(kPositiveRanks); + initializeCommunicators(resources); + + const std::array inputValues = {7, 11}; + allocatePositiveBuffers(resources, inputValues); + + registerInputWindows(resources); + + for(const auto& rank : resources.ranks) + { + if(rank.inputWindow == nullptr) + GTEST_SKIP() << "Symmetric window registration is unavailable on this configuration."; + } + + ncclDevCommRequirements_t requirements = NCCL_DEV_COMM_REQUIREMENTS_INITIALIZER; + requirements.lsaBarrierCount = kBlocksPerRank; + + ASSERT_EQ(ncclGroupStart(), ncclSuccess); + + std::vector createResults(resources.ranks.size(), ncclSuccess); + for(size_t rankIdx = 0; rankIdx < resources.ranks.size(); ++rankIdx) + { + auto& rank = resources.ranks[rankIdx]; + createResults[rankIdx] = ncclDevCommCreate(rank.comm, &requirements, &rank.devComm); + } + + const ncclResult_t groupResult = ncclGroupEnd(); + + for(size_t rankIdx = 0; rankIdx < resources.ranks.size(); ++rankIdx) + { + if(createResults[rankIdx] == ncclSuccess) + resources.ranks[rankIdx].devCommCreated = true; + } + + bool unsupportedConfiguration = (groupResult == ncclInvalidUsage); + for(const auto& result : createResults) + unsupportedConfiguration |= (result == ncclInvalidUsage); + + if(unsupportedConfiguration) + GTEST_SKIP() << "Symmetric device API is unsupported on this configuration."; + + for(const auto& result : createResults) + ASSERT_EQ(result, ncclSuccess); + ASSERT_EQ(groupResult, ncclSuccess); + + for(auto& rank : resources.ranks) + { + ASSERT_EQ(hipSetDevice(rank.device), hipSuccess); + clearHipErrorState(); + + hipLaunchKernelGGL( + lsaReadPeerValueKernel, + dim3(kBlocksPerRank), + dim3(kThreadsPerBlock), + 0, + rank.stream, + rank.inputWindow, + rank.outputBuffer, + rank.devComm + ); + const hipError_t launchError = hipGetLastError(); + ASSERT_EQ(launchError, hipSuccess) + << "lsaReadPeerValueKernel launch failed on device " << rank.device << ": " + << hipGetErrorString(launchError); + } + + for(auto& rank : resources.ranks) + { + ASSERT_EQ(hipSetDevice(rank.device), hipSuccess); + ASSERT_EQ(hipStreamSynchronize(rank.stream), hipSuccess); + } + + const std::array expectedOutputs = {inputValues[1], inputValues[0]}; + + for(size_t rankIdx = 0; rankIdx < resources.ranks.size(); ++rankIdx) + { + auto& rank = resources.ranks[rankIdx]; + int hostOutput = 0; + + ASSERT_EQ(hipSetDevice(rank.device), hipSuccess); + ASSERT_EQ( + hipMemcpy(&hostOutput, rank.outputBuffer, kBufferBytes, hipMemcpyDeviceToHost), + hipSuccess + ); + EXPECT_EQ(hostOutput, expectedOutputs[rankIdx]); + } +} + +static void runDevCommCreateFailureTest() +{ + if(getVisibleGpuCount() < kNegativeRanks) + GTEST_SKIP() << "This test requires at least 1 visible GPU."; + + DeviceApiResources resources(kNegativeRanks); + initializeCommunicators(resources); + allocateInputBuffer(resources.ranks[0], kNegativeTestSeed); + + // The device API is gated off in these configs (cuMem / symmetric windows + // disabled). NCCL 2.30 rejects the unsupported configuration at symmetric- + // window registration; older releases accepted registration and rejected + // later at ncclDevCommCreate. Accept the rejection at whichever point the + // runtime raises it. + const ncclResult_t registerResult = tryRegisterInputWindows(resources); + if(registerResult != ncclSuccess) + { + EXPECT_EQ(registerResult, ncclInvalidUsage); + return; + } + + ncclDevCommRequirements_t requirements = NCCL_DEV_COMM_REQUIREMENTS_INITIALIZER; + requirements.lsaBarrierCount = kBlocksPerRank; + + const ncclResult_t createResult + = ncclDevCommCreate(resources.ranks[0].comm, &requirements, &resources.ranks[0].devComm); + + EXPECT_EQ(createResult, ncclInvalidUsage); + if(createResult == ncclSuccess) + resources.ranks[0].devCommCreated = true; +} + +// Per-test config notes: +// - These are single-process multi-GPU tests (ncclCommInitAll). All rank-to- +// rank bootstrap traffic is intra-host by construction, so we pin NCCL's +// bootstrap socket to loopback. This makes the tests self-contained on +// any host network configuration (single-NIC, multi-NIC, containers with +// shared host netns, etc.) without relying on the caller to set +// NCCL_SOCKET_IFNAME. +// - withNumGpus(N) declares how many physical GPUs the test occupies so +// ProcessIsolatedTestRunner can schedule it correctly under +// maxParallelJobs > 1. The positive test runs 2 ranks (one per GPU); the +// negative tests run 1 rank but still hipSetDevice / hipMalloc / call +// ncclCommInitAll on a real GPU, so they declare 1 slot too. +static ProcessIsolatedTestRunner::TestConfig makeDeviceApiEnabledConfig( + const std::string& name, std::function testFn +) +{ + return ProcessIsolatedTestRunner::TestConfig(name, testFn) + .withEnvironment({{"NCCL_CUMEM_ENABLE", "1"}, + {"NCCL_WIN_ENABLE", "1"}, + {"NCCL_SOCKET_IFNAME", "lo"}}) + .withTimeout(std::chrono::seconds(60)) + .withNumGpus(kPositiveRanks); +} + +// The negative configs additionally pin NCCL_IB_DISABLE=1: these are single- +// node 2-GPU tests that don't need IB/RDMA, and on the unsupported-config +// rejection path NCCL 2.30 otherwise enters the IB transport and can surface +// an environment-dependent ncclSystemError (ibv_create_qp) that masks the +// clean ncclInvalidUsage gating signal the test is asserting. +static ProcessIsolatedTestRunner::TestConfig makeCuMemDisabledConfig( + const std::string& name, std::function testFn +) +{ + return ProcessIsolatedTestRunner::TestConfig(name, testFn) + .withEnvironment({{"NCCL_CUMEM_ENABLE", "0"}, + {"NCCL_WIN_ENABLE", "1"}, + {"NCCL_SOCKET_IFNAME", "lo"}, + {"NCCL_IB_DISABLE", "1"}}) + .withTimeout(std::chrono::seconds(60)) + .withNumGpus(kNegativeRanks); +} + +static ProcessIsolatedTestRunner::TestConfig makeWinDisabledConfig( + const std::string& name, std::function testFn +) +{ + return ProcessIsolatedTestRunner::TestConfig(name, testFn) + .withEnvironment({{"NCCL_CUMEM_ENABLE", "1"}, + {"NCCL_WIN_ENABLE", "0"}, + {"NCCL_SOCKET_IFNAME", "lo"}, + {"NCCL_IB_DISABLE", "1"}}) + .withTimeout(std::chrono::seconds(60)) + .withNumGpus(kNegativeRanks); +} + +} // namespace + +TEST(DeviceApi, LsaRemoteRead) +{ + RUN_ISOLATED_TESTS( + makeDeviceApiEnabledConfig( + "DeviceApi.LsaRemoteRead", []() { runPositiveLsaRemoteReadTest(); } + ) + ); +} + +TEST(DeviceApi, CuMemDisabled) +{ + RUN_ISOLATED_TESTS( + makeCuMemDisabledConfig( + "DeviceApi.CuMemDisabled", []() { runDevCommCreateFailureTest(); } + ) + ); +} + +TEST(DeviceApi, WinDisabled) +{ + RUN_ISOLATED_TESTS( + makeWinDisabledConfig( + "DeviceApi.WinDisabled", []() { runDevCommCreateFailureTest(); } + ) + ); +} + +} // namespace RcclUnitTesting From 62c6e68a6d6bdf447b76676beb6630ebf444f72c Mon Sep 17 00:00:00 2001 From: surya periaswamy Date: Tue, 23 Jun 2026 18:08:48 +0000 Subject: [PATCH 2/2] [RCCL] [AICOMRCCL-598] test_runner: route DeviceApi suite to rccl-UnitTestsFixtures Add a DeviceApi entry under the unit_tests_fixtures block so the test_runner invokes the binary that contains these tests. --- .../scripts/test_runner/configs/mi300x_mellanox_ib.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/projects/rccl/tools/scripts/test_runner/configs/mi300x_mellanox_ib.json b/projects/rccl/tools/scripts/test_runner/configs/mi300x_mellanox_ib.json index eb05c8a511e..e8301c4fd19 100644 --- a/projects/rccl/tools/scripts/test_runner/configs/mi300x_mellanox_ib.json +++ b/projects/rccl/tools/scripts/test_runner/configs/mi300x_mellanox_ib.json @@ -469,6 +469,11 @@ "description": "loadPack/storePack three internal paths (direct cast, funnel-shift, element-by-element) and boundary", "test_filter": "PackRoundtripTest.LoadStorePack_*:PackRoundtripTest.L2_StorePackBoundary:PackRoundtripTest.Store16Load16Global_Direct" }, + { + "name": "DeviceApi", + "description": "Verify RCCL Device API: symmetric window registration and ncclDevCommCreate gating", + "test_filter": "DeviceApi.*" + }, { "name": "Fixtures_All", "description": "All release fixture tests"