diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 637b9276c0..f433a4513a 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -2080,7 +2080,7 @@ auto iterative_build_graph( curr_itopk_size = curr_topk + 32; } - RAFT_LOG_INFO( + RAFT_LOG_DEBUG( "# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk = %lu", (uint64_t)cagra_graph.extent(0), (double)cagra_graph.extent(0) / final_graph_size, @@ -2146,7 +2146,7 @@ auto iterative_build_graph( auto end = std::chrono::high_resolution_clock::now(); auto elapsed_ms = std::chrono::duration_cast(end - start).count(); - RAFT_LOG_INFO("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000); + RAFT_LOG_DEBUG("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000); if (flag_last) { break; } flag_last = (curr_graph_size == final_graph_size); diff --git a/cpp/src/neighbors/detail/smem_utils.cuh b/cpp/src/neighbors/detail/smem_utils.cuh index 41c95c0ccd..73bc8c578d 100644 --- a/cpp/src/neighbors/detail/smem_utils.cuh +++ b/cpp/src/neighbors/detail/smem_utils.cuh @@ -4,24 +4,28 @@ */ #pragma once -#include - #include #include #include +#include namespace cuvs::neighbors::detail { /** * @brief (Thread-)Safely invoke a kernel with a maximum dynamic shared memory size. - * This is required because the sequence `cudaFuncSetAttribute` + kernel launch is not executed - * atomically. * - * Used this way, the cudaFuncAttributeMaxDynamicSharedMemorySize can only grow and thus - * guarantees that the kernel is safe to launch. + * Maintains a monotonically growing high-water mark for + * `cudaFuncAttributeMaxDynamicSharedMemorySize`. When the kernel function pointer changes, the new + * kernel is brought up to the current high-water mark; when smem_size exceeds the high-water mark, + * it is grown for the current kernel. This guarantees every kernel's attribute is always >= + * smem_size at the time of launch. + * + * NB: cudaFuncSetAttribute is per kernel function pointer value, not per type. Multiple kernel + * template instantiations may share the same KernelT type (e.g. function pointers with the same + * signature), so we track the kernel identity alongside the smem high-water mark. * * @tparam KernelT The type of the kernel. - * @tparam InvocationT The type of the invocation function. + * @tparam KernelLauncherT The type of the launch function/lambda. * @param kernel The kernel function address (for whom the smem-size is specified). * @param smem_size The size of the dynamic shared memory to be set. * @param launch The kernel launch function/lambda. @@ -31,31 +35,38 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel, uint32_t smem_size, KernelLauncherT const& launch) { - // the last smem size is parameterized by the kernel thanks to the template parameter. - static std::atomic current_smem_size{0}; - auto last_smem_size = current_smem_size.load(std::memory_order_relaxed); - if (smem_size > last_smem_size) { - // We still need a mutex for the critical section: actualize last_smem_size and set the - // attribute. - static auto mutex = std::mutex{}; - auto guard = std::lock_guard{mutex}; - if (!current_smem_size.compare_exchange_strong( - last_smem_size, smem_size, std::memory_order_relaxed, std::memory_order_relaxed)) { - // The value has been updated by another thread between the load and the mutex acquisition. - if (smem_size > last_smem_size) { - current_smem_size.store(smem_size, std::memory_order_relaxed); - } + // last_smem_size is a monotonically growing high-water mark across all kernel pointers. + // last_kernel tracks which kernel pointer was last used. + static std::atomic last_smem_size{0}; + static std::atomic last_kernel{KernelT{}}; + static std::mutex mutex; + // Fast path: skip the lock when the kernel matches and the smem size is within bounds. + // Load order matters: last_smem_size (acquire) before last_kernel (relaxed). Inside the lock + // we store in the opposite order: last_kernel (relaxed) then last_smem_size (release). + // This way an acquire load of last_smem_size that sees a post-cudaFuncSetAttribute value is + // guaranteed to also see the corresponding last_kernel. + if (smem_size > last_smem_size.load(std::memory_order_acquire) || + kernel != last_kernel.load(std::memory_order_relaxed)) { + std::lock_guard guard(mutex); + // Re-check under the lock: the outside decision can be stale. + uint32_t cur_smem_size = last_smem_size.load(std::memory_order_relaxed); + bool need_update = (kernel != last_kernel.load(std::memory_order_relaxed)); + if (smem_size > cur_smem_size) { + cur_smem_size = smem_size; + need_update = true; } - // Only update if the last seen value is smaller than the new one. - if (smem_size > last_smem_size) { + if (need_update) { auto launch_status = - cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, cur_smem_size); RAFT_EXPECTS(launch_status == cudaSuccess, "Failed to set max dynamic shared memory size to %u bytes", - smem_size); + cur_smem_size); + // Store order matters: last_kernel before last_smem_size (release) so the fast-path + // acquire load of last_smem_size also publishes last_kernel. + last_kernel.store(kernel, std::memory_order_relaxed); + last_smem_size.store(cur_smem_size, std::memory_order_release); } } - // We don't need to guard the kernel launch because the smem_size can only grow. return launch(kernel); }