Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/src/neighbors/detail/cagra/cagra_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2080,7 +2080,7 @@ auto iterative_build_graph(
curr_itopk_size = curr_topk + 32;
}

RAFT_LOG_INFO(
RAFT_LOG_DEBUG(
"# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk = %lu",
(uint64_t)cagra_graph.extent(0),
(double)cagra_graph.extent(0) / final_graph_size,
Expand Down Expand Up @@ -2146,7 +2146,7 @@ auto iterative_build_graph(

auto end = std::chrono::high_resolution_clock::now();
auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
RAFT_LOG_INFO("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000);
RAFT_LOG_DEBUG("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000);

if (flag_last) { break; }
flag_last = (curr_graph_size == final_graph_size);
Expand Down
63 changes: 37 additions & 26 deletions cpp/src/neighbors/detail/smem_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,28 @@
*/
#pragma once

#include <raft/core/error.hpp>

#include <atomic>
#include <cstdint>
#include <mutex>
#include <raft/core/error.hpp>

namespace cuvs::neighbors::detail {

/**
* @brief (Thread-)Safely invoke a kernel with a maximum dynamic shared memory size.
* This is required because the sequence `cudaFuncSetAttribute` + kernel launch is not executed
* atomically.
*
* Used this way, the cudaFuncAttributeMaxDynamicSharedMemorySize can only grow and thus
* guarantees that the kernel is safe to launch.
* Maintains a monotonically growing high-water mark for
* `cudaFuncAttributeMaxDynamicSharedMemorySize`. When the kernel function pointer changes, the new
* kernel is brought up to the current high-water mark; when smem_size exceeds the high-water mark,
* it is grown for the current kernel. This guarantees every kernel's attribute is always >=
* smem_size at the time of launch.
*
* NB: cudaFuncSetAttribute is per kernel function pointer value, not per type. Multiple kernel
* template instantiations may share the same KernelT type (e.g. function pointers with the same
* signature), so we track the kernel identity alongside the smem high-water mark.
*
* @tparam KernelT The type of the kernel.
* @tparam InvocationT The type of the invocation function.
* @tparam KernelLauncherT The type of the launch function/lambda.
* @param kernel The kernel function address (for whom the smem-size is specified).
* @param smem_size The size of the dynamic shared memory to be set.
* @param launch The kernel launch function/lambda.
Expand All @@ -31,31 +35,38 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel,
uint32_t smem_size,
KernelLauncherT const& launch)
{
// the last smem size is parameterized by the kernel thanks to the template parameter.
static std::atomic<uint32_t> current_smem_size{0};
auto last_smem_size = current_smem_size.load(std::memory_order_relaxed);
if (smem_size > last_smem_size) {
// We still need a mutex for the critical section: actualize last_smem_size and set the
// attribute.
static auto mutex = std::mutex{};
auto guard = std::lock_guard<std::mutex>{mutex};
if (!current_smem_size.compare_exchange_strong(
last_smem_size, smem_size, std::memory_order_relaxed, std::memory_order_relaxed)) {
// The value has been updated by another thread between the load and the mutex acquisition.
if (smem_size > last_smem_size) {
current_smem_size.store(smem_size, std::memory_order_relaxed);
}
// last_smem_size is a monotonically growing high-water mark across all kernel pointers.
// last_kernel tracks which kernel pointer was last used.
static std::atomic<uint32_t> last_smem_size{0};
static std::atomic<KernelT> last_kernel{KernelT{}};
static std::mutex mutex;
// Fast path: skip the lock when the kernel matches and the smem size is within bounds.
// Load order matters: last_smem_size (acquire) before last_kernel (relaxed). Inside the lock
// we store in the opposite order: last_kernel (relaxed) then last_smem_size (release).
// This way an acquire load of last_smem_size that sees a post-cudaFuncSetAttribute value is
// guaranteed to also see the corresponding last_kernel (or one stored by a later update).
if (smem_size > last_smem_size.load(std::memory_order_acquire) ||
kernel != last_kernel.load(std::memory_order_relaxed)) {
std::lock_guard<std::mutex> guard(mutex);
// Re-check under the lock: the outside decision can be stale.
uint32_t cur_smem_size = last_smem_size.load(std::memory_order_relaxed);
bool need_update = (kernel != last_kernel.load(std::memory_order_relaxed));
if (smem_size > cur_smem_size) {
cur_smem_size = smem_size;
need_update = true;
}
// Only update if the last seen value is smaller than the new one.
if (smem_size > last_smem_size) {
if (need_update) {
auto launch_status =
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, cur_smem_size);
RAFT_EXPECTS(launch_status == cudaSuccess,
"Failed to set max dynamic shared memory size to %u bytes",
smem_size);
cur_smem_size);
// Store order matters: last_kernel (relaxed) before last_smem_size (release), so a fast-path
// acquire load that observes the new last_smem_size also observes this last_kernel.
last_kernel.store(kernel, std::memory_order_relaxed);
last_smem_size.store(cur_smem_size, std::memory_order_release);
}
}
// We don't need to guard the kernel launch because the smem_size can only grow.
return launch(kernel);
}

Expand Down
Loading