Skip to content

Commit dbd29a6

Browse files
[REVIEW] cuVS bench: Fix cudaFuncSetAttribute not being called when CAGRA search switches kernel variants (#1851)
Fix a bug in `safely_launch_kernel_with_smem_size` where `cudaFuncSetAttribute` was skipped for kernels that needed it. The function tracked the max shared memory in a single static variable per KernelT type, but `cudaFuncSetAttribute` applies per function pointer value — and the single-CTA CAGRA [search](https://github.com/rapidsai/cuvs/blob/d7a28aa1cb7648fa61037ed0459df0ec0e9db841/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh#L1373C4-L1375C78) dispatches multiple kernel instantiations that share the same pointer type. When one kernel bumped the tracked max, a different kernel whose smem fell between its own previous max and the global max would skip `cudaFuncSetAttribute`, causing `cudaErrorInvalidValue`. The fix tracks the kernel pointer identity alongside a monotonically growing smem high-water mark: when the pointer changes, the new kernel is brought up to the high-water mark; when smem exceeds it, the mark is grown. ## Error in question ```c++ $ CUVS_CAGRA_ANN_BENCH --search --data_prefix='<DATA_DIR>/' --benchmark_out_format=csv --benchmark_out=res_search_iter_cagra.csv --benchmark_counters_tabular=true --override_kv=dataset_memory_type:\"device\" <CONFIG_DIR>/laion_1M_cagra_iterative.json [I] [12:28:52.095261] Using the query file '<DATA_DIR>/laion_1M/queries.fbin' [I] [12:28:52.096141] Using the ground truth file '<DATA_DIR>/laion_1M/groundtruth.1M.neighbors.ibin' 2026-02-25T12:28:52+00:00 Running CUVS_CAGRA_ANN_BENCH Run on (224 X 800 MHz CPU s) CPU Caches: L1 Data 48 KiB (x112) L1 Instruction 32 KiB (x112) L2 Unified 2048 KiB (x112) L3 Unified 307200 KiB (x2) Load Average: 0.70, 0.44, 0.28 dataset: laion_1M dim: 768 distance: euclidean ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations GPU Latency Recall end_to_end 
items_per_second itopk k max_iterations n_queries refine_ratio search_width total_queries ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- cuvs_cagra_iterative/0/process_time/real_time 5.70 ms 5.70 ms 121 5.68808m 5.69994m 0.96424 0.689692 1.75441M/s 64 10 8 10k 1 2 1.21M dataset_memory_type="device" cuvs_cagra_iterative/1/process_time/real_time 5.70 ms 5.70 ms 121 5.6863m 5.69879m 0.96424 0.689553 1.75477M/s 64 10 8 10k 1 2 1.21M dataset_memory_type="device" cuvs_cagra_iterative/2/process_time/real_time 4.92 ms 4.92 ms 140 4.90351m 4.91567m 0.96046 0.688193 2.03432M/s 128 10 12 10k 1 1 1.4M dataset_memory_type="device" cuvs_cagra_iterative/3/process_time/real_time 5.99 ms 5.99 ms 115 5.97476m 5.98617m 0.97519 0.688409 1.67052M/s 128 10 16 10k 1 1 1.15M dataset_memory_type="device" cuvs_cagra_iterative/4/process_time/real_time 6.97 ms 6.97 ms 99 6.95873m 6.9703m 0.98129 0.690059 1.43466M/s 256 10 16 10k 1 1 990k dataset_memory_type="device" cuvs_cagra_iterative/5/process_time/real_time 10.5 ms 10.5 ms 66 0.010479 0.0104908 0.98548 0.692391 953.222k/s 512 10 10 10k 1 2 660k dataset_memory_type="device" ----------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------- cuvs_cagra_iterative/6/process_time/real_time ERROR OCCURRED: 'Benchmark loop: CUDA error encountered at: file=cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh line=2348: call='cudaPeekAtLastError()', Reason=cudaErrorInvalidValue:invalid argument Obtained 19 stack frames #1 in CUVS_CAGRA_ANN_BENCH: raft::cuda_error::cuda_error(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) #2 in libcuvs.so: void 
cuvs::neighbors::cagra::detail::single_cta_search::select_and_run<float, unsigned int, float, unsigned int, cuvs::neighbors::filtering::none_sample_filter>(...) #3 in libcuvs.so: cuvs::neighbors::cagra::detail::single_cta_search::search<float, unsigned int, float, cuvs::neighbors::filtering::none_sample_filter, unsigned int, long>::operator()(...) #4 in libcuvs.so(+0x18fd0f1) #5 in libcuvs.so: void cuvs::neighbors::cagra::search<float, unsigned int, long>(...) #6-#19 in CUVS_CAGRA_ANN_BENCH / libc.so.6 ' ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations GPU Latency Recall end_to_end items_per_second itopk k max_iterations n_queries refine_ratio search_width total_queries ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- cuvs_cagra_iterative/7/process_time/real_time 10.5 ms 10.5 ms 66 0.0105088 0.0105202 0.98663 0.694332 950.555k/s 32 10 32 10k 1 1 660k dataset_memory_type="device" cuvs_cagra_iterative/8/process_time/real_time 12.8 ms 12.8 ms 54 0.012796 0.0128079 0.98807 0.691628 780.768k/s 32 10 64 10k 1 1 540k dataset_memory_type="device" ----------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------- cuvs_cagra_iterative/9/process_time/real_time ERROR OCCURRED: 'Benchmark loop: CUDA error encountered at: file=cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh line=2348: call='cudaPeekAtLastError()', Reason=cudaErrorInvalidValue:invalid argument [same stack trace as above] ' 
cuvs_cagra_iterative/10/process_time/real_time ERROR OCCURRED: 'Benchmark loop: CUDA error encountered at: file=cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh line=2348: call='cudaPeekAtLastError()', Reason=cudaErrorInvalidValue:invalid argument [same stack trace as above] ' ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations GPU Latency Recall end_to_end items_per_second itopk k max_iterations n_queries refine_ratio search_width total_queries ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- cuvs_cagra_iterative/11/process_time/real_time 46.1 ms 46.2 ms 15 0.0461323 0.0461439 0.99131 0.692158 216.714k/s 256 10 10 10k 1 16 150k dataset_memory_type="device" cuvs_cagra_iterative/12/process_time/real_time 142 ms 142 ms 5 0.141713 0.141725 0.99198 0.708627 70.5591k/s 512 10 32 10k 1 16 50k dataset_memory_type="device" ``` ## Config ``` { "dataset": { "name": "laion_1M", "base_file": "laion_1M/base.1M.fbin", "subset_size": 1000000, "query_file": "laion_1M/queries.fbin", "groundtruth_neighbors_file": "laion_1M/groundtruth.1M.neighbors.ibin", "distance": "euclidean" }, "search_basic_param": { "batch_size": 10000, "k": 10 }, "index": [ { "name": "cuvs_cagra_iterative", "algo": "cuvs_cagra", "build_param": { "graph_degree": 64, "intermediate_graph_degree": 128, "search_width": 1 }, "file": "laion_1M/cagra/q_coarse_iterative.ibin", "search_params": [ {"itopk": 64, "search_width": 2, "max_iterations": 8, "refine_ratio": 1}, {"itopk": 64, "search_width": 2, "max_iterations": 8, "refine_ratio": 1}, {"itopk": 128, "search_width": 1, "max_iterations": 12, 
"refine_ratio": 1}, {"itopk": 128, "search_width": 1, "max_iterations": 16, "refine_ratio": 1}, {"itopk": 256, "search_width": 1, "max_iterations": 16, "refine_ratio": 1}, {"itopk": 512, "search_width": 2, "max_iterations": 10, "refine_ratio": 1}, {"itopk": 256, "search_width": 2, "max_iterations": 12, "refine_ratio": 1}, {"itopk": 32, "search_width": 1, "max_iterations": 32, "refine_ratio": 1}, {"itopk": 32, "search_width": 1, "max_iterations": 64, "refine_ratio": 1}, {"itopk": 192, "search_width": 4, "max_iterations": 12, "refine_ratio": 1}, {"itopk": 256, "search_width": 4, "max_iterations": 12, "refine_ratio": 1}, {"itopk": 256, "search_width": 16, "max_iterations": 10, "refine_ratio": 1}, {"itopk": 512, "search_width": 16, "max_iterations": 32, "refine_ratio": 1} ] } ] } ``` Authors: - https://github.com/irina-resh-nvda Approvers: - Artem M. Chirkin (https://github.com/achirkin) URL: #1851
1 parent aad241e commit dbd29a6

2 files changed

Lines changed: 39 additions & 28 deletions

File tree

cpp/src/neighbors/detail/cagra/cagra_build.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2080,7 +2080,7 @@ auto iterative_build_graph(
20802080
curr_itopk_size = curr_topk + 32;
20812081
}
20822082

2083-
RAFT_LOG_INFO(
2083+
RAFT_LOG_DEBUG(
20842084
"# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk = %lu",
20852085
(uint64_t)cagra_graph.extent(0),
20862086
(double)cagra_graph.extent(0) / final_graph_size,
@@ -2146,7 +2146,7 @@ auto iterative_build_graph(
21462146

21472147
auto end = std::chrono::high_resolution_clock::now();
21482148
auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
2149-
RAFT_LOG_INFO("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000);
2149+
RAFT_LOG_DEBUG("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000);
21502150

21512151
if (flag_last) { break; }
21522152
flag_last = (curr_graph_size == final_graph_size);

cpp/src/neighbors/detail/smem_utils.cuh

Lines changed: 37 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,24 +4,28 @@
44
*/
55
#pragma once
66

7-
#include <raft/core/error.hpp>
8-
97
#include <atomic>
108
#include <cstdint>
119
#include <mutex>
10+
#include <raft/core/error.hpp>
1211

1312
namespace cuvs::neighbors::detail {
1413

1514
/**
1615
* @brief (Thread-)Safely invoke a kernel with a maximum dynamic shared memory size.
17-
* This is required because the sequence `cudaFuncSetAttribute` + kernel launch is not executed
18-
* atomically.
1916
*
20-
* Used this way, the cudaFuncAttributeMaxDynamicSharedMemorySize can only grow and thus
21-
* guarantees that the kernel is safe to launch.
17+
* Maintains a monotonically growing high-water mark for
18+
* `cudaFuncAttributeMaxDynamicSharedMemorySize`. When the kernel function pointer changes, the new
19+
* kernel is brought up to the current high-water mark; when smem_size exceeds the high-water mark,
20+
* it is grown for the current kernel. This guarantees every kernel's attribute is always >=
21+
* smem_size at the time of launch.
22+
*
23+
* NB: cudaFuncSetAttribute is per kernel function pointer value, not per type. Multiple kernel
24+
* template instantiations may share the same KernelT type (e.g. function pointers with the same
25+
* signature), so we track the kernel identity alongside the smem high-water mark.
2226
*
2327
* @tparam KernelT The type of the kernel.
24-
* @tparam InvocationT The type of the invocation function.
28+
* @tparam KernelLauncherT The type of the launch function/lambda.
2529
* @param kernel The kernel function address (for which the smem-size is specified).
2630
* @param smem_size The size of the dynamic shared memory to be set.
2731
* @param launch The kernel launch function/lambda.
@@ -31,31 +35,38 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel,
3135
uint32_t smem_size,
3236
KernelLauncherT const& launch)
3337
{
34-
// the last smem size is parameterized by the kernel thanks to the template parameter.
35-
static std::atomic<uint32_t> current_smem_size{0};
36-
auto last_smem_size = current_smem_size.load(std::memory_order_relaxed);
37-
if (smem_size > last_smem_size) {
38-
// We still need a mutex for the critical section: actualize last_smem_size and set the
39-
// attribute.
40-
static auto mutex = std::mutex{};
41-
auto guard = std::lock_guard<std::mutex>{mutex};
42-
if (!current_smem_size.compare_exchange_strong(
43-
last_smem_size, smem_size, std::memory_order_relaxed, std::memory_order_relaxed)) {
44-
// The value has been updated by another thread between the load and the mutex acquisition.
45-
if (smem_size > last_smem_size) {
46-
current_smem_size.store(smem_size, std::memory_order_relaxed);
47-
}
38+
// last_smem_size is a monotonically growing high-water mark across all kernel pointers.
39+
// last_kernel tracks which kernel pointer was last used.
40+
static std::atomic<uint32_t> last_smem_size{0};
41+
static std::atomic<KernelT> last_kernel{KernelT{}};
42+
static std::mutex mutex;
43+
// Fast path: skip the lock when the kernel matches and the smem size is within bounds.
44+
// Load order matters: last_smem_size (acquire) before last_kernel (relaxed). Inside the lock
45+
// we store in the opposite order: last_kernel (relaxed) then last_smem_size (release).
46+
// This way an acquire load of last_smem_size that sees a post-cudaFuncSetAttribute value is
47+
// guaranteed to also see the corresponding last_kernel.
48+
if (smem_size > last_smem_size.load(std::memory_order_acquire) ||
49+
kernel != last_kernel.load(std::memory_order_relaxed)) {
50+
std::lock_guard<std::mutex> guard(mutex);
51+
// Re-check under the lock: the outside decision can be stale.
52+
uint32_t cur_smem_size = last_smem_size.load(std::memory_order_relaxed);
53+
bool need_update = (kernel != last_kernel.load(std::memory_order_relaxed));
54+
if (smem_size > cur_smem_size) {
55+
cur_smem_size = smem_size;
56+
need_update = true;
4857
}
49-
// Only update if the last seen value is smaller than the new one.
50-
if (smem_size > last_smem_size) {
58+
if (need_update) {
5159
auto launch_status =
52-
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
60+
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, cur_smem_size);
5361
RAFT_EXPECTS(launch_status == cudaSuccess,
5462
"Failed to set max dynamic shared memory size to %u bytes",
55-
smem_size);
63+
cur_smem_size);
64+
// Store order matters: last_kernel before last_smem_size (release) so the fast-path
65+
// acquire load of last_smem_size also publishes last_kernel.
66+
last_kernel.store(kernel, std::memory_order_relaxed);
67+
last_smem_size.store(cur_smem_size, std::memory_order_release);
5668
}
5769
}
58-
// We don't need to guard the kernel launch because the smem_size can only grow.
5970
return launch(kernel);
6071
}
6172

0 commit comments

Comments
 (0)