[REVIEW] cuVS bench: Fix cudaFuncSetAttribute not being called when CAGRA search switches kernel variants (#1851)

irina-resh-nvda · web-flow · commit dbd29a645b90 · 2026-03-25T15:26:00.000Z
Fix a bug in `safely_launch_kernel_with_smem_size` where `cudaFuncSetAttribute` was skipped for kernels that needed it. The function tracked the max shared memory in a single static variable per KernelT type, but `cudaFuncSetAttribute` applies per function pointer value — and the single-CTA CAGRA [search](https://github.com/rapidsai/cuvs/blob/d7a28aa1cb7648fa61037ed0459df0ec0e9db841/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh#L1373C4-L1375C78) dispatches multiple kernel instantiations that share the same pointer type. When one kernel bumped the tracked max, a different kernel whose smem fell between its own previous max and the global max would skip `cudaFuncSetAttribute`, causing `cudaErrorInvalidValue`. The fix tracks the kernel pointer identity alongside a monotonically growing smem high-water mark: when the pointer changes, the new kernel is brought up to the high-water mark; when smem exceeds it, the mark is grown. ## Error in question ```c++ $ CUVS_CAGRA_ANN_BENCH --search --data_prefix='<DATA_DIR>/' --benchmark_out_format=csv --benchmark_out=res_search_iter_cagra.csv --benchmark_counters_tabular=true --override_kv=dataset_memory_type:\"device\" <CONFIG_DIR>/laion_1M_cagra_iterative.json [I] [12:28:52.095261] Using the query file '<DATA_DIR>/laion_1M/queries.fbin' [I] [12:28:52.096141] Using the ground truth file '<DATA_DIR>/laion_1M/groundtruth.1M.neighbors.ibin' 2026-02-25T12:28:52+00:00 Running CUVS_CAGRA_ANN_BENCH Run on (224 X 800 MHz CPU s) CPU Caches: L1 Data 48 KiB (x112) L1 Instruction 32 KiB (x112) L2 Unified 2048 KiB (x112) L3 Unified 307200 KiB (x2) Load Average: 0.70, 0.44, 0.28 dataset: laion_1M dim: 768 distance: euclidean ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations GPU Latency Recall end_to_end 
items_per_second itopk k max_iterations n_queries refine_ratio search_width total_queries ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- cuvs_cagra_iterative/0/process_time/real_time 5.70 ms 5.70 ms 121 5.68808m 5.69994m 0.96424 0.689692 1.75441M/s 64 10 8 10k 1 2 1.21M dataset_memory_type="device" cuvs_cagra_iterative/1/process_time/real_time 5.70 ms 5.70 ms 121 5.6863m 5.69879m 0.96424 0.689553 1.75477M/s 64 10 8 10k 1 2 1.21M dataset_memory_type="device" cuvs_cagra_iterative/2/process_time/real_time 4.92 ms 4.92 ms 140 4.90351m 4.91567m 0.96046 0.688193 2.03432M/s 128 10 12 10k 1 1 1.4M dataset_memory_type="device" cuvs_cagra_iterative/3/process_time/real_time 5.99 ms 5.99 ms 115 5.97476m 5.98617m 0.97519 0.688409 1.67052M/s 128 10 16 10k 1 1 1.15M dataset_memory_type="device" cuvs_cagra_iterative/4/process_time/real_time 6.97 ms 6.97 ms 99 6.95873m 6.9703m 0.98129 0.690059 1.43466M/s 256 10 16 10k 1 1 990k dataset_memory_type="device" cuvs_cagra_iterative/5/process_time/real_time 10.5 ms 10.5 ms 66 0.010479 0.0104908 0.98548 0.692391 953.222k/s 512 10 10 10k 1 2 660k dataset_memory_type="device" ----------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------- cuvs_cagra_iterative/6/process_time/real_time ERROR OCCURRED: 'Benchmark loop: CUDA error encountered at: file=cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh line=2348: call='cudaPeekAtLastError()', Reason=cudaErrorInvalidValue:invalid argument Obtained 19 stack frames #1 in CUVS_CAGRA_ANN_BENCH: raft::cuda_error::cuda_error(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) #2 in libcuvs.so: void 
cuvs::neighbors::cagra::detail::single_cta_search::select_and_run<float, unsigned int, float, unsigned int, cuvs::neighbors::filtering::none_sample_filter>(...) #3 in libcuvs.so: cuvs::neighbors::cagra::detail::single_cta_search::search<float, unsigned int, float, cuvs::neighbors::filtering::none_sample_filter, unsigned int, long>::operator()(...) #4 in libcuvs.so(+0x18fd0f1) #5 in libcuvs.so: void cuvs::neighbors::cagra::search<float, unsigned int, long>(...) #6-#19 in CUVS_CAGRA_ANN_BENCH / libc.so.6 ' ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations GPU Latency Recall end_to_end items_per_second itopk k max_iterations n_queries refine_ratio search_width total_queries ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- cuvs_cagra_iterative/7/process_time/real_time 10.5 ms 10.5 ms 66 0.0105088 0.0105202 0.98663 0.694332 950.555k/s 32 10 32 10k 1 1 660k dataset_memory_type="device" cuvs_cagra_iterative/8/process_time/real_time 12.8 ms 12.8 ms 54 0.012796 0.0128079 0.98807 0.691628 780.768k/s 32 10 64 10k 1 1 540k dataset_memory_type="device" ----------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------- cuvs_cagra_iterative/9/process_time/real_time ERROR OCCURRED: 'Benchmark loop: CUDA error encountered at: file=cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh line=2348: call='cudaPeekAtLastError()', Reason=cudaErrorInvalidValue:invalid argument [same stack trace as above] ' 
cuvs_cagra_iterative/10/process_time/real_time ERROR OCCURRED: 'Benchmark loop: CUDA error encountered at: file=cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh line=2348: call='cudaPeekAtLastError()', Reason=cudaErrorInvalidValue:invalid argument [same stack trace as above] ' ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations GPU Latency Recall end_to_end items_per_second itopk k max_iterations n_queries refine_ratio search_width total_queries ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- cuvs_cagra_iterative/11/process_time/real_time 46.1 ms 46.2 ms 15 0.0461323 0.0461439 0.99131 0.692158 216.714k/s 256 10 10 10k 1 16 150k dataset_memory_type="device" cuvs_cagra_iterative/12/process_time/real_time 142 ms 142 ms 5 0.141713 0.141725 0.99198 0.708627 70.5591k/s 512 10 32 10k 1 16 50k dataset_memory_type="device" ``` ## Config ``` { "dataset": { "name": "laion_1M", "base_file": "laion_1M/base.1M.fbin", "subset_size": 1000000, "query_file": "laion_1M/queries.fbin", "groundtruth_neighbors_file": "laion_1M/groundtruth.1M.neighbors.ibin", "distance": "euclidean" }, "search_basic_param": { "batch_size": 10000, "k": 10 }, "index": [ { "name": "cuvs_cagra_iterative", "algo": "cuvs_cagra", "build_param": { "graph_degree": 64, "intermediate_graph_degree": 128, "search_width": 1 }, "file": "laion_1M/cagra/q_coarse_iterative.ibin", "search_params": [ {"itopk": 64, "search_width": 2, "max_iterations": 8, "refine_ratio": 1}, {"itopk": 64, "search_width": 2, "max_iterations": 8, "refine_ratio": 1}, {"itopk": 128, "search_width": 1, "max_iterations": 12, 
"refine_ratio": 1}, {"itopk": 128, "search_width": 1, "max_iterations": 16, "refine_ratio": 1}, {"itopk": 256, "search_width": 1, "max_iterations": 16, "refine_ratio": 1}, {"itopk": 512, "search_width": 2, "max_iterations": 10, "refine_ratio": 1}, {"itopk": 256, "search_width": 2, "max_iterations": 12, "refine_ratio": 1}, {"itopk": 32, "search_width": 1, "max_iterations": 32, "refine_ratio": 1}, {"itopk": 32, "search_width": 1, "max_iterations": 64, "refine_ratio": 1}, {"itopk": 192, "search_width": 4, "max_iterations": 12, "refine_ratio": 1}, {"itopk": 256, "search_width": 4, "max_iterations": 12, "refine_ratio": 1}, {"itopk": 256, "search_width": 16, "max_iterations": 10, "refine_ratio": 1}, {"itopk": 512, "search_width": 16, "max_iterations": 32, "refine_ratio": 1} ] } ] } ``` Authors: - https://github.com/irina-resh-nvda Approvers: - Artem M. Chirkin (https://github.com/achirkin) URL: #1851
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -2080,7 +2080,7 @@ auto iterative_build_graph(
       curr_itopk_size = curr_topk + 32;
     }
 
-    RAFT_LOG_INFO(
+    RAFT_LOG_DEBUG(
       "# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk = %lu",
       (uint64_t)cagra_graph.extent(0),
       (double)cagra_graph.extent(0) / final_graph_size,
@@ -2146,7 +2146,7 @@ auto iterative_build_graph(
 
     auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
-    RAFT_LOG_INFO("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000);
+    RAFT_LOG_DEBUG("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000);
 
     if (flag_last) { break; }
     flag_last       = (curr_graph_size == final_graph_size);
diff --git a/cpp/src/neighbors/detail/smem_utils.cuh b/cpp/src/neighbors/detail/smem_utils.cuh
@@ -4,24 +4,28 @@
  */
 #pragma once
 
-#include <raft/core/error.hpp>
-
 #include <atomic>
 #include <cstdint>
 #include <mutex>
+#include <raft/core/error.hpp>
 
 namespace cuvs::neighbors::detail {
 
 /**
  * @brief (Thread-)Safely invoke a kernel with a maximum dynamic shared memory size.
- * This is required because the sequence `cudaFuncSetAttribute` + kernel launch is not executed
- * atomically.
  *
- * Used this way, the cudaFuncAttributeMaxDynamicSharedMemorySize can only grow and thus
- * guarantees that the kernel is safe to launch.
+ * Maintains a monotonically growing high-water mark for
+ * `cudaFuncAttributeMaxDynamicSharedMemorySize`. When the kernel function pointer changes, the new
+ * kernel is brought up to the current high-water mark; when smem_size exceeds the high-water mark,
+ * the mark is raised to smem_size and applied to the current kernel. This guarantees that the
+ * attribute of the kernel being launched is always >= smem_size at the time of launch.
+ *
+ * NB: cudaFuncSetAttribute is per kernel function pointer value, not per type. Multiple kernel
+ * template instantiations may share the same KernelT type (e.g. function pointers with the same
+ * signature), so we track the kernel identity alongside the smem high-water mark.
  *
  * @tparam KernelT The type of the kernel.
- * @tparam InvocationT The type of the invocation function.
+ * @tparam KernelLauncherT The type of the launch function/lambda.
  * @param kernel The kernel function address (for whom the smem-size is specified).
  * @param smem_size The size of the dynamic shared memory to be set.
  * @param launch The kernel launch function/lambda.
@@ -31,31 +35,38 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel,
                                          uint32_t smem_size,
                                          KernelLauncherT const& launch)
 {
-  // the last smem size is parameterized by the kernel thanks to the template parameter.
-  static std::atomic<uint32_t> current_smem_size{0};
-  auto last_smem_size = current_smem_size.load(std::memory_order_relaxed);
-  if (smem_size > last_smem_size) {
-    // We still need a mutex for the critical section: actualize last_smem_size and set the
-    // attribute.
-    static auto mutex = std::mutex{};
-    auto guard        = std::lock_guard<std::mutex>{mutex};
-    if (!current_smem_size.compare_exchange_strong(
-          last_smem_size, smem_size, std::memory_order_relaxed, std::memory_order_relaxed)) {
-      // The value has been updated by another thread between the load and the mutex acquisition.
-      if (smem_size > last_smem_size) {
-        current_smem_size.store(smem_size, std::memory_order_relaxed);
-      }
+  // last_smem_size is a monotonically growing high-water mark across all kernel pointers.
+  // last_kernel tracks which kernel pointer was last used.
+  static std::atomic<uint32_t> last_smem_size{0};
+  static std::atomic<KernelT> last_kernel{KernelT{}};
+  static std::mutex mutex;
+  // Fast path: skip the lock when the kernel matches and the smem size is within bounds.
+  // Load order matters: last_smem_size (acquire) before last_kernel (relaxed). Inside the lock
+  // we store in the opposite order: last_kernel (relaxed) then last_smem_size (release).
+  // This way an acquire load of last_smem_size that sees a post-cudaFuncSetAttribute value is
+  // guaranteed to also see the corresponding last_kernel.
+  if (smem_size > last_smem_size.load(std::memory_order_acquire) ||
+      kernel != last_kernel.load(std::memory_order_relaxed)) {
+    std::lock_guard<std::mutex> guard(mutex);
+    // Re-check under the lock: the outside decision can be stale.
+    uint32_t cur_smem_size = last_smem_size.load(std::memory_order_relaxed);
+    bool need_update       = (kernel != last_kernel.load(std::memory_order_relaxed));
+    if (smem_size > cur_smem_size) {
+      cur_smem_size = smem_size;
+      need_update   = true;
     }
-    // Only update if the last seen value is smaller than the new one.
-    if (smem_size > last_smem_size) {
+    if (need_update) {
       auto launch_status =
-        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
+        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, cur_smem_size);
       RAFT_EXPECTS(launch_status == cudaSuccess,
                    "Failed to set max dynamic shared memory size to %u bytes",
-                   smem_size);
+                   cur_smem_size);
+      // Store order matters: last_kernel (relaxed) before last_smem_size (release), so that a
+      // fast-path acquire load observing the new last_smem_size also observes this last_kernel.
+      last_kernel.store(kernel, std::memory_order_relaxed);
+      last_smem_size.store(cur_smem_size, std::memory_order_release);
     }
   }
-  // We don't need to guard the kernel launch because the smem_size can only grow.
   return launch(kernel);
 }