NVIDIA · GPUtester · Mar 25, 2026 · Mar 25, 2026
@@ -2080,7 +2080,7 @@ auto iterative_build_graph(
       curr_itopk_size = curr_topk + 32;
     }
 
-    RAFT_LOG_INFO(
+    RAFT_LOG_DEBUG(
       "# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk = %lu",
       (uint64_t)cagra_graph.extent(0),
       (double)cagra_graph.extent(0) / final_graph_size,
@@ -2146,7 +2146,7 @@ auto iterative_build_graph(
 
     auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
-    RAFT_LOG_INFO("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000);
+    RAFT_LOG_DEBUG("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000);
 
     if (flag_last) { break; }
     flag_last       = (curr_graph_size == final_graph_size);

@@ -4,24 +4,28 @@
  */
 #pragma once
 
-#include <raft/core/error.hpp>
-
 #include <atomic>
 #include <cstdint>
 #include <mutex>
+#include <raft/core/error.hpp>
 
 namespace cuvs::neighbors::detail {
 
 /**
  * @brief (Thread-)Safely invoke a kernel with a maximum dynamic shared memory size.
- * This is required because the sequence `cudaFuncSetAttribute` + kernel launch is not executed
- * atomically.
  *
- * Used this way, the cudaFuncAttributeMaxDynamicSharedMemorySize can only grow and thus
- * guarantees that the kernel is safe to launch.
+ * Maintains a monotonically growing high-water mark for
+ * `cudaFuncAttributeMaxDynamicSharedMemorySize`. When the kernel function pointer changes, the new
+ * kernel is brought up to the current high-water mark; when smem_size exceeds the high-water mark,
+ * it is grown for the current kernel. This guarantees every kernel's attribute is always >=
+ * smem_size at the time of launch.
+ *
+ * NB: cudaFuncSetAttribute is per kernel function pointer value, not per type. Multiple kernel
+ * template instantiations may share the same KernelT type (e.g. function pointers with the same
+ * signature), so we track the kernel identity alongside the smem high-water mark.
  *
  * @tparam KernelT The type of the kernel.
- * @tparam InvocationT The type of the invocation function.
+ * @tparam KernelLauncherT The type of the launch function/lambda.
  * @param kernel The kernel function address (for whom the smem-size is specified).
  * @param smem_size The size of the dynamic shared memory to be set.
  * @param launch The kernel launch function/lambda.
@@ -31,31 +35,38 @@ void safely_launch_kernel_with_smem_size(KernelT const& kernel,
                                          uint32_t smem_size,
                                          KernelLauncherT const& launch)
 {
-  // the last smem size is parameterized by the kernel thanks to the template parameter.
-  static std::atomic<uint32_t> current_smem_size{0};
-  auto last_smem_size = current_smem_size.load(std::memory_order_relaxed);
-  if (smem_size > last_smem_size) {
-    // We still need a mutex for the critical section: actualize last_smem_size and set the
-    // attribute.
-    static auto mutex = std::mutex{};
-    auto guard        = std::lock_guard<std::mutex>{mutex};
-    if (!current_smem_size.compare_exchange_strong(
-          last_smem_size, smem_size, std::memory_order_relaxed, std::memory_order_relaxed)) {
-      // The value has been updated by another thread between the load and the mutex acquisition.
-      if (smem_size > last_smem_size) {
-        current_smem_size.store(smem_size, std::memory_order_relaxed);
-      }
+  // last_smem_size is a monotonically growing high-water mark across all kernel pointers.
+  // last_kernel tracks which kernel pointer was last used.
+  static std::atomic<uint32_t> last_smem_size{0};
+  static std::atomic<KernelT> last_kernel{KernelT{}};
+  static std::mutex mutex;
+  // Fast path: skip the lock when the kernel matches and the smem size is within bounds.
+  // Load order matters: last_smem_size (acquire) before last_kernel (relaxed). Inside the lock
+  // we store in the opposite order: last_kernel (relaxed) then last_smem_size (release).
+  // This way an acquire load of last_smem_size that sees a post-cudaFuncSetAttribute value is
+  // guaranteed to also see the corresponding last_kernel.
+  if (smem_size > last_smem_size.load(std::memory_order_acquire) ||
+      kernel != last_kernel.load(std::memory_order_relaxed)) {
+    std::lock_guard<std::mutex> guard(mutex);
+    // Re-check under the lock: the outside decision can be stale.
+    uint32_t cur_smem_size = last_smem_size.load(std::memory_order_relaxed);
+    bool need_update       = (kernel != last_kernel.load(std::memory_order_relaxed));
+    if (smem_size > cur_smem_size) {
+      cur_smem_size = smem_size;
+      need_update   = true;
     }
-    // Only update if the last seen value is smaller than the new one.
-    if (smem_size > last_smem_size) {
+    if (need_update) {
       auto launch_status =
-        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
+        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, cur_smem_size);
       RAFT_EXPECTS(launch_status == cudaSuccess,
                    "Failed to set max dynamic shared memory size to %u bytes",
-                   smem_size);
+                   cur_smem_size);
+      // Store order matters: last_kernel before last_smem_size (release) so the fast-path
+      // acquire load of last_smem_size also publishes last_kernel.
+      last_kernel.store(kernel, std::memory_order_relaxed);
+      last_smem_size.store(cur_smem_size, std::memory_order_release);
     }
   }
-  // We don't need to guard the kernel launch because the smem_size can only grow.
   return launch(kernel);
 }