NVIDIA
diff --git a/‎cpp/CMakeLists.txt‎
Lines changed: 0 additions & 2 deletions b/‎cpp/CMakeLists.txt‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎cpp/src/cluster/detail/kmeans.cuh‎
Lines changed: 7 additions & 6 deletions b/‎cpp/src/cluster/detail/kmeans.cuh‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎cpp/src/cluster/detail/kmeans_balanced.cuh‎
Lines changed: 19 additions & 12 deletions b/‎cpp/src/cluster/detail/kmeans_balanced.cuh‎
Lines changed: 19 additions & 12 deletions
diff --git a/‎cpp/src/cluster/detail/kmeans_common.cuh‎
Lines changed: 26 additions & 0 deletions b/‎cpp/src/cluster/detail/kmeans_common.cuh‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎cpp/src/cluster/detail/minClusterDistanceCompute.cu‎
Lines changed: 45 additions & 18 deletions b/‎cpp/src/cluster/detail/minClusterDistanceCompute.cu‎
Lines changed: 45 additions & 18 deletions
@@ -1145,7 +1145,6 @@ if(NOT BUILD_CPU_ONLY)
                  CXX_STANDARD_REQUIRED ON
                  CUDA_STANDARD 20
                  CUDA_STANDARD_REQUIRED ON
-                 CUDA_RESOLVE_DEVICE_SYMBOLS ON
                  INTERFACE_POSITION_INDEPENDENT_CODE ON
                  POSITION_INDEPENDENT_CODE ON
     )
@@ -1202,7 +1201,6 @@ SECTIONS
                  CXX_STANDARD_REQUIRED ON
                  CUDA_STANDARD 20
                  CUDA_STANDARD_REQUIRED ON
-                 CUDA_RESOLVE_DEVICE_SYMBOLS ON
                  POSITION_INDEPENDENT_CODE ON
                  INTERFACE_POSITION_INDEPENDENT_CODE ON
                  EXPORT_NAME cuvs_static
 
@@ -694,14 +694,15 @@ void kmeans_fit(
 
   rmm::device_uvector<char> batch_workspace(streaming_batch_size, stream);
 
-  cuvs::spatial::knn::detail::utils::batch_load_iterator<DataT> data_batches(
-    X.data_handle(), n_samples, n_features, streaming_batch_size, stream);
+  auto data_batches = cuvs::spatial::knn::detail::utils::make_batch_load_iterator<DataT>(
+    handle, X.data_handle(), n_samples, n_features, streaming_batch_size, stream);
   // Host-path weight batches: only materialized when weights are provided and
   // the data resides on host
-  std::optional<cuvs::spatial::knn::detail::utils::batch_load_iterator<DataT>> weight_batches;
+  std::optional<cuvs::spatial::knn::detail::utils::batch_load_iterator_dyn<DataT>> weight_batches;
   if constexpr (!data_on_device) {
     if (weight_ptr != nullptr) {
-      weight_batches.emplace(weight_ptr, n_samples, 1, streaming_batch_size, stream);
+      weight_batches = cuvs::spatial::knn::detail::utils::make_batch_load_iterator<DataT>(
+        handle, weight_ptr, n_samples, IndexT{1}, streaming_batch_size, stream);
     } else {
       raft::matrix::fill(handle, batch_weights_buf.view(), DataT{1});
     }
@@ -833,7 +834,7 @@ void kmeans_fit(
         raft::make_device_matrix_view<DataT, IndexT>(new_centroids_ptr, n_clusters, n_features);
 
       data_batches.reset();
-      using wt_iter_t = cuvs::spatial::knn::detail::utils::batch_load_iterator<DataT>;
+      using wt_iter_t = cuvs::spatial::knn::detail::utils::batch_load_iterator_dyn<DataT>;
       std::optional<wt_iter_t> wt_it;
       if (weight_batches.has_value()) {
         weight_batches->reset();
@@ -932,7 +933,7 @@ void kmeans_fit(
 
       iter_inertia = DataT{0};
       data_batches.reset();
-      using wt_iter_t = cuvs::spatial::knn::detail::utils::batch_load_iterator<DataT>;
+      using wt_iter_t = cuvs::spatial::knn::detail::utils::batch_load_iterator_dyn<DataT>;
       std::optional<wt_iter_t> wt_it;
       if (weight_batches.has_value()) {
         weight_batches->reset();
 
@@ -16,6 +16,7 @@
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/gemm.cuh>
@@ -171,22 +172,28 @@ inline std::enable_if_t<std::is_floating_point_v<MathT>> predict_core(
  * @return A suggested minibatch size and the expected memory cost per-row (in bytes)
  */
 template <typename MathT, typename IdxT>
-constexpr auto calc_minibatch_size(IdxT n_clusters,
-                                   IdxT n_rows,
-                                   IdxT dim,
-                                   cuvs::distance::DistanceType metric,
-                                   bool needs_conversion) -> std::tuple<IdxT, size_t>
+auto calc_minibatch_size(const raft::resources& handle,
+                         IdxT n_clusters,
+                         IdxT n_rows,
+                         IdxT dim,
+                         cuvs::distance::DistanceType metric,
+                         bool needs_conversion) -> std::tuple<IdxT, size_t>
 {
   n_clusters = std::max<IdxT>(1, n_clusters);
 
   // Estimate memory needs per row (i.e element of the batch).
   size_t mem_per_row = 0;
   switch (metric) {
-    // fusedL2NN needs a mutex and a key-value pair for each row.
     case distance::DistanceType::L2Expanded:
     case distance::DistanceType::L2SqrtExpanded: {
-      mem_per_row += sizeof(int);
-      mem_per_row += sizeof(raft::KeyValuePair<IdxT, MathT>);
+      if (use_fused<MathT, IdxT, IdxT>(handle, n_rows, n_clusters, dim)) {
+        // fusedL2NN needs a mutex and a key-value pair for each row.
+        mem_per_row += sizeof(int);
+        mem_per_row += sizeof(raft::KeyValuePair<IdxT, MathT>);
+      } else {
+        // unfused path needs a full GEMM output (distance matrix row).
+        mem_per_row += sizeof(MathT) * n_clusters;
+      }
     } break;
     // Other metrics require storing a distance matrix.
     default: {
@@ -377,8 +384,8 @@ void predict(const raft::resources& handle,
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "predict(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
   auto mem_res = mr.value_or(raft::resource::get_workspace_resource_ref(handle));
-  auto [max_minibatch_size, _mem_per_row] =
-    calc_minibatch_size<MathT>(n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);
+  auto [max_minibatch_size, _mem_per_row] = calc_minibatch_size<MathT>(
+    handle, n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);
   rmm::device_uvector<MathT> cur_dataset(
     std::is_same_v<T, MathT> ? 0 : max_minibatch_size * dim, stream, mem_res);
   bool need_compute_norm =
@@ -989,8 +996,8 @@ void build_hierarchical(const raft::resources& handle,
   // TODO: Remove the explicit managed memory- we shouldn't be creating this on the user's behalf.
   rmm::mr::managed_memory_resource managed_memory;
   rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource_ref(handle);
-  auto [max_minibatch_size, mem_per_row] =
-    calc_minibatch_size<MathT>(n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);
+  auto [max_minibatch_size, mem_per_row]       = calc_minibatch_size<MathT>(
+    handle, n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);
 
   // Precompute the L2 norm of the dataset if relevant and not yet computed.
   rmm::device_uvector<MathT> dataset_norm_buf(0, stream, device_memory);
 
@@ -19,6 +19,7 @@
 #include <raft/core/memory_type.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/map.cuh>
@@ -56,6 +57,31 @@
 
 namespace cuvs::cluster::kmeans::detail {
 
+/**
+ * @brief Returns true if the fused distance NN implementation should be used.
+ *
+ * SM <= 8.x (Ampere and earlier): always fused.
+ * SM 9.x (Hopper): fused only when m >= 4096 or n >= 4096.
+ * SM >= 10.x (Blackwell and later): always unfused.
+ *
+ * @param handle resources used to look up the device properties
+ * @param m first problem size checked against the Hopper threshold
+ * @param n second problem size checked against the Hopper threshold
+ * @param k currently not used by the heuristic
+ */
+template <typename MathT, typename IdxT, typename LabelT>
+bool use_fused(const raft::resources& handle, IdxT m, IdxT n, IdxT k)
+{
+  // Bind by reference instead of default-constructing and then copying cudaDeviceProp.
+  const auto& prop = raft::resource::get_device_properties(handle);
+  // Ampere or before: always fused.
+  if (prop.major <= 8) { return true; }
+  // Hopper: fused only when m or n is at least 4096.
+  if (prop.major == 9) { return m >= 4096 || n >= 4096; }
+  // Blackwell onwards: unfused.
+  return false;
+}
+
 template <typename DataT, typename IndexT>
 struct SamplingOp {
   DataT* rnd;
 
@@ -4,6 +4,7 @@
  */
 
 #include "../../distance/fused_distance_nn.cuh"
+#include "../../distance/unfused_distance_nn.cuh"
 #include "kmeans_common.cuh"
 
 #include <raft/matrix/init.cuh>
@@ -50,24 +51,50 @@ void minClusterAndDistanceCompute(
     raft::KeyValuePair<IndexT, DataT> initial_value(0, std::numeric_limits<DataT>::max());
     raft::matrix::fill(handle, minClusterAndDistance, initial_value);
 
-    workspace.resize((sizeof(int)) * n_samples, stream);
-
-    cuvs::distance::fusedDistanceNNMinReduce<DataT, raft::KeyValuePair<IndexT, DataT>, IndexT>(
-      minClusterAndDistance.data_handle(),
-      X.data_handle(),
-      centroids.data_handle(),
-      L2NormX.data_handle(),
-      centroidsNorm.data_handle(),
-      n_samples,
-      n_clusters,
-      n_features,
-      (void*)workspace.data(),
-      metric != cuvs::distance::DistanceType::L2Expanded,
-      false,
-      true,
-      metric,
-      0.0f,
-      stream);
+    bool should_use_fused =
+      use_fused<DataT, IndexT, IndexT>(handle, n_samples, n_clusters, n_features);
+
+    if (should_use_fused) {
+      workspace.resize((sizeof(int)) * n_samples, stream);
+
+      cuvs::distance::fusedDistanceNNMinReduce<DataT, raft::KeyValuePair<IndexT, DataT>, IndexT>(
+        minClusterAndDistance.data_handle(),
+        X.data_handle(),
+        centroids.data_handle(),
+        L2NormX.data_handle(),
+        centroidsNorm.data_handle(),
+        n_samples,
+        n_clusters,
+        n_features,
+        (void*)workspace.data(),
+        metric != cuvs::distance::DistanceType::L2Expanded,
+        false,
+        true,
+        metric,
+        0.0f,
+        stream);
+    } else {
+      workspace.resize(sizeof(DataT) * n_samples * n_clusters, stream);
+
+      cuvs::distance::
+        unfusedDistanceNNMinReduce<DataT, DataT, raft::KeyValuePair<IndexT, DataT>, IndexT>(
+          handle,
+          minClusterAndDistance.data_handle(),
+          X.data_handle(),
+          centroids.data_handle(),
+          L2NormX.data_handle(),
+          centroidsNorm.data_handle(),
+          n_samples,
+          n_clusters,
+          n_features,
+          (void*)workspace.data(),
+          metric != cuvs::distance::DistanceType::L2Expanded,
+          false,
+          true,
+          metric,
+          0.0f,
+          stream);
+    }
   } else {
     auto dataBatchSize      = getDataBatchSize(batch_samples, n_samples);
     auto centroidsBatchSize = getCentroidsBatchSize(batch_centroids, n_clusters);