NVIDIA
diff --git a/cpp/include/cuvs/cluster/kmeans.hpp b/cpp/include/cuvs/cluster/kmeans.hpp
Lines changed: 24 additions & 2 deletions
diff --git a/cpp/src/cluster/detail/kmeans.cuh b/cpp/src/cluster/detail/kmeans.cuh
Lines changed: 1 addition & 1 deletion
diff --git a/cpp/src/cluster/detail/kmeans_common.cuh b/cpp/src/cluster/detail/kmeans_common.cuh
Lines changed: 7 additions & 7 deletions
diff --git a/cpp/src/cluster/detail/kmeans_mg.cuh b/cpp/src/cluster/detail/kmeans_mg.cuh
Lines changed: 17 additions & 20 deletions
@@ -125,6 +125,11 @@ struct params : base_params {
    * When set to 0 (default) with host data uses `min(3 * n_clusters, n_samples)`
    * as a default.
    *
+   * In Batched multi-GPU host-data fits, the effective KMeansPlusPlus initialization
+   * sample is materialized on device on every rank. Every rank must have enough
+   * GPU memory for this sample, and rank 0 must also have enough GPU memory for
+   * the seeding workspace.
+   *
    * Default: 0.
    */
   int64_t init_size = 0;
@@ -134,6 +139,9 @@ struct params : base_params {
    * When set to 0, defaults to n_samples (process all at once).
    * Only used by the batched (host-data) code path and ignored by device-data
    * overloads.
+   *
+   * In multi-GPU mode, this is a per-rank batch size. Each rank processes up to
+   * this many local samples per batch, clamped to that rank's local sample count.
    * Default: 0 (process all data at once).
    */
   int64_t streaming_batch_size = 0;
@@ -177,7 +185,20 @@ enum class kmeans_type { KMeans = 0, KMeansBalanced = 1 };
  *
  * This overload supports out-of-core computation where the dataset resides
  * on the host. Data is processed in GPU-sized batches, streaming from host to device.
- * The batch size is controlled by params.streaming_batch_size.
+ * The batch size is controlled by params.streaming_batch_size. In multi-GPU mode,
+ * this is a per-rank batch size.
+ *
+ * Multi-GPU dispatch is selected automatically based on the handle state:
+ *   - If `raft::resource::is_multi_gpu(handle)` (cuVS SNMG): the full dataset X
+ *     is split across GPUs internally with an OpenMP parallel region and NCCL.
+ *   - If `raft::resource::comms_initialized(handle)` (Dask/Ray/MPI): X is treated as
+ *     this worker's partition, and RAFT communicators are used for collectives.
+ *   - Otherwise: single-GPU batched k-means.
+ *
+ * With `params.init == InitMethod::KMeansPlusPlus` in multi-GPU mode, the
+ * effective initialization sample must fit in GPU memory on every rank because
+ * it is materialized on every device. Rank 0 must also have enough GPU memory
+ * for the seeding workspace before centroids are broadcast.
  *
  * @code{.cpp}
  *   #include <raft/core/resources.hpp>
@@ -208,7 +229,8 @@ enum class kmeans_type { KMeans = 0, KMeansBalanced = 1 };
  *               raft::make_host_scalar_view(&n_iter));
  * @endcode
  *
- * @param[in]     handle        The raft handle.
+ * @param[in]     handle        The raft handle. When a multi-GPU resource is
+ *                              attached, multi-GPU dispatch is used automatically.
  * @param[in]     params        Parameters for KMeans model. Batch size is read from
  *                              params.streaming_batch_size.
  * @param[in]     X             Training instances on HOST memory. The data must
 
@@ -969,7 +969,7 @@ void kmeans_fit(
 
     if (iter_inertia < inertia[0]) {
       inertia[0] = iter_inertia;
-      n_iter[0]  = n_current_iter;
+      n_iter[0]  = std::min(n_current_iter, static_cast<IndexT>(iter_params.max_iter));
       raft::copy(centroids.data_handle(), cur_centroids_ptr, centroid_buf_size, stream);
     }
     RAFT_LOG_DEBUG("KMeans.fit after iteration-%d/%d: inertia - %f, n_iter - %d",
 
@@ -622,26 +622,26 @@ void compute_centroid_shift(raft::resources const& handle,
  * @brief Evaluate convergence criteria entirely on device.
  *
  * Checks the cost-ratio and centroid-shift stopping conditions and writes
- * a boolean result (0 or 1) into @p done_flag.  Also advances
- * @p prior_clustering_cost to the current cost for the next iteration.
+ * 0 or 1 into @p done_flag, and advances @p prior_clustering_cost.
+ * The flag type `FlagT` is deduced from @p done_flag (default `int`).
  */
-template <typename DataT>
+template <typename DataT, typename FlagT = int>
 __device__ void check_convergence(raft::device_scalar_view<const DataT> clustering_cost,
                                   raft::device_scalar_view<DataT> prior_clustering_cost,
                                   raft::device_scalar_view<const DataT> sqrd_norm_error,
                                   DataT tol,
                                   int n_iter,
-                                  raft::device_scalar_view<int> done_flag)
+                                  raft::device_scalar_view<FlagT> done_flag)
 {
   DataT cur_cost = *clustering_cost.data_handle();
   DataT norm_err = *sqrd_norm_error.data_handle();
-  int done       = 0;
+  FlagT done     = FlagT{0};
 
   if (cur_cost != DataT{0} && n_iter > 1) {
     DataT delta = cur_cost / *prior_clustering_cost.data_handle();
-    if (delta > DataT{1} - tol) done = 1;
+    if (delta > DataT{1} - tol) done = FlagT{1};
   }
-  if (norm_err < tol) done = 1;
+  if (norm_err < tol) done = FlagT{1};
 
   *prior_clustering_cost.data_handle() = cur_cost;
   *done_flag.data_handle()             = done;
 
@@ -463,35 +463,31 @@ void checkWeights(const raft::resources& handle,
                   raft::device_vector_view<DataT, IndexT> weight)
 {
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
-  rmm::device_scalar<DataT> wt_aggr(stream);
+  auto d_wt_sum       = raft::make_device_scalar<DataT>(handle, DataT{0});
 
   const auto& comm = raft::resource::get_comms(handle);
 
   auto n_samples = weight.extent(0);
   raft::linalg::mapThenSumReduce(
-    wt_aggr.data(), n_samples, raft::identity_op{}, stream, weight.data_handle());
+    d_wt_sum.data_handle(), n_samples, raft::identity_op{}, stream, weight.data_handle());
 
-  comm.allreduce<DataT>(wt_aggr.data(),  // sendbuff
-                        wt_aggr.data(),  // recvbuff
-                        1,               // count
+  comm.allreduce<DataT>(d_wt_sum.data_handle(),  // sendbuff
+                        d_wt_sum.data_handle(),  // recvbuff
+                        1,                       // count
                         raft::comms::op_t::SUM,
                         stream);
-  DataT wt_sum = wt_aggr.value(stream);
-  raft::resource::sync_stream(handle, stream);
-  RAFT_EXPECTS(wt_sum > DataT{0}, "invalid parameter (sum of sample weights must be positive)");
 
-  if (wt_sum != n_samples) {
-    CUVS_LOG_KMEANS(handle,
-                    "[Warning!] KMeans: normalizing the user provided sample weights to "
-                    "sum up to %d samples",
-                    n_samples);
-
-    raft::linalg::map(handle,
-                      weight,
-                      raft::compose_op(raft::mul_const_op<DataT>{static_cast<DataT>(n_samples)},
-                                       raft::div_const_op<DataT>{wt_sum}),
-                      raft::make_const_mdspan(weight));
-  }
+  // Scale each weight by this rank's n_samples / (all-reduced weight sum), reading
+  // the sum through a device pointer to avoid a host copy / stream sync. When that
+  // sum already equals n_samples this is a numerical no-op (matches single-GPU behavior).
+  const DataT* d_wt_sum_ptr = d_wt_sum.data_handle();
+  raft::linalg::map(
+    handle,
+    weight,
+    [n_samples, d_wt_sum_ptr] __device__(DataT w) {
+      return w * static_cast<DataT>(n_samples) / *d_wt_sum_ptr;
+    },
+    raft::make_const_mdspan(weight));
 }
 
 template <typename DataT, typename IndexT>
@@ -750,6 +746,7 @@ void fit(const raft::resources& handle,
       break;
     }
   }
+  n_iter[0] = std::min(n_iter[0], static_cast<IndexT>(params.max_iter));
 }
 
 };  // namespace cuvs::cluster::kmeans::mg::detail
Original file line number	Diff line number	Diff line change
`@@ -969,7 +969,7 @@ void kmeans_fit(`
`969`	`969`
`970`	`970`	`if (iter_inertia < inertia[0]) {`
`971`	`971`	`inertia[0] = iter_inertia;`
`972`		`- n_iter[0] = n_current_iter;`
	`972`	`+ n_iter[0] = std::min(n_current_iter, static_cast<IndexT>(iter_params.max_iter));`
`973`	`973`	`raft::copy(centroids.data_handle(), cur_centroids_ptr, centroid_buf_size, stream);`
`974`	`974`	`}`
`975`	`975`	`RAFT_LOG_DEBUG("KMeans.fit after iteration-%d/%d: inertia - %f, n_iter - %d",`