Replace use_fp16_dist_comp with internal_distance_dtype to match the IVF-PQ parameter style

jinsolp · jinsolp · commit 1c7589fce9ca · 2026-05-13T21:27:34.000Z
diff --git a/c/include/cuvs/neighbors/nn_descent.h b/c/include/cuvs/neighbors/nn_descent.h
@@ -25,7 +25,7 @@ extern "C" {
  * performance and memory usage.
  * - `NND_DIST_COMP_FP16`: Use fp16 distance computation.
  *
- * @deprecated To be removed in 26.08. Use cuvsNNDescentIndexParams_v6 with use_fp16_dist_comp
+ * @deprecated To be removed in 26.08. Use cuvsNNDescentIndexParams_v6 with internal_distance_dtype
  * instead.
  */
 typedef enum {
@@ -91,10 +91,11 @@ typedef struct cuvsNNDescentIndexParams* cuvsNNDescentIndexParams_t;
  * the graph for. More iterations produce a better quality graph at cost of performance
  * `termination_threshold`: The delta at which nn-descent will terminate its iterations
  * `return_distances`: Boolean to decide whether to return distances array
- * `use_fp16_dist_comp`: When true and the input data is fp32, distance computation is performed
- * in fp16 for better performance and lower memory usage at the cost of precision. This requires
- * copying the fp32 input to an internal fp16 buffer on the device. Has no effect on non-fp32
- * input types (fp16, int8, uint8) which always use fp16 distance computation.
+ * `internal_distance_dtype`: Only applicable for fp32 input. Controls the precision used to
+ * compute distances. Possible values: [CUDA_R_32F, CUDA_R_16F]. Defaults to CUDA_R_32F. Set to
+ * CUDA_R_16F to compute distances in fp16 (faster, uses less device memory; not recommended for
+ * dim <= 16 due to precision loss). Has no effect on non-fp32 input types (fp16, int8, uint8)
+ * which always compute distances in fp16.
  *
  * @since 26.06
  */
@@ -106,7 +107,7 @@ struct cuvsNNDescentIndexParams_v6 {
   size_t max_iterations;
   float termination_threshold;
   bool return_distances;
-  bool use_fp16_dist_comp;
+  cudaDataType_t internal_distance_dtype;
 };
 
 typedef struct cuvsNNDescentIndexParams_v6* cuvsNNDescentIndexParams_v6_t;
diff --git a/c/src/neighbors/nn_descent.cpp b/c/src/neighbors/nn_descent.cpp
@@ -64,7 +64,8 @@ cuvs::neighbors::nn_descent::index_params convert_params(cuvsNNDescentIndexParam
   build_params.max_iterations            = params.max_iterations;
   build_params.termination_threshold     = params.termination_threshold;
   build_params.return_distances          = params.return_distances;
-  build_params.use_fp16_dist_comp        = (params.dist_comp_dtype == NND_DIST_COMP_FP16);
+  build_params.internal_distance_dtype =
+    (params.dist_comp_dtype == NND_DIST_COMP_FP16) ? CUDA_R_16F : CUDA_R_32F;
   return build_params;
 }
 
@@ -79,7 +80,7 @@ cuvs::neighbors::nn_descent::index_params convert_params_v6(
   build_params.max_iterations            = params.max_iterations;
   build_params.termination_threshold     = params.termination_threshold;
   build_params.return_distances          = params.return_distances;
-  build_params.use_fp16_dist_comp        = params.use_fp16_dist_comp;
+  build_params.internal_distance_dtype   = params.internal_distance_dtype;
   return build_params;
 }
 
@@ -219,8 +220,9 @@ extern "C" cuvsError_t cuvsNNDescentIndexParamsCreate(cuvsNNDescentIndexParams_t
       .max_iterations            = cpp_params.max_iterations,
       .termination_threshold     = cpp_params.termination_threshold,
       .return_distances          = cpp_params.return_distances,
-      .dist_comp_dtype           = cpp_params.use_fp16_dist_comp ? NND_DIST_COMP_FP16
-                                                                 : NND_DIST_COMP_AUTO};
+      .dist_comp_dtype           = cpp_params.internal_distance_dtype == CUDA_R_16F
+                                     ? NND_DIST_COMP_FP16
+                                     : NND_DIST_COMP_AUTO};
   });
 }
 
@@ -238,7 +240,7 @@ extern "C" cuvsError_t cuvsNNDescentIndexParamsCreate_v6(cuvsNNDescentIndexParam
       .max_iterations            = cpp_params.max_iterations,
       .termination_threshold     = cpp_params.termination_threshold,
       .return_distances          = cpp_params.return_distances,
-      .use_fp16_dist_comp        = cpp_params.use_fp16_dist_comp};
+      .internal_distance_dtype   = cpp_params.internal_distance_dtype};
   });
 }
 
diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp
@@ -40,18 +40,19 @@ namespace nn_descent {
  * the graph for. More iterations produce a better quality graph at cost of performance
  * - `termination_threshold`: The delta at which nn-descent will terminate its iterations
  * - `return_distances`: Boolean to decide whether to return distances array
- * - `use_fp16_dist_comp`: When true and the input data is fp32, distance computation is
- * performed in fp16 for better performance and lower memory usage at the cost of precision.
- * This requires copying the fp32 input to an internal fp16 buffer on the device. Has no effect
- * on non-fp32 input types (fp16, int8, uint8) which always use fp16 distance computation.
+ * - `internal_distance_dtype`: Only applicable for fp32 input. Controls the precision used to
+ * compute distances. Possible values: [CUDA_R_32F, CUDA_R_16F]. Defaults to CUDA_R_32F. Set to
+ * CUDA_R_16F to compute distances in fp16 (faster, uses less device memory; not recommended for
+ * dim <= 16 due to precision loss). Has no effect on non-fp32 input types (fp16, int8, uint8)
+ * which always compute distances in fp16.
  */
 struct index_params : cuvs::neighbors::index_params {
-  size_t graph_degree              = 64;
-  size_t intermediate_graph_degree = 128;
-  size_t max_iterations            = 20;
-  float termination_threshold      = 0.0001;
-  bool return_distances            = true;
-  bool use_fp16_dist_comp          = false;
+  size_t graph_degree                    = 64;
+  size_t intermediate_graph_degree       = 128;
+  size_t max_iterations                  = 20;
+  float termination_threshold            = 0.0001;
+  bool return_distances                  = true;
+  cudaDataType_t internal_distance_dtype = CUDA_R_32F;
 
   /** @brief Construct NN descent parameters for a specific kNN graph degree
    *
diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh
@@ -859,7 +859,7 @@ __launch_bounds__(BLOCK_SIZE)
 // MAX_RESIDENT_THREAD_PER_SM = BLOCK_SIZE * BLOCKS_PER_SM = 2048
 // For architectures 750 and 860 (890), the values for MAX_RESIDENT_THREAD_PER_SM
 // is 1024 and 1536 respectively, which means the bounds don't work anymore
-// Used for fp32 data compressed to fp16, and all types using non-L1 distance metric.
+// Used for fp32 data downcast to fp16, and all types using non-L1 distance metric.
 template <typename Data_t,
           typename Index_t,
           typename ID_t = InternalID_t<Index_t>,
@@ -1373,11 +1373,11 @@ GNND<Data_t, Index_t>::GNND(raft::resources const& res, const BuildConfig& build
   static_assert(NUM_SAMPLES <= 32);
 
   using input_t = typename std::remove_const<Data_t>::type;
-  if (build_config.use_fp16_dist_comp && build_config.dataset_dim <= 16 &&
+  if (build_config.internal_distance_dtype == CUDA_R_16F && build_config.dataset_dim <= 16 &&
       std::is_same_v<input_t, float>) {
     RAFT_LOG_WARN(
       "Using fp16 for distance computation for data in fp32 with small dimensions (%zu) <= 16 may "
-      "result in low quality results. Consider setting use_fp16_dist_comp = false.",
+      "result in low quality results. Consider setting internal_distance_dtype = CUDA_R_32F.",
       build_config.dataset_dim);
   }
 
@@ -1431,14 +1431,17 @@ void GNND<Data_t, Index_t>::local_join(cudaStream_t stream, DistEpilogue_t dist_
 {
   raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits<float>::max());
   // Kernel dispatch logic:
-  // fp32 data can have an effective type of fp32 OR fp16 (when use_fp16_dist_comp flag = True for
-  // wmma usage) Based on EFFECTIVE dtype:
+  // fp32 data can have an effective type of fp32 OR fp16 (when internal_distance_dtype is
+  // CUDA_R_16F, fp32 host data is downcast into a device-side fp16 buffer at copy-in time so the
+  // WMMA kernel reads it in fp16). Based on EFFECTIVE dtype:
   //   fp32 data || L1 distance  -> SIMT: internally converted to fp32 for distance computation
-  //   on-the-fly dypte <= fp16 && non-L1 metrics  -> WMMA (tensor-core accelerated dot product):
-  //   internally converted to fp16 for distance computation on-the-fly
+  //     on-the-fly
+  //   dtype <= fp16 && non-L1 metrics  -> WMMA (tensor-core accelerated dot product):
+  //     internally converted to fp16 for distance computation on-the-fly
 
-  bool use_simt = (std::is_same_v<input_t, float> && !build_config_.use_fp16_dist_comp) ||
-                  build_config_.metric == cuvs::distance::DistanceType::L1;
+  bool use_simt =
+    (std::is_same_v<input_t, float> && build_config_.internal_distance_dtype != CUDA_R_16F) ||
+    build_config_.metric == cuvs::distance::DistanceType::L1;
 
   auto launch_kernel = [&](auto* typed_ptr) {
     if (use_simt) {
@@ -1479,7 +1482,8 @@ void GNND<Data_t, Index_t>::local_join(cudaStream_t stream, DistEpilogue_t dist_
   };
 
   if (d_data_half_.has_value()) {
-    // Host fp32 input compressed to fp16 via use_fp16_dist_comp.
+    // Host fp32 input was downcast to a device-side fp16 buffer via internal_distance_dtype =
+    // CUDA_R_16F.
     launch_kernel(static_cast<const half*>(d_data_ptr_));
   } else {
     // Data stored as input_t: device data used directly, or host data copied as-is.
@@ -1521,17 +1525,18 @@ void GNND<Data_t, Index_t>::build(Data_t* data,
                         build_config_.metric == cuvs::distance::DistanceType::L2SqrtExpanded ||
                         build_config_.metric == cuvs::distance::DistanceType::CosineExpanded;
 
-  bool compress_host_data =
-    !data_on_device && std::is_same_v<input_t, float> && build_config_.use_fp16_dist_comp;
+  bool downcast_host_data = !data_on_device && std::is_same_v<input_t, float> &&
+                            build_config_.internal_distance_dtype == CUDA_R_16F;
 
   if (data_on_device) {
     // When user-given data is on device, we use it directly. This can be any type (fp32, fp16,
     // int8, uint8)
     d_data_ptr_ = data;
-  } else if (compress_host_data) {
-    // When user-given data is fp32 host data, and use_fp16_dist_comp is true, we allocate fp16
-    // buffer to copy the data. This allows the wmma kernel to be used for distance computation
-    // instead of simt kernel.
+  } else if (downcast_host_data) {
+    // When user-given data is fp32 host data, and internal_distance_dtype is CUDA_R_16F, we
+    // allocate an fp16 device buffer and downcast at copy-in time. Storing the dataset on device
+    // in fp16 (instead of fp32) for this path halves both the device memory footprint and the
+    // per-iteration read bandwidth of the WMMA kernel.
     if (!d_data_half_.has_value()) {
       d_data_half_.emplace(raft::make_device_matrix<half, size_t, raft::row_major>(
         res, build_config_.max_dataset_size, build_config_.dataset_dim));
@@ -1545,7 +1550,7 @@ void GNND<Data_t, Index_t>::build(Data_t* data,
       int num_blocks    = raft::ceildiv(n_elems, static_cast<size_t>(TPB));
       size_t dst_offset = batch.offset() * build_config_.dataset_dim;
       if (needs_l2_norms) {
-        // we compute l2 norms on the fp32 data directly.
+        // Compute l2 norms on the fp32 batches before they're downcast to fp16.
         compute_l2_norms_kernel<<<batch.size(),
                                   raft::warp_size(),
                                   sizeof(float) *
@@ -1560,8 +1565,8 @@ void GNND<Data_t, Index_t>::build(Data_t* data,
     }
     d_data_ptr_ = d_data_half_.value().data_handle();
   } else {
-    // In other cases where user-given data is not device-accessible, we allocate a device buffer to
-    // copy the data. The input type is kept as-is (fp32, fp16, int8, uint8).
+    // Other cases: user-given data is not device-accessible, but we don't need a precision
+    // conversion. Allocate a device buffer in input_t and copy as-is.
     if (!d_data_direct_.has_value()) {
       d_data_direct_.emplace(raft::make_device_matrix<input_t, size_t, raft::row_major>(
         res, build_config_.max_dataset_size, build_config_.dataset_dim));
@@ -1573,7 +1578,7 @@ void GNND<Data_t, Index_t>::build(Data_t* data,
     d_data_ptr_ = d_data_direct_.value().data_handle();
   }
 
-  if (needs_l2_norms && !compress_host_data) {
+  if (needs_l2_norms && !downcast_host_data) {
     compute_l2_norms_kernel<<<
       nrow_,
       raft::warp_size(),
diff --git a/cpp/src/neighbors/detail/nn_descent_gnnd.hpp b/cpp/src/neighbors/detail/nn_descent_gnnd.hpp
@@ -64,7 +64,7 @@ struct BuildConfig {
   float termination_threshold{0.0001};
   size_t output_graph_degree{32};
   cuvs::distance::DistanceType metric{cuvs::distance::DistanceType::L2Expanded};
-  bool use_fp16_dist_comp{false};
+  cudaDataType_t internal_distance_dtype{CUDA_R_32F};
 };
 
 template <typename Index_t>
@@ -230,7 +230,8 @@ class GNND {
   using input_t = std::remove_const_t<Data_t>;
 
   // d_data_half_ is used for a special case when input data is fp32 on host and
-  // use_fp16_dist_comp flag is True
+  // internal_distance_dtype is CUDA_R_16F: we store the dataset on device as fp16 (instead of
+  // fp32) to halve the device memory footprint and WMMA kernel read bandwidth.
   std::optional<raft::device_matrix<half, size_t, raft::row_major>> d_data_half_;
   // d_data_direct_ is used when input data is on host, and we need to copy it to device
   std::optional<raft::device_matrix<input_t, size_t, raft::row_major>> d_data_direct_;
@@ -306,15 +307,15 @@ inline BuildConfig get_build_config(raft::resources const& res,
   size_t extended_intermediate_degree =
     roundUp32(static_cast<size_t>(intermediate_degree * (intermediate_degree <= 32 ? 1.0 : 1.3)));
 
-  BuildConfig build_config{.max_dataset_size      = num_rows,
-                           .dataset_dim           = num_cols,
-                           .node_degree           = extended_graph_degree,
-                           .internal_node_degree  = extended_intermediate_degree,
-                           .max_iterations        = params.max_iterations,
-                           .termination_threshold = params.termination_threshold,
-                           .output_graph_degree   = params.graph_degree,
-                           .metric                = params.metric,
-                           .use_fp16_dist_comp    = params.use_fp16_dist_comp};
+  BuildConfig build_config{.max_dataset_size        = num_rows,
+                           .dataset_dim             = num_cols,
+                           .node_degree             = extended_graph_degree,
+                           .internal_node_degree    = extended_intermediate_degree,
+                           .max_iterations          = params.max_iterations,
+                           .termination_threshold   = params.termination_threshold,
+                           .output_graph_degree     = params.graph_degree,
+                           .metric                  = params.metric,
+                           .internal_distance_dtype = params.internal_distance_dtype};
   return build_config;
 }
 
diff --git a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd
@@ -12,6 +12,12 @@ from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor
 from cuvs.distance_type cimport cuvsDistanceType
 
 
+cdef extern from "library_types.h":
+    ctypedef enum cudaDataType_t:
+        CUDA_R_32F "CUDA_R_32F"  # float
+        CUDA_R_16F "CUDA_R_16F"  # half
+
+
 cdef extern from "cuvs/neighbors/nn_descent.h" nogil:
     # Deprecated — to be removed in 26.08 and replaced by cuvsNNDescentIndexParams_v6.
     ctypedef enum cuvsNNDescentDistCompDtype:
@@ -39,7 +45,7 @@ cdef extern from "cuvs/neighbors/nn_descent.h" nogil:
         size_t max_iterations
         float termination_threshold
         bool return_distances
-        bool use_fp16_dist_comp
+        cudaDataType_t internal_distance_dtype
 
     ctypedef cuvsNNDescentIndexParams_v6* cuvsNNDescentIndexParams_v6_t
 
diff --git a/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx b/python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx
@@ -36,6 +36,21 @@ from libc.stdint cimport (
 from cuvs.common.exceptions import check_cuvs
 
 
+cdef _map_dtype_np_to_cuda(dtype):
+    # Translate a NumPy dtype specifier into the matching cudaDataType_t.
+    # Accepts a scalar type (np.float16), a dtype instance such as
+    # ``array.dtype`` (np.dtype("float16")), or a dtype string ("float16").
+    # Raises TypeError when the resolved type is not float32 / float16.
+    mapping = {np.float32: cudaDataType_t.CUDA_R_32F,
+               np.float16: cudaDataType_t.CUDA_R_16F}
+    # np.dtype instances compare equal to the scalar type but hash
+    # differently, so normalize to the scalar type before the dict lookup.
+    key = np.dtype(dtype).type
+    if key not in mapping:
+        raise TypeError("Type %s is not supported" % str(dtype))
+    return mapping[key]
+
+
 cdef class IndexParams:
     """
     Parameters to build NN-Descent Index
@@ -63,12 +71,13 @@ cdef class IndexParams:
         The delta at which nn-descent will terminate its iterations
     return_distances : bool
         Whether to return distances array
-    use_fp16_dist_comp : bool, default = False
-        When True and the input data is fp32, distance computation is performed
-        in fp16 for better performance and lower memory usage at the cost of
-        precision. This requires copying the fp32 input to an internal fp16
-        buffer on the device. Has no effect on non-fp32 input types (fp16,
-        int8, uint8) which always use fp16 distance computation.
+    internal_distance_dtype : numpy dtype, default = np.float32
+        Only applicable for fp32 input. Controls the precision used to compute
+        distances. Possible values: [np.float32, np.float16]. Set to np.float16
+        to compute distances in fp16 (faster, uses less device memory; not
+        recommended for dim <= 16 due to precision loss). Has no effect on
+        non-fp32 input types (fp16, int8, uint8) which always compute distances
+        in fp16.
     """
 
     cdef cuvsNNDescentIndexParams_v6* params
@@ -88,7 +97,7 @@ cdef class IndexParams:
                  max_iterations=None,
                  termination_threshold=None,
                  return_distances=None,
-                 use_fp16_dist_comp=None
+                 internal_distance_dtype=None
                  ):
         if metric is not None:
             self.params.metric = <cuvsDistanceType>DISTANCE_TYPES[metric]
@@ -102,8 +111,9 @@ cdef class IndexParams:
             self.params.termination_threshold = termination_threshold
         if return_distances is not None:
             self.params.return_distances = return_distances
-        if use_fp16_dist_comp is not None:
-            self.params.use_fp16_dist_comp = use_fp16_dist_comp
+        if internal_distance_dtype is not None:
+            self.params.internal_distance_dtype = \
+                _map_dtype_np_to_cuda(internal_distance_dtype)
 
     @property
     def metric(self):
diff --git a/python/cuvs/cuvs/tests/test_nn_descent.py b/python/cuvs/cuvs/tests/test_nn_descent.py
@@ -61,9 +61,11 @@ def test_nn_descent(
 
 
 @pytest.mark.parametrize("n_cols", [2, 17, 32])
-@pytest.mark.parametrize("use_fp16_dist_comp", [False, True])
+@pytest.mark.parametrize("internal_distance_dtype", [np.float32, np.float16])
 @pytest.mark.parametrize("dtype", [np.float32, np.float16])
-def test_nn_descent_use_fp16_dist_comp(n_cols, use_fp16_dist_comp, dtype):
+def test_nn_descent_internal_distance_dtype(
+    n_cols, internal_distance_dtype, dtype
+):
     metric = "sqeuclidean"
     graph_degree = 32
     n_rows = 100_000
@@ -77,7 +79,7 @@ def test_nn_descent_use_fp16_dist_comp(n_cols, use_fp16_dist_comp, dtype):
         metric=metric,
         graph_degree=graph_degree,
         return_distances=True,
-        use_fp16_dist_comp=use_fp16_dist_comp,
+        internal_distance_dtype=internal_distance_dtype,
     )
 
     index = nn_descent.build(params, X)
@@ -88,9 +90,13 @@ def test_nn_descent_use_fp16_dist_comp(n_cols, use_fp16_dist_comp, dtype):
     _, bf_indices = brute_force.search(index, gpu_X, k=graph_degree)
     bf_indices = bf_indices.copy_to_host()
 
-    if n_cols <= 16 and use_fp16_dist_comp and dtype == np.float32:
-        # for small dim, if data is fp32 but use_fp16_dist_comp is True, the
-        # recall will be low
+    if (
+        n_cols <= 16
+        and internal_distance_dtype == np.float16
+        and dtype == np.float32
+    ):
+        # for small dim, if data is fp32 but internal_distance_dtype is
+        # np.float16, the recall will be low
         assert calc_recall(nnd_indices, bf_indices) < 0.7
     else:
         assert calc_recall(nnd_indices, bf_indices) > 0.9