Skip to content

Commit 1c7589f

Browse files
committed
change to match ivfpq style with internal_distance_dtype
1 parent be9f30b commit 1c7589f

8 files changed

Lines changed: 100 additions & 68 deletions

File tree

c/include/cuvs/neighbors/nn_descent.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ extern "C" {
2525
* performance and memory usage.
2626
* - `NND_DIST_COMP_FP16`: Use fp16 distance computation.
2727
*
28-
* @deprecated To be removed in 26.08. Use cuvsNNDescentIndexParams_v6 with use_fp16_dist_comp
28+
* @deprecated To be removed in 26.08. Use cuvsNNDescentIndexParams_v6 with internal_distance_dtype
2929
* instead.
3030
*/
3131
typedef enum {
@@ -91,10 +91,11 @@ typedef struct cuvsNNDescentIndexParams* cuvsNNDescentIndexParams_t;
9191
* the graph for. More iterations produce a better quality graph at cost of performance
9292
* `termination_threshold`: The delta at which nn-descent will terminate its iterations
9393
* `return_distances`: Boolean to decide whether to return distances array
94-
* `use_fp16_dist_comp`: When true and the input data is fp32, distance computation is performed
95-
* in fp16 for better performance and lower memory usage at the cost of precision. This requires
96-
* copying the fp32 input to an internal fp16 buffer on the device. Has no effect on non-fp32
97-
* input types (fp16, int8, uint8) which always use fp16 distance computation.
94+
* `internal_distance_dtype`: Only applicable for fp32 input. Controls the precision used to
95+
* compute distances. Possible values: [CUDA_R_32F, CUDA_R_16F]. Defaults to CUDA_R_32F. Set to
96+
* CUDA_R_16F to compute distances in fp16 (faster, uses less device memory; not recommended for
97+
* dim <= 16 due to precision loss). Has no effect on non-fp32 input types (fp16, int8, uint8)
98+
* which always compute distances in fp16.
9899
*
99100
* @since 26.06
100101
*/
@@ -106,7 +107,7 @@ struct cuvsNNDescentIndexParams_v6 {
106107
size_t max_iterations;
107108
float termination_threshold;
108109
bool return_distances;
109-
bool use_fp16_dist_comp;
110+
cudaDataType_t internal_distance_dtype;
110111
};
111112

112113
typedef struct cuvsNNDescentIndexParams_v6* cuvsNNDescentIndexParams_v6_t;

c/src/neighbors/nn_descent.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ cuvs::neighbors::nn_descent::index_params convert_params(cuvsNNDescentIndexParam
6464
build_params.max_iterations = params.max_iterations;
6565
build_params.termination_threshold = params.termination_threshold;
6666
build_params.return_distances = params.return_distances;
67-
build_params.use_fp16_dist_comp = (params.dist_comp_dtype == NND_DIST_COMP_FP16);
67+
build_params.internal_distance_dtype =
68+
(params.dist_comp_dtype == NND_DIST_COMP_FP16) ? CUDA_R_16F : CUDA_R_32F;
6869
return build_params;
6970
}
7071

@@ -79,7 +80,7 @@ cuvs::neighbors::nn_descent::index_params convert_params_v6(
7980
build_params.max_iterations = params.max_iterations;
8081
build_params.termination_threshold = params.termination_threshold;
8182
build_params.return_distances = params.return_distances;
82-
build_params.use_fp16_dist_comp = params.use_fp16_dist_comp;
83+
build_params.internal_distance_dtype = params.internal_distance_dtype;
8384
return build_params;
8485
}
8586

@@ -219,8 +220,9 @@ extern "C" cuvsError_t cuvsNNDescentIndexParamsCreate(cuvsNNDescentIndexParams_t
219220
.max_iterations = cpp_params.max_iterations,
220221
.termination_threshold = cpp_params.termination_threshold,
221222
.return_distances = cpp_params.return_distances,
222-
.dist_comp_dtype = cpp_params.use_fp16_dist_comp ? NND_DIST_COMP_FP16
223-
: NND_DIST_COMP_AUTO};
223+
.dist_comp_dtype = cpp_params.internal_distance_dtype == CUDA_R_16F
224+
? NND_DIST_COMP_FP16
225+
: NND_DIST_COMP_AUTO};
224226
});
225227
}
226228

@@ -238,7 +240,7 @@ extern "C" cuvsError_t cuvsNNDescentIndexParamsCreate_v6(cuvsNNDescentIndexParam
238240
.max_iterations = cpp_params.max_iterations,
239241
.termination_threshold = cpp_params.termination_threshold,
240242
.return_distances = cpp_params.return_distances,
241-
.use_fp16_dist_comp = cpp_params.use_fp16_dist_comp};
243+
.internal_distance_dtype = cpp_params.internal_distance_dtype};
242244
});
243245
}
244246

cpp/include/cuvs/neighbors/nn_descent.hpp

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -40,18 +40,19 @@ namespace nn_descent {
4040
* the graph for. More iterations produce a better quality graph at cost of performance
4141
* - `termination_threshold`: The delta at which nn-descent will terminate its iterations
4242
* - `return_distances`: Boolean to decide whether to return distances array
43-
* - `use_fp16_dist_comp`: When true and the input data is fp32, distance computation is
44-
* performed in fp16 for better performance and lower memory usage at the cost of precision.
45-
* This requires copying the fp32 input to an internal fp16 buffer on the device. Has no effect
46-
* on non-fp32 input types (fp16, int8, uint8) which always use fp16 distance computation.
43+
* - `internal_distance_dtype`: Only applicable for fp32 input. Controls the precision used to
44+
* compute distances. Possible values: [CUDA_R_32F, CUDA_R_16F]. Defaults to CUDA_R_32F. Set to
45+
* CUDA_R_16F to compute distances in fp16 (faster, uses less device memory; not recommended for
46+
* dim <= 16 due to precision loss). Has no effect on non-fp32 input types (fp16, int8, uint8)
47+
* which always compute distances in fp16.
4748
*/
4849
struct index_params : cuvs::neighbors::index_params {
49-
size_t graph_degree = 64;
50-
size_t intermediate_graph_degree = 128;
51-
size_t max_iterations = 20;
52-
float termination_threshold = 0.0001;
53-
bool return_distances = true;
54-
bool use_fp16_dist_comp = false;
50+
size_t graph_degree = 64;
51+
size_t intermediate_graph_degree = 128;
52+
size_t max_iterations = 20;
53+
float termination_threshold = 0.0001;
54+
bool return_distances = true;
55+
cudaDataType_t internal_distance_dtype = CUDA_R_32F;
5556

5657
/** @brief Construct NN descent parameters for a specific kNN graph degree
5758
*

cpp/src/neighbors/detail/nn_descent.cuh

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -859,7 +859,7 @@ __launch_bounds__(BLOCK_SIZE)
859859
// MAX_RESIDENT_THREAD_PER_SM = BLOCK_SIZE * BLOCKS_PER_SM = 2048
860860
// For architectures 750 and 860 (890), the values for MAX_RESIDENT_THREAD_PER_SM
861861
// is 1024 and 1536 respectively, which means the bounds don't work anymore
862-
// Used for fp32 data compressed to fp16, and all types using non-L1 distance metric.
862+
// Used for fp32 data downcast to fp16, and all types using non-L1 distance metric.
863863
template <typename Data_t,
864864
typename Index_t,
865865
typename ID_t = InternalID_t<Index_t>,
@@ -1373,11 +1373,11 @@ GNND<Data_t, Index_t>::GNND(raft::resources const& res, const BuildConfig& build
13731373
static_assert(NUM_SAMPLES <= 32);
13741374

13751375
using input_t = typename std::remove_const<Data_t>::type;
1376-
if (build_config.use_fp16_dist_comp && build_config.dataset_dim <= 16 &&
1376+
if (build_config.internal_distance_dtype == CUDA_R_16F && build_config.dataset_dim <= 16 &&
13771377
std::is_same_v<input_t, float>) {
13781378
RAFT_LOG_WARN(
13791379
"Using fp16 for distance computation for data in fp32 with small dimensions (%zu) <= 16 may "
1380-
"result in low quality results. Consider setting use_fp16_dist_comp = false.",
1380+
"result in low quality results. Consider setting internal_distance_dtype = CUDA_R_32F.",
13811381
build_config.dataset_dim);
13821382
}
13831383

@@ -1431,14 +1431,17 @@ void GNND<Data_t, Index_t>::local_join(cudaStream_t stream, DistEpilogue_t dist_
14311431
{
14321432
raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits<float>::max());
14331433
// Kernel dispatch logic:
1434-
// fp32 data can have an effective type of fp32 OR fp16 (when use_fp16_dist_comp flag = True for
1435-
// wmma usage) Based on EFFECTIVE dtype:
1434+
// fp32 data can have an effective type of fp32 OR fp16 (when internal_distance_dtype is
1435+
// CUDA_R_16F, fp32 host data is downcast into a device-side fp16 buffer at copy-in time so the
1436+
// WMMA kernel reads it in fp16). Based on EFFECTIVE dtype:
14361437
// fp32 data || L1 distance -> SIMT: internally converted to fp32 for distance computation
1437-
// on-the-fly dypte <= fp16 && non-L1 metrics -> WMMA (tensor-core accelerated dot product):
1438-
// internally converted to fp16 for distance computation on-the-fly
1438+
// on-the-fly
1439+
// dtype <= fp16 && non-L1 metrics -> WMMA (tensor-core accelerated dot product):
1440+
// internally converted to fp16 for distance computation on-the-fly
14391441

1440-
bool use_simt = (std::is_same_v<input_t, float> && !build_config_.use_fp16_dist_comp) ||
1441-
build_config_.metric == cuvs::distance::DistanceType::L1;
1442+
bool use_simt =
1443+
(std::is_same_v<input_t, float> && build_config_.internal_distance_dtype != CUDA_R_16F) ||
1444+
build_config_.metric == cuvs::distance::DistanceType::L1;
14421445

14431446
auto launch_kernel = [&](auto* typed_ptr) {
14441447
if (use_simt) {
@@ -1479,7 +1482,8 @@ void GNND<Data_t, Index_t>::local_join(cudaStream_t stream, DistEpilogue_t dist_
14791482
};
14801483

14811484
if (d_data_half_.has_value()) {
1482-
// Host fp32 input compressed to fp16 via use_fp16_dist_comp.
1485+
// Host fp32 input was downcast to a device-side fp16 buffer via internal_distance_dtype =
1486+
// CUDA_R_16F.
14831487
launch_kernel(static_cast<const half*>(d_data_ptr_));
14841488
} else {
14851489
// Data stored as input_t: device data used directly, or host data copied as-is.
@@ -1521,17 +1525,18 @@ void GNND<Data_t, Index_t>::build(Data_t* data,
15211525
build_config_.metric == cuvs::distance::DistanceType::L2SqrtExpanded ||
15221526
build_config_.metric == cuvs::distance::DistanceType::CosineExpanded;
15231527

1524-
bool compress_host_data =
1525-
!data_on_device && std::is_same_v<input_t, float> && build_config_.use_fp16_dist_comp;
1528+
bool downcast_host_data = !data_on_device && std::is_same_v<input_t, float> &&
1529+
build_config_.internal_distance_dtype == CUDA_R_16F;
15261530

15271531
if (data_on_device) {
15281532
// When user-given data is on device, we use it directly. This can be any type (fp32, fp16,
15291533
// int8, uint8)
15301534
d_data_ptr_ = data;
1531-
} else if (compress_host_data) {
1532-
// When user-given data is fp32 host data, and use_fp16_dist_comp is true, we allocate fp16
1533-
// buffer to copy the data. This allows the wmma kernel to be used for distance computation
1534-
// instead of simt kernel.
1535+
} else if (downcast_host_data) {
1536+
// When user-given data is fp32 host data, and internal_distance_dtype is CUDA_R_16F, we
1537+
// allocate an fp16 device buffer and downcast at copy-in time. Storing the dataset on device
1538+
// in fp16 (instead of fp32) for this path halves both the device memory footprint and the
1539+
// per-iteration read bandwidth of the WMMA kernel.
15351540
if (!d_data_half_.has_value()) {
15361541
d_data_half_.emplace(raft::make_device_matrix<half, size_t, raft::row_major>(
15371542
res, build_config_.max_dataset_size, build_config_.dataset_dim));
@@ -1545,7 +1550,7 @@ void GNND<Data_t, Index_t>::build(Data_t* data,
15451550
int num_blocks = raft::ceildiv(n_elems, static_cast<size_t>(TPB));
15461551
size_t dst_offset = batch.offset() * build_config_.dataset_dim;
15471552
if (needs_l2_norms) {
1548-
// we compute l2 norms on the fp32 data directly.
1553+
// Compute l2 norms on the fp32 batches before they're downcast to fp16.
15491554
compute_l2_norms_kernel<<<batch.size(),
15501555
raft::warp_size(),
15511556
sizeof(float) *
@@ -1560,8 +1565,8 @@ void GNND<Data_t, Index_t>::build(Data_t* data,
15601565
}
15611566
d_data_ptr_ = d_data_half_.value().data_handle();
15621567
} else {
1563-
// In other cases where user-given data is not device-accessible, we allocate a device buffer to
1564-
// copy the data. The input type is kept as-is (fp32, fp16, int8, uint8).
1568+
// Other cases: user-given data is not device-accessible, but we don't need a precision
1569+
// conversion. Allocate a device buffer in input_t and copy as-is.
15651570
if (!d_data_direct_.has_value()) {
15661571
d_data_direct_.emplace(raft::make_device_matrix<input_t, size_t, raft::row_major>(
15671572
res, build_config_.max_dataset_size, build_config_.dataset_dim));
@@ -1573,7 +1578,7 @@ void GNND<Data_t, Index_t>::build(Data_t* data,
15731578
d_data_ptr_ = d_data_direct_.value().data_handle();
15741579
}
15751580
1576-
if (needs_l2_norms && !compress_host_data) {
1581+
if (needs_l2_norms && !downcast_host_data) {
15771582
compute_l2_norms_kernel<<<
15781583
nrow_,
15791584
raft::warp_size(),

cpp/src/neighbors/detail/nn_descent_gnnd.hpp

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ struct BuildConfig {
6464
float termination_threshold{0.0001};
6565
size_t output_graph_degree{32};
6666
cuvs::distance::DistanceType metric{cuvs::distance::DistanceType::L2Expanded};
67-
bool use_fp16_dist_comp{false};
67+
cudaDataType_t internal_distance_dtype{CUDA_R_32F};
6868
};
6969

7070
template <typename Index_t>
@@ -230,7 +230,8 @@ class GNND {
230230
using input_t = std::remove_const_t<Data_t>;
231231

232232
// d_data_half_ is used for a special case when input data is fp32 on host and
233-
// use_fp16_dist_comp flag is True
233+
// internal_distance_dtype is CUDA_R_16F: we store the dataset on device as fp16 (instead of
234+
// fp32) to halve the device memory footprint and WMMA kernel read bandwidth.
234235
std::optional<raft::device_matrix<half, size_t, raft::row_major>> d_data_half_;
235236
// d_data_direct_ is used when input data is on host, and we need to copy it to device
236237
std::optional<raft::device_matrix<input_t, size_t, raft::row_major>> d_data_direct_;
@@ -306,15 +307,15 @@ inline BuildConfig get_build_config(raft::resources const& res,
306307
size_t extended_intermediate_degree =
307308
roundUp32(static_cast<size_t>(intermediate_degree * (intermediate_degree <= 32 ? 1.0 : 1.3)));
308309

309-
BuildConfig build_config{.max_dataset_size = num_rows,
310-
.dataset_dim = num_cols,
311-
.node_degree = extended_graph_degree,
312-
.internal_node_degree = extended_intermediate_degree,
313-
.max_iterations = params.max_iterations,
314-
.termination_threshold = params.termination_threshold,
315-
.output_graph_degree = params.graph_degree,
316-
.metric = params.metric,
317-
.use_fp16_dist_comp = params.use_fp16_dist_comp};
310+
BuildConfig build_config{.max_dataset_size = num_rows,
311+
.dataset_dim = num_cols,
312+
.node_degree = extended_graph_degree,
313+
.internal_node_degree = extended_intermediate_degree,
314+
.max_iterations = params.max_iterations,
315+
.termination_threshold = params.termination_threshold,
316+
.output_graph_degree = params.graph_degree,
317+
.metric = params.metric,
318+
.internal_distance_dtype = params.internal_distance_dtype};
318319
return build_config;
319320
}
320321

python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pxd

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@ from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor
1212
from cuvs.distance_type cimport cuvsDistanceType
1313

1414

15+
cdef extern from "library_types.h":
16+
ctypedef enum cudaDataType_t:
17+
CUDA_R_32F "CUDA_R_32F" # float
18+
CUDA_R_16F "CUDA_R_16F" # half
19+
20+
1521
cdef extern from "cuvs/neighbors/nn_descent.h" nogil:
1622
# Deprecated — to be removed in 26.08 and replaced by cuvsNNDescentIndexParams_v6.
1723
ctypedef enum cuvsNNDescentDistCompDtype:
@@ -39,7 +45,7 @@ cdef extern from "cuvs/neighbors/nn_descent.h" nogil:
3945
size_t max_iterations
4046
float termination_threshold
4147
bool return_distances
42-
bool use_fp16_dist_comp
48+
cudaDataType_t internal_distance_dtype
4349

4450
ctypedef cuvsNNDescentIndexParams_v6* cuvsNNDescentIndexParams_v6_t
4551

python/cuvs/cuvs/neighbors/nn_descent/nn_descent.pyx

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,14 @@ from libc.stdint cimport (
3636
from cuvs.common.exceptions import check_cuvs
3737

3838

39+
cdef _map_dtype_np_to_cuda(dtype):
40+
mapping = {np.float32: cudaDataType_t.CUDA_R_32F,
41+
np.float16: cudaDataType_t.CUDA_R_16F}
42+
if dtype not in mapping:
43+
raise TypeError("Type %s is not supported" % str(dtype))
44+
return mapping[dtype]
45+
46+
3947
cdef class IndexParams:
4048
"""
4149
Parameters to build NN-Descent Index
@@ -63,12 +71,13 @@ cdef class IndexParams:
6371
The delta at which nn-descent will terminate its iterations
6472
return_distances : bool
6573
Whether to return distances array
66-
use_fp16_dist_comp : bool, default = False
67-
When True and the input data is fp32, distance computation is performed
68-
in fp16 for better performance and lower memory usage at the cost of
69-
precision. This requires copying the fp32 input to an internal fp16
70-
buffer on the device. Has no effect on non-fp32 input types (fp16,
71-
int8, uint8) which always use fp16 distance computation.
74+
internal_distance_dtype : numpy dtype, default = np.float32
75+
Only applicable for fp32 input. Controls the precision used to compute
76+
distances. Possible values: [np.float32, np.float16]. Set to np.float16
77+
to compute distances in fp16 (faster, uses less device memory; not
78+
recommended for dim <= 16 due to precision loss). Has no effect on
79+
non-fp32 input types (fp16, int8, uint8) which always compute distances
80+
in fp16.
7281
"""
7382

7483
cdef cuvsNNDescentIndexParams_v6* params
@@ -88,7 +97,7 @@ cdef class IndexParams:
8897
max_iterations=None,
8998
termination_threshold=None,
9099
return_distances=None,
91-
use_fp16_dist_comp=None
100+
internal_distance_dtype=None
92101
):
93102
if metric is not None:
94103
self.params.metric = <cuvsDistanceType>DISTANCE_TYPES[metric]
@@ -102,8 +111,9 @@ cdef class IndexParams:
102111
self.params.termination_threshold = termination_threshold
103112
if return_distances is not None:
104113
self.params.return_distances = return_distances
105-
if use_fp16_dist_comp is not None:
106-
self.params.use_fp16_dist_comp = use_fp16_dist_comp
114+
if internal_distance_dtype is not None:
115+
self.params.internal_distance_dtype = \
116+
_map_dtype_np_to_cuda(internal_distance_dtype)
107117

108118
@property
109119
def metric(self):

python/cuvs/cuvs/tests/test_nn_descent.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,11 @@ def test_nn_descent(
6161

6262

6363
@pytest.mark.parametrize("n_cols", [2, 17, 32])
64-
@pytest.mark.parametrize("use_fp16_dist_comp", [False, True])
64+
@pytest.mark.parametrize("internal_distance_dtype", [np.float32, np.float16])
6565
@pytest.mark.parametrize("dtype", [np.float32, np.float16])
66-
def test_nn_descent_use_fp16_dist_comp(n_cols, use_fp16_dist_comp, dtype):
66+
def test_nn_descent_internal_distance_dtype(
67+
n_cols, internal_distance_dtype, dtype
68+
):
6769
metric = "sqeuclidean"
6870
graph_degree = 32
6971
n_rows = 100_000
@@ -77,7 +79,7 @@ def test_nn_descent_use_fp16_dist_comp(n_cols, use_fp16_dist_comp, dtype):
7779
metric=metric,
7880
graph_degree=graph_degree,
7981
return_distances=True,
80-
use_fp16_dist_comp=use_fp16_dist_comp,
82+
internal_distance_dtype=internal_distance_dtype,
8183
)
8284

8385
index = nn_descent.build(params, X)
@@ -88,9 +90,13 @@ def test_nn_descent_use_fp16_dist_comp(n_cols, use_fp16_dist_comp, dtype):
8890
_, bf_indices = brute_force.search(index, gpu_X, k=graph_degree)
8991
bf_indices = bf_indices.copy_to_host()
9092

91-
if n_cols <= 16 and use_fp16_dist_comp and dtype == np.float32:
92-
# for small dim, if data is fp32 but use_fp16_dist_comp is True, the
93-
# recall will be low
93+
if (
94+
n_cols <= 16
95+
and internal_distance_dtype == np.float16
96+
and dtype == np.float32
97+
):
98+
# for small dim, if data is fp32 but internal_distance_dtype is
99+
# np.float16, the recall will be low
94100
assert calc_recall(nnd_indices, bf_indices) < 0.7
95101
else:
96102
assert calc_recall(nnd_indices, bf_indices) > 0.9

0 commit comments

Comments
 (0)