Skip to content
Closed
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
ee94971
Fixed vector normalization for Inner Product -- still failing 15 test…
HowardHuang1 Mar 7, 2026
78e47f1
Merge branch 'main' into HH-Vector-Normalization
aamijar Mar 9, 2026
e822460
add logging to compare recall and search speed for raw v.s. normalize…
HowardHuang1 Mar 10, 2026
da32073
revert change comparing normalize and raw in single run -- that was p…
HowardHuang1 Mar 11, 2026
5375bb5
previously normalization was applied to entire IVF-PQ pipeline --> ch…
HowardHuang1 Mar 16, 2026
c320640
revert to raw vectors. No need to normalize here because normalized v…
HowardHuang1 Mar 16, 2026
0486bf5
clean up code
HowardHuang1 Mar 16, 2026
107e2b3
clean up code
HowardHuang1 Mar 16, 2026
5442b89
upload code that resolves linker issue + live csv updates
HowardHuang1 Mar 18, 2026
dc9b6df
remove live_csv
HowardHuang1 Mar 18, 2026
cf3666c
Merge branch 'main' into HH-Vector-Normalization
HowardHuang1 Mar 18, 2026
bdda881
hardcode file path instead of searching multiple directories + fix in…
HowardHuang1 Mar 19, 2026
c237db3
clean up unnecessary checks in data_export.py
HowardHuang1 Mar 19, 2026
3c20377
bring back comma parsing instead of underscore parsing
HowardHuang1 Mar 20, 2026
728c964
bring back parts of plot/__main__.py for clarity
HowardHuang1 Mar 20, 2026
6c9bc36
get rid of incremental JSON->CSV write for clarity
HowardHuang1 Mar 20, 2026
eb6bb88
bring back original plot/__main__.py for clarity
HowardHuang1 Mar 20, 2026
4546a85
fix cuvs-bench generate groundtruth which was sorted incorrectly for …
HowardHuang1 Mar 23, 2026
ea8a6b4
revert normalization outside kernel that requires copy of dataset res…
HowardHuang1 Mar 30, 2026
5800e53
didn't modify kernels themselves but routed IP predict through existi…
HowardHuang1 Mar 31, 2026
945d3ee
add inner_product_cosine_assignment flag which is set when ivf_pq_bui…
HowardHuang1 Apr 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 45 additions & 14 deletions cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1324,30 +1324,61 @@ auto build(raft::resources const& handle,
auto cluster_centers = cluster_centers_buf.data();

// Train balanced hierarchical kmeans clustering
auto trainset_const_view = raft::make_const_mdspan(trainset.view());
auto centers_view = raft::make_device_matrix_view<float, internal_extents_t>(
auto centers_view = raft::make_device_matrix_view<float, internal_extents_t>(
cluster_centers, impl->n_lists(), impl->dim());
cuvs::cluster::kmeans::balanced_params kmeans_params;
kmeans_params.n_iters = params.kmeans_n_iters;
kmeans_params.metric = static_cast<cuvs::distance::DistanceType>((int)impl->metric());

if (impl->metric() == distance::DistanceType::CosineExpanded) {
raft::linalg::row_normalize<raft::linalg::L2Norm>(
handle, trainset_const_view, trainset.view());
}
cuvs::cluster::kmeans::fit(handle, kmeans_params, trainset_const_view, centers_view);

// Trainset labels are needed for training PQ codebooks
rmm::device_uvector<uint32_t> labels(n_rows_train, stream, big_memory_resource);
auto labels_view =
raft::make_device_vector_view<uint32_t, internal_extents_t>(labels.data(), n_rows_train);
auto centers_const_view = raft::make_device_matrix_view<const float, internal_extents_t>(
cluster_centers, impl->n_lists(), impl->dim());
if (impl->metric() == distance::DistanceType::CosineExpanded) {

if (impl->metric() == distance::DistanceType::InnerProduct) {
// Normalization only for k-means: use a copy so trainset stays in original space; metric
// remains inner product for the rest of the pipeline.
auto trainset_kmeans = raft::make_device_mdarray<float>(

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait- we seem to be doing a copy of the trainset? Why? Just do the normalization in the distance kernels.

@cjnolet cjnolet Mar 26, 2026

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not the way we do things: we don't make unnecessary copies just to avoid changing kernels. This becomes frustrating for users, because they are almost always memory-limited and every GB counts.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed.

handle, device_memory, raft::make_extents<int64_t>(n_rows_train, dim));
raft::copy(handle, trainset_kmeans.view(), trainset.view());
auto trainset_kmeans_view = raft::make_device_matrix_view<float, internal_extents_t>(
trainset_kmeans.data_handle(), n_rows_train, dim);
raft::linalg::row_normalize<raft::linalg::L2Norm>(
handle, raft::make_const_mdspan(trainset_kmeans_view), trainset_kmeans_view);
auto trainset_kmeans_const_view = raft::make_const_mdspan(trainset_kmeans.view());
cuvs::cluster::kmeans::fit(handle, kmeans_params, trainset_kmeans_const_view, centers_view);
raft::linalg::row_normalize<raft::linalg::L2Norm>(handle, centers_const_view, centers_view);
cuvs::cluster::kmeans::predict(
handle, kmeans_params, trainset_kmeans_const_view, centers_const_view, labels_view);
// Recompute centers in original space (mean of unnormalized trainset per cluster), overwrites centers_view
rmm::device_uvector<uint32_t> cluster_sizes(impl->n_lists(), stream, device_memory);
cuvs::cluster::kmeans::detail::calc_centers_and_sizes<float, float, internal_extents_t, uint32_t, uint32_t>(

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we use the calc_centers_and_sizes in the public namespace if possible?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @jinsolp !

  1. Yes exactly. If we were to keep the normalized centroids that would cause the computation to no longer be Inner Product and instead it will degenerate to Cosine which is not what we want. To prevent this, we use calc_centers_and_sizes to re-compute the proper centroids on the raw vectors. In essence, we only want normalization to happen in the kmeans cluster assignment step rather than the whole pipeline (normalization of the whole pipeline changes metric to Cosine).
  2. The changes in cuvs bench are to resolve a linker issue. Will remove these when merging. They can be ignored for now.
  3. The screenshot is a bit outdated; I will replace it soon with a new one. Recall should be the same, and search should be faster.

handle,
cluster_centers,
cluster_sizes.data(),
static_cast<internal_extents_t>(impl->n_lists()),
static_cast<internal_extents_t>(dim),
trainset.data_handle(),
static_cast<internal_extents_t>(n_rows_train),
labels.data(),
true,
raft::identity_op{},
device_memory);
} else if (impl->metric() == distance::DistanceType::CosineExpanded) {
auto trainset_const_view = raft::make_const_mdspan(trainset.view());
raft::linalg::row_normalize<raft::linalg::L2Norm>(
handle, trainset_const_view, trainset.view());
cuvs::cluster::kmeans::fit(handle, kmeans_params, trainset_const_view, centers_view);
raft::linalg::row_normalize<raft::linalg::L2Norm>(handle, centers_const_view, centers_view);
cuvs::cluster::kmeans::predict(
handle, kmeans_params, trainset_const_view, centers_const_view, labels_view);
} else {
auto trainset_const_view = raft::make_const_mdspan(trainset.view());
cuvs::cluster::kmeans::fit(handle, kmeans_params, trainset_const_view, centers_view);
cuvs::cluster::kmeans::predict(
handle, kmeans_params, trainset_const_view, centers_const_view, labels_view);
}
auto labels_view =
raft::make_device_vector_view<uint32_t, internal_extents_t>(labels.data(), n_rows_train);
cuvs::cluster::kmeans::predict(
handle, kmeans_params, trainset_const_view, centers_const_view, labels_view);

// Make rotation matrix
helpers::make_rotation_matrix(handle, impl->rotation_matrix(), params.force_random_rotation);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,13 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"):
else:
distances = xp.concatenate([distances, D], axis=1)
indices = xp.concatenate([indices, Ind], axis=1)
idx = xp.argsort(distances, axis=1)[:, :k]
# Euclidean: smaller distance is better → sort ascending.
# Inner product: D holds similarities, larger is better → sort
# descending (equivalent to ascending on -D if library used -sim).
if metric == "inner_product":
idx = xp.argsort(-distances, axis=1)[:, :k]
else:
idx = xp.argsort(distances, axis=1)[:, :k]
distances = xp.take_along_axis(distances, idx, axis=1)
indices = xp.take_along_axis(indices, idx, axis=1)

Expand Down
2 changes: 1 addition & 1 deletion python/cuvs_bench/cuvs_bench/run/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ def get_build_path(executable: str) -> Optional[str]:
build_path = os.getenv("CUVS_HOME")
if build_path:
build_path = os.path.join(
build_path, "cpp", "build", "release", executable
build_path, "cpp", "build", "bench", "ann", executable
)
if os.path.exists(build_path):
print(f"-- Using cuVS bench from repository in {build_path}.")
Expand Down
22 changes: 20 additions & 2 deletions python/cuvs_bench/cuvs_bench/run/runners.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,24 @@
from typing import Dict, List, Optional, Tuple


def _subprocess_env(ann_executable_path: str) -> Dict[str, str]:
"""Build env for C++ benchmark subprocess. When CUVS_HOME is set, force the repo's libcuvs.so to be used (LD_PRELOAD + LD_LIBRARY_PATH) so the correct local build runs."""
env = os.environ.copy()
repo = os.getenv("CUVS_HOME")
if repo:
build_dir = os.path.join(repo, "cpp", "build")
if os.path.isdir(build_dir):
lib = os.path.join(build_dir, "libcuvs.so")
if os.path.isfile(lib):
env["LD_PRELOAD"] = lib + (os.pathsep + env["LD_PRELOAD"] if env.get("LD_PRELOAD") else "")
env["LD_LIBRARY_PATH"] = build_dir + os.pathsep + env.get("LD_LIBRARY_PATH", "")
# So IVF-PQ normalization logging goes to a known path (C++ uses this when set)
log_path = os.path.join(build_dir, "cuvs_ivf_pq_normalization.log")
env["CUVS_IVF_PQ_NORMALIZATION_LOG"] = log_path
print(f"[cuvs_bench] IVF-PQ normalization log (if any) -> {log_path}", flush=True)
return env


def cuvs_bench_cpp(
conf_file: Dict,
conf_filename: str,
Expand Down Expand Up @@ -123,7 +141,7 @@ def cuvs_bench_cpp(
)
else:
try:
subprocess.run(cmd, check=True)
subprocess.run(cmd, check=True, env=_subprocess_env(ann_executable_path))
merge_build_files(
build_folder, build_file, temp_build_file
)
Expand Down Expand Up @@ -163,7 +181,7 @@ def cuvs_bench_cpp(
)
else:
try:
subprocess.run(cmd, check=True)
subprocess.run(cmd, check=True, env=_subprocess_env(ann_executable_path))
except Exception as e:
print(f"Error occurred running benchmark: {e}")
finally:
Expand Down
49 changes: 49 additions & 0 deletions python/cuvs_bench/run_benchmark_local.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python
#
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
#
# Launcher to run cuvs_bench from THIS repository's Python code (ignoring
# any cuvs_bench installed in the environment). Use this when developing
# cuvs_bench in a fork or branch and you want to ensure the local code
# is used while still using conda-installed C++ binaries.
#
# Usage (from the repo root that contains python/cuvs_bench, e.g. cuvs_vector_norm):
# python python/cuvs_bench/run_benchmark_local.py --dataset ... --algorithms ...
# To confirm which package is used: CUVS_BENCH_DEBUG_LOAD=1 python python/cuvs_bench/run_benchmark_local.py ...
# To use local libcuvs.so: set CUVS_HOME to repo root (runner sets LD_PRELOAD and LD_LIBRARY_PATH for the benchmark subprocess).
#
# One-liner (must fix sys.argv so Click sees your flags; run from repo root):
# python -c "
# import sys, runpy, os
# sys.path.insert(0, os.path.join(os.getcwd(), 'python'))
# if '--' in sys.argv:
# sys.argv = ['cuvs_bench.run'] + sys.argv[sys.argv.index('--')+1:]
# runpy.run_module('cuvs_bench.run', run_name='__main__')
# " -- --dataset deep-image-96-inner -k 10 --batch-size 10 --algorithms cuvs_ivf_pq ...

from pathlib import Path
import os
import runpy
import sys

# This launcher lives in <repo>/python/cuvs_bench/, i.e. in the directory that
# holds the importable ``cuvs_bench`` package (<repo>/python/cuvs_bench/cuvs_bench).
_SCRIPT_DIR = Path(__file__).resolve().parent
_REPO_PYTHON = _SCRIPT_DIR.parent  # python/ inside the repo
_REPO_ROOT = _REPO_PYTHON.parent  # repo root

# Drop PYTHONPATH so child processes do not inherit it. sys.path for this
# process was already built at interpreter startup, so local-first ordering is
# enforced below rather than by this removal.
os.environ.pop("PYTHONPATH", None)

# Make the directory that contains the local ``cuvs_bench`` package the first
# import location. <repo>/python is not a suitable entry: it only contains the
# project directory, not the package itself, so "import cuvs_bench.run" would
# not resolve to the local code from there.
_PACKAGE_ROOT_STR = str(_SCRIPT_DIR)
if sys.path[:1] != [_PACKAGE_ROOT_STR]:
    if _PACKAGE_ROOT_STR in sys.path:
        sys.path.remove(_PACKAGE_ROOT_STR)
    sys.path.insert(0, _PACKAGE_ROOT_STR)
if os.environ.get("CUVS_BENCH_DEBUG_LOAD"):
    print(f"[cuvs_bench launcher] using python path: {_PACKAGE_ROOT_STR}", file=sys.stderr)

# Run the run module as __main__
runpy.run_module("cuvs_bench.run", run_name="__main__")
38 changes: 38 additions & 0 deletions python/cuvs_bench/run_plot_local.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env python
#
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
#
# Launcher to run cuvs_bench.plot from THIS repository's Python code (ignoring
# any cuvs_bench installed in the environment). Use this when developing
# cuvs_bench in a fork or branch so the local plot code is used.
#
# Usage (from the repo root that contains python/cuvs_bench, e.g. cuvs_vector_norm):
# python python/cuvs_bench/run_plot_local.py --search --dataset deep-image-96-inner --dataset-path ./datasets -k 10 -bs 10000 --output-filepath .
# To confirm which package is used: CUVS_BENCH_DEBUG_LOAD=1 python python/cuvs_bench/run_plot_local.py ...

from pathlib import Path
import os
import runpy
import sys

# This launcher lives in <repo>/python/cuvs_bench/, i.e. in the directory that
# holds the importable ``cuvs_bench`` package (<repo>/python/cuvs_bench/cuvs_bench).
_SCRIPT_DIR = Path(__file__).resolve().parent
_REPO_PYTHON = _SCRIPT_DIR.parent  # python/ inside the repo
_REPO_ROOT = _REPO_PYTHON.parent  # repo root

# Drop PYTHONPATH so child processes do not inherit it. sys.path for this
# process was already built at interpreter startup, so local-first ordering is
# enforced below rather than by this removal.
os.environ.pop("PYTHONPATH", None)

# Make the directory that contains the local ``cuvs_bench`` package the first
# import location. <repo>/python is not a suitable entry: it only contains the
# project directory, not the package itself, so "import cuvs_bench.plot" would
# not resolve to the local code from there.
_PACKAGE_ROOT_STR = str(_SCRIPT_DIR)
if sys.path[:1] != [_PACKAGE_ROOT_STR]:
    if _PACKAGE_ROOT_STR in sys.path:
        sys.path.remove(_PACKAGE_ROOT_STR)
    sys.path.insert(0, _PACKAGE_ROOT_STR)
if os.environ.get("CUVS_BENCH_DEBUG_LOAD"):
    print(f"[cuvs_bench launcher] using python path: {_PACKAGE_ROOT_STR}", file=sys.stderr)

# Run the plot module as __main__
runpy.run_module("cuvs_bench.plot", run_name="__main__")
154 changes: 154 additions & 0 deletions python/cuvs_bench/verify_ip_groundtruth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env python3
#
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
#
# Standalone check: does groundtruth.neighbors.ibin match exact inner-product
# top-k on the full base? (No dependency on installed cuvs_bench version.)
#
# Usage:
# python verify_ip_groundtruth.py base.fbin queries.fbin groundtruth.ibin [k] [query_row]
# Prints overlap, ids, and a side-by-side table of (idx, dot) for brute vs file (first 10 of k).
#
import struct
import sys

import numpy as np


def _read_shape(path):
    """Read the two-field header at the start of a ``.fbin``/``.ibin`` file.

    The header is 8 bytes: two little-endian unsigned 32-bit integers, which
    the callers in this file use as ``(rows, cols)``.

    Raises
    ------
    ValueError
        If the file holds fewer than 8 bytes, so no complete header exists.
    """
    with open(path, "rb") as f:
        header = f.read(8)
    if len(header) != 8:
        raise ValueError(
            f"{path}: expected an 8-byte header, got {len(header)} byte(s)"
        )
    return struct.unpack("<II", header)


def mmap_fbin(path):
    """Memory-map a ``.fbin`` file read-only as a ``(rows, cols)`` float32 array.

    The payload is taken to start right after the 8-byte header.
    """
    nr, nc = _read_shape(path)
    return np.memmap(path, dtype=np.float32, mode="r", offset=8, shape=(nr, nc))


def mmap_ibin(path):
    """Memory-map a ``.ibin`` file read-only as a ``(rows, cols)`` int32 array.

    The payload is taken to start right after the 8-byte header.
    """
    nr, nc = _read_shape(path)
    return np.memmap(path, dtype=np.int32, mode="r", offset=8, shape=(nr, nc))


def ip_scores_for_indices(dataset_mm, query_vec, indices):
    """Return the inner product of the query with each requested dataset row.

    The query is flattened to a 1-D float32 vector and each selected row is
    read as float32 before the dot product. The result is a float64 array
    with one entry per element of ``indices``; entries whose index falls
    outside ``[0, dataset_mm.shape[0])`` are NaN. Used for diagnostics.
    """
    query = np.asarray(query_vec, dtype=np.float32).ravel()
    row_count = dataset_mm.shape[0]
    # Start from NaN everywhere; only in-range indices get a real score.
    scores = np.full(len(indices), np.nan, dtype=np.float64)
    for pos, raw_id in enumerate(indices):
        row_id = int(raw_id)
        if 0 <= row_id < row_count:
            vec = np.asarray(dataset_mm[row_id], dtype=np.float32)
            scores[pos] = float(np.dot(vec, query))
    return scores


def brute_ip_topk_chunked(query_vec, dataset_mm, k, chunk_rows=65536):
    """Exact inner-product top-k of one query against the dataset, in chunks.

    The dataset is scanned ``chunk_rows`` rows at a time so only one chunk is
    materialized at once. After each chunk the running best candidates are
    merged with the chunk's scores and trimmed back to ``k``.

    Parameters
    ----------
    query_vec : array-like
        Query vector; flattened and converted to float32.
    dataset_mm : array-like with ``.shape`` and row slicing
        Dataset rows (for example a ``np.memmap``).
    k : int
        Number of neighbors to return; must be >= 0.
    chunk_rows : int
        Rows processed per step; must be >= 1.

    Returns
    -------
    np.ndarray
        int64 array of length ``k`` holding row ids ordered by descending
        inner product. Equal scores keep the lower row id first. If the
        dataset has fewer than ``k`` rows, the remaining slots are ``-1``
        rather than a valid-looking row id.

    Raises
    ------
    ValueError
        If ``k`` is negative or ``chunk_rows`` is not positive.
    """
    if k < 0:
        raise ValueError("k must be >= 0")
    if chunk_rows < 1:
        raise ValueError("chunk_rows must be >= 1")

    q = np.asarray(query_vec, dtype=np.float32).ravel()
    n = dataset_mm.shape[0]
    top_sim = np.empty(0, dtype=np.float64)
    top_idx = np.empty(0, dtype=np.int64)
    for start in range(0, n, chunk_rows):
        end = min(start + chunk_rows, n)
        block = np.asarray(dataset_mm[start:end], dtype=np.float32)
        sim = block @ q
        merged_sim = np.concatenate([top_sim, sim.astype(np.float64)])
        merged_idx = np.concatenate(
            [top_idx, np.arange(start, end, dtype=np.int64)]
        )
        # Stable sort: previously kept candidates (lower row ids) win ties,
        # which makes the result deterministic.
        pick = np.argsort(-merged_sim, kind="stable")[:k]
        top_sim = merged_sim[pick]
        top_idx = merged_idx[pick]

    missing = k - top_idx.size
    if missing > 0:
        # Fewer rows than requested neighbors: pad with an invalid id.
        top_idx = np.concatenate(
            [top_idx, np.full(missing, -1, dtype=np.int64)]
        )
    return top_idx


def main():
    """Compare one row of a groundtruth file against brute-force IP top-k.

    Command line: ``base.fbin queries.fbin groundtruth.ibin [k] [query_row]``
    with ``k`` defaulting to 10 and ``query_row`` to 0. Prints the overlap
    between the exact top-k ids and the ids stored in the file, both id lists,
    a side-by-side table of (id, dot product) for up to 10 entries, and the
    min/max dot product of each list.

    Exit codes: 2 for a usage error (too few or non-integer arguments), 1 for
    invalid inputs (dimension mismatch, non-positive effective k, query row
    out of range).
    """
    usage = (
        "usage: python verify_ip_groundtruth.py "
        "base.fbin queries.fbin groundtruth.neighbors.ibin [k] [query_row]"
    )
    if len(sys.argv) < 4:
        print(usage, file=sys.stderr)
        sys.exit(2)
    base_p, q_p, gt_p = sys.argv[1:4]
    try:
        k = int(sys.argv[4]) if len(sys.argv) > 4 else 10
        qi = int(sys.argv[5]) if len(sys.argv) > 5 else 0
    except ValueError:
        # Non-integer k / query_row is a usage error, not a crash.
        print(usage, file=sys.stderr)
        sys.exit(2)

    base = mmap_fbin(base_p)
    queries = mmap_fbin(q_p)
    gt = mmap_ibin(gt_p)
    if base.shape[1] != queries.shape[1]:
        print(
            f"dim mismatch base {base.shape[1]} vs queries {queries.shape[1]}",
            file=sys.stderr,
        )
        sys.exit(1)
    # Never compare more columns than the groundtruth file stores.
    kk = min(k, gt.shape[1])
    if kk < 1:
        print(
            "k must be >= 1 and groundtruth must have at least one column",
            file=sys.stderr,
        )
        sys.exit(1)
    # Reject negative rows too: they would silently index from the end.
    if qi < 0 or qi >= queries.shape[0] or qi >= gt.shape[0]:
        print("query_row out of range", file=sys.stderr)
        sys.exit(1)

    print(
        f"shapes: base={base.shape} queries={queries.shape} gt={gt.shape} "
        f"(gt rows should match queries rows; gt cols >= k)"
    )

    truth = brute_ip_topk_chunked(queries[qi], base, kk)
    got = np.asarray(gt[qi, :kk], dtype=np.int64)
    n_base = base.shape[0]
    bad_got = np.logical_or(got < 0, got >= n_base)
    if bad_got.any():
        print(
            f"warning: {bad_got.sum()} neighbor id(s) out of range [0, {n_base}) "
            f"in file row {qi} — possible wrong dtype/endian or corrupt header"
        )

    inter = len(set(truth.tolist()) & set(got.tolist()))
    print(f"query_row={qi} k={kk} overlap true∩file = {inter}/{kk}")
    print(f" brute IP top-{kk} ids: {truth.tolist()}")
    print(f" file row ids: {got.tolist()}")

    qv = np.asarray(queries[qi], dtype=np.float32).ravel()
    truth_dots = ip_scores_for_indices(base, qv, truth)
    got_dots = ip_scores_for_indices(base, qv, got)
    # Sort by dot descending so you see "best first" (same order as true IP ranking)
    t_order = np.argsort(-truth_dots)
    g_order = np.argsort(-got_dots)
    show = min(10, kk)
    print()
    print(
        f"Inner product (dot) scores for query_row={qi} "
        f"(showing first {show} of {kk}; sorted by dot desc within each list):"
    )
    print(f" {'rank':>4} {'brute idx':>12} {'brute dot':>14} | {'file idx':>12} {'file dot':>14}")
    for r in range(show):
        ti = t_order[r]
        gi = g_order[r]
        print(
            f" {r + 1:4d} {int(truth[ti]):12d} {truth_dots[ti]:14.6g} | "
            f"{int(got[gi]):12d} {got_dots[gi]:14.6g}"
        )
    if kk > show:
        print(f" ... ({kk - show} more rows per column not shown)")
    print()
    print(
        f" brute: min_dot={np.nanmin(truth_dots):.6g} max_dot={np.nanmax(truth_dots):.6g} "
        f"(true IP top-{kk} should have the k largest dots in the dataset)"
    )
    print(
        f" file: min_dot={np.nanmin(got_dots):.6g} max_dot={np.nanmax(got_dots):.6g} "
        f"(if file is IP GT, these should match brute up to ties)"
    )

    if inter < kk:
        print(
            "If not k/k, GT file is not raw IP top-k for this base/queries "
            "(or rows misaligned)."
        )


if __name__ == "__main__":
    main()
Loading