NVIDIA · HowardHuang1 · Mar 7, 2026 · Mar 9, 2026 · Mar 10, 2026 · Mar 11, 2026
@@ -1324,30 +1324,61 @@ auto build(raft::resources const& handle,
     auto cluster_centers = cluster_centers_buf.data();
 
     // Train balanced hierarchical kmeans clustering
-    auto trainset_const_view = raft::make_const_mdspan(trainset.view());
-    auto centers_view        = raft::make_device_matrix_view<float, internal_extents_t>(
+    auto centers_view = raft::make_device_matrix_view<float, internal_extents_t>(
       cluster_centers, impl->n_lists(), impl->dim());
     cuvs::cluster::kmeans::balanced_params kmeans_params;
     kmeans_params.n_iters = params.kmeans_n_iters;
     kmeans_params.metric  = static_cast<cuvs::distance::DistanceType>((int)impl->metric());
 
-    if (impl->metric() == distance::DistanceType::CosineExpanded) {
-      raft::linalg::row_normalize<raft::linalg::L2Norm>(
-        handle, trainset_const_view, trainset.view());
-    }
-    cuvs::cluster::kmeans::fit(handle, kmeans_params, trainset_const_view, centers_view);
-
-    // Trainset labels are needed for training PQ codebooks
     rmm::device_uvector<uint32_t> labels(n_rows_train, stream, big_memory_resource);
+    auto labels_view =
+      raft::make_device_vector_view<uint32_t, internal_extents_t>(labels.data(), n_rows_train);
     auto centers_const_view = raft::make_device_matrix_view<const float, internal_extents_t>(
       cluster_centers, impl->n_lists(), impl->dim());
-    if (impl->metric() == distance::DistanceType::CosineExpanded) {
+
+    if (impl->metric() == distance::DistanceType::InnerProduct) {
+      // Normalization only for k-means: use a copy so trainset stays in original space; metric
+      // remains inner product for the rest of the pipeline.
+      auto trainset_kmeans = raft::make_device_mdarray<float>(
+        handle, device_memory, raft::make_extents<int64_t>(n_rows_train, dim));
+      raft::copy(handle, trainset_kmeans.view(), trainset.view());
+      auto trainset_kmeans_view = raft::make_device_matrix_view<float, internal_extents_t>(
+        trainset_kmeans.data_handle(), n_rows_train, dim);
+      raft::linalg::row_normalize<raft::linalg::L2Norm>(
+        handle, raft::make_const_mdspan(trainset_kmeans_view), trainset_kmeans_view);
+      auto trainset_kmeans_const_view = raft::make_const_mdspan(trainset_kmeans.view());
+      cuvs::cluster::kmeans::fit(handle, kmeans_params, trainset_kmeans_const_view, centers_view);
       raft::linalg::row_normalize<raft::linalg::L2Norm>(handle, centers_const_view, centers_view);
+      cuvs::cluster::kmeans::predict(
+        handle, kmeans_params, trainset_kmeans_const_view, centers_const_view, labels_view);
+      // Recompute centers in original space (mean of unnormalized trainset per cluster), overwrites centers_view
+      rmm::device_uvector<uint32_t> cluster_sizes(impl->n_lists(), stream, device_memory);
+      cuvs::cluster::kmeans::detail::calc_centers_and_sizes<float, float, internal_extents_t, uint32_t, uint32_t>(
+        handle,
+        cluster_centers,
+        cluster_sizes.data(),
+        static_cast<internal_extents_t>(impl->n_lists()),
+        static_cast<internal_extents_t>(dim),
+        trainset.data_handle(),
+        static_cast<internal_extents_t>(n_rows_train),
+        labels.data(),
+        true,
+        raft::identity_op{},
+        device_memory);
+    } else if (impl->metric() == distance::DistanceType::CosineExpanded) {
+      auto trainset_const_view = raft::make_const_mdspan(trainset.view());
+      raft::linalg::row_normalize<raft::linalg::L2Norm>(
+        handle, trainset_const_view, trainset.view());
+      cuvs::cluster::kmeans::fit(handle, kmeans_params, trainset_const_view, centers_view);
+      raft::linalg::row_normalize<raft::linalg::L2Norm>(handle, centers_const_view, centers_view);
+      cuvs::cluster::kmeans::predict(
+        handle, kmeans_params, trainset_const_view, centers_const_view, labels_view);
+    } else {
+      auto trainset_const_view = raft::make_const_mdspan(trainset.view());
+      cuvs::cluster::kmeans::fit(handle, kmeans_params, trainset_const_view, centers_view);
+      cuvs::cluster::kmeans::predict(
+        handle, kmeans_params, trainset_const_view, centers_const_view, labels_view);
     }
-    auto labels_view =
-      raft::make_device_vector_view<uint32_t, internal_extents_t>(labels.data(), n_rows_train);
-    cuvs::cluster::kmeans::predict(
-      handle, kmeans_params, trainset_const_view, centers_const_view, labels_view);
 
     // Make rotation matrix
     helpers::make_rotation_matrix(handle, impl->rotation_matrix(), params.force_random_rotation);

@@ -201,7 +201,13 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"):
         else:
             distances = xp.concatenate([distances, D], axis=1)
             indices = xp.concatenate([indices, Ind], axis=1)
-            idx = xp.argsort(distances, axis=1)[:, :k]
+            # Euclidean: smaller distance is better → sort ascending.
+            # Inner product: D holds similarities, larger is better → sort
+            # descending (equivalent to ascending on -D if library used -sim).
+            if metric == "inner_product":
+                idx = xp.argsort(-distances, axis=1)[:, :k]
+            else:
+                idx = xp.argsort(distances, axis=1)[:, :k]
             distances = xp.take_along_axis(distances, idx, axis=1)
             indices = xp.take_along_axis(indices, idx, axis=1)
 

@@ -345,7 +345,7 @@ def get_build_path(executable: str) -> Optional[str]:
     build_path = os.getenv("CUVS_HOME")
     if build_path:
         build_path = os.path.join(
-            build_path, "cpp", "build", "release", executable
+            build_path, "cpp", "build", "bench", "ann", executable
         )
         if os.path.exists(build_path):
             print(f"-- Using cuVS bench from repository in {build_path}.")

@@ -11,6 +11,24 @@
 from typing import Dict, List, Optional, Tuple
 
 
+def _subprocess_env(ann_executable_path: str) -> Dict[str, str]:
+    """Build the environment for the C++ benchmark subprocess.
+
+    Starts from a copy of ``os.environ``. When ``CUVS_HOME`` is set and
+    ``<CUVS_HOME>/cpp/build`` is a directory, that directory is prepended to
+    ``LD_LIBRARY_PATH``; if it also contains ``libcuvs.so``, that file is
+    prepended to ``LD_PRELOAD`` so the local build is the one that runs.
+    ``CUVS_IVF_PQ_NORMALIZATION_LOG`` is set to a log file inside the build
+    directory and its location is printed.
+
+    Parameters
+    ----------
+    ann_executable_path : str
+        Path of the benchmark executable. Not consulted here; kept so the
+        existing call sites keep working.
+
+    Returns
+    -------
+    Dict[str, str]
+        Environment mapping to pass as ``env=`` to ``subprocess.run``.
+    """
+    env = os.environ.copy()
+    repo = os.getenv("CUVS_HOME")
+    if not repo:
+        return env
+    build_dir = os.path.join(repo, "cpp", "build")
+    if not os.path.isdir(build_dir):
+        return env
+
+    def _prepend(name: str, value: str) -> None:
+        # Only append the previous value when it is non-empty: a trailing
+        # separator leaves an empty entry in LD_LIBRARY_PATH, which the
+        # dynamic loader treats as the current working directory.
+        previous = env.get(name)
+        env[name] = (value + os.pathsep + previous) if previous else value
+
+    lib = os.path.join(build_dir, "libcuvs.so")
+    if os.path.isfile(lib):
+        _prepend("LD_PRELOAD", lib)
+    _prepend("LD_LIBRARY_PATH", build_dir)
+
+    # Point IVF-PQ normalization logging at a known path.
+    # NOTE(review): presumably read by the C++ side when set; the reader is
+    # not visible from this file - confirm.
+    log_path = os.path.join(build_dir, "cuvs_ivf_pq_normalization.log")
+    env["CUVS_IVF_PQ_NORMALIZATION_LOG"] = log_path
+    print(f"[cuvs_bench] IVF-PQ normalization log (if any) -> {log_path}", flush=True)
+    return env
+
+
 def cuvs_bench_cpp(
     conf_file: Dict,
     conf_filename: str,
@@ -123,7 +141,7 @@ def cuvs_bench_cpp(
                 )
             else:
                 try:
-                    subprocess.run(cmd, check=True)
+                    subprocess.run(cmd, check=True, env=_subprocess_env(ann_executable_path))
                     merge_build_files(
                         build_folder, build_file, temp_build_file
                     )
@@ -163,7 +181,7 @@ def cuvs_bench_cpp(
                 )
             else:
                 try:
-                    subprocess.run(cmd, check=True)
+                    subprocess.run(cmd, check=True, env=_subprocess_env(ann_executable_path))
                 except Exception as e:
                     print(f"Error occurred running benchmark: {e}")
                 finally:

@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+#
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Launcher to run cuvs_bench from THIS repository's Python code (ignoring
+# any cuvs_bench installed in the environment). Use this when developing
+# cuvs_bench in a fork or branch and you want to ensure the local code
+# is used while still using conda-installed C++ binaries.
+#
+# Usage (from the repo root that contains python/cuvs_bench, e.g. cuvs_vector_norm):
+#   python python/cuvs_bench/run_benchmark_local.py --dataset ... --algorithms ...
+# To confirm which package is used: CUVS_BENCH_DEBUG_LOAD=1 python python/cuvs_bench/run_benchmark_local.py ...
+# To use local libcuvs.so: set CUVS_HOME to repo root (runner sets LD_PRELOAD and LD_LIBRARY_PATH for the benchmark subprocess).
+#
+# One-liner (must fix sys.argv so Click sees your flags; run from repo root):
+#   python -c "
+#   import sys, runpy, os
+#   sys.path.insert(0, os.path.join(os.getcwd(), 'python'))
+#   if '--' in sys.argv:
+#       sys.argv = ['cuvs_bench.run'] + sys.argv[sys.argv.index('--')+1:]
+#   runpy.run_module('cuvs_bench.run', run_name='__main__')
+#   " -- --dataset deep-image-96-inner -k 10 --batch-size 10 --algorithms cuvs_ivf_pq ...
+
+from pathlib import Path
+import os
+import runpy
+import sys
+
+# Per the usage note above this script is python/cuvs_bench/run_benchmark_local.py,
+# so the parent of its directory is the repo's python/ directory and the
+# directory above that is the repository root.
+_SCRIPT_DIR = Path(__file__).resolve().parent
+_REPO_PYTHON = _SCRIPT_DIR.parent
+_REPO_ROOT = _REPO_PYTHON.parent
+_REPO_PYTHON_STR = str(_REPO_PYTHON)
+
+# Remove PYTHONPATH from this process's environment, if it is set.
+os.environ.pop("PYTHONPATH", None)
+
+# Make the repo's python/ directory the first sys.path entry so that
+# "import cuvs_bench" resolves to the local code; an existing later
+# occurrence is moved to the front rather than duplicated.
+if not sys.path or sys.path[0] != _REPO_PYTHON_STR:
+    if _REPO_PYTHON_STR in sys.path:
+        sys.path.remove(_REPO_PYTHON_STR)
+    sys.path.insert(0, _REPO_PYTHON_STR)
+
+if os.environ.get("CUVS_BENCH_DEBUG_LOAD"):
+    print(f"[cuvs_bench launcher] using python path: {_REPO_PYTHON_STR}", file=sys.stderr)
+
+# Execute cuvs_bench.run as if it had been started as the main program.
+runpy.run_module("cuvs_bench.run", run_name="__main__")
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+#
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Launcher to run cuvs_bench.plot from THIS repository's Python code (ignoring
+# any cuvs_bench installed in the environment). Use this when developing
+# cuvs_bench in a fork or branch so the local plot code is used.
+#
+# Usage (from the repo root that contains python/cuvs_bench, e.g. cuvs_vector_norm):
+#   python python/cuvs_bench/run_plot_local.py --search --dataset deep-image-96-inner --dataset-path ./datasets -k 10 -bs 10000 --output-filepath .
+# To confirm which package is used: CUVS_BENCH_DEBUG_LOAD=1 python python/cuvs_bench/run_plot_local.py ...
+
+from pathlib import Path
+import os
+import runpy
+import sys
+
+# Per the usage note above this script is python/cuvs_bench/run_plot_local.py,
+# so the parent of its directory is the repo's python/ directory and the
+# directory above that is the repository root.
+_SCRIPT_DIR = Path(__file__).resolve().parent
+_REPO_PYTHON = _SCRIPT_DIR.parent
+_REPO_ROOT = _REPO_PYTHON.parent
+_REPO_PYTHON_STR = str(_REPO_PYTHON)
+
+# Remove PYTHONPATH from this process's environment, if it is set.
+os.environ.pop("PYTHONPATH", None)
+
+# Make the repo's python/ directory the first sys.path entry so that
+# "import cuvs_bench" resolves to the local code; an existing later
+# occurrence is moved to the front rather than duplicated.
+if not sys.path or sys.path[0] != _REPO_PYTHON_STR:
+    if _REPO_PYTHON_STR in sys.path:
+        sys.path.remove(_REPO_PYTHON_STR)
+    sys.path.insert(0, _REPO_PYTHON_STR)
+
+if os.environ.get("CUVS_BENCH_DEBUG_LOAD"):
+    print(f"[cuvs_bench launcher] using python path: {_REPO_PYTHON_STR}", file=sys.stderr)
+
+# Execute cuvs_bench.plot as if it had been started as the main program.
+runpy.run_module("cuvs_bench.plot", run_name="__main__")
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+#
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Standalone check: does groundtruth.neighbors.ibin match exact inner-product
+# top-k on the full base? (No dependency on installed cuvs_bench version.)
+#
+# Usage:
+#   python verify_ip_groundtruth.py base.fbin queries.fbin groundtruth.ibin [k] [query_row]
+# Prints overlap, ids, and a side-by-side table of (idx, dot) for brute vs file (first 10 of k).
+#
+import struct
+import sys
+
+import numpy as np
+
+
+def _read_shape(path):
+    """Return the two-integer header stored at the start of ``path``.
+
+    The first 8 bytes are decoded as two little-endian unsigned 32-bit
+    integers; callers in this file use them as ``(rows, cols)``.
+    """
+    with open(path, "rb") as fh:
+        header = fh.read(8)
+    return struct.unpack("<II", header)
+
+
+def mmap_fbin(path):
+    """Memory-map ``path`` read-only as a float32 matrix.
+
+    The matrix shape comes from the 8-byte header; the data is mapped
+    starting right after it.
+    """
+    n_rows, n_cols = _read_shape(path)
+    shape = (n_rows, n_cols)
+    return np.memmap(path, mode="r", dtype=np.float32, offset=8, shape=shape)
+
+
+def mmap_ibin(path):
+    """Memory-map ``path`` read-only as an int32 matrix.
+
+    The matrix shape comes from the 8-byte header; the data is mapped
+    starting right after it.
+    """
+    n_rows, n_cols = _read_shape(path)
+    shape = (n_rows, n_cols)
+    return np.memmap(path, mode="r", dtype=np.int32, offset=8, shape=shape)
+
+
+def ip_scores_for_indices(dataset_mm, query_vec, indices):
+    """Dot product of the query with each listed base row (for diagnostics).
+
+    Returns a float64 array with one entry per element of ``indices``.
+    Entries whose index falls outside ``[0, dataset_mm.shape[0])`` are NaN.
+    """
+    query = np.asarray(query_vec, dtype=np.float32).ravel()
+    n_rows = dataset_mm.shape[0]
+    # Start from all-NaN so only in-range ids need to be written.
+    scores = np.full(len(indices), np.nan, dtype=np.float64)
+    for pos, raw_id in enumerate(indices):
+        row_id = int(raw_id)
+        if 0 <= row_id < n_rows:
+            vec = np.asarray(dataset_mm[row_id], dtype=np.float32)
+            scores[pos] = float(np.dot(vec, query))
+    return scores
+
+
+def brute_ip_topk_chunked(query_vec, dataset_mm, k, chunk_rows=65536):
+    """Exact inner-product top-k for one query, scanning the base in chunks.
+
+    Parameters
+    ----------
+    query_vec : array-like
+        Query vector; flattened and cast to float32.
+    dataset_mm : array-like with ``.shape`` and row slicing
+        Base vectors, one per row (e.g. a ``np.memmap``).
+    k : int
+        Number of neighbors to return.
+    chunk_rows : int, optional
+        Number of base rows scored per iteration.
+
+    Returns
+    -------
+    np.ndarray
+        int64 array of length ``k`` with row ids ordered by decreasing dot
+        product. If the base has fewer than ``k`` rows, the unfilled tail
+        holds ``-1`` instead of a valid-looking id.
+    """
+    q = np.asarray(query_vec, dtype=np.float32).ravel()
+    n = dataset_mm.shape[0]
+    top_sim = np.full(k, -np.inf, dtype=np.float64)
+    # -1 marks "no neighbor yet"; a 0 placeholder would be indistinguishable
+    # from a real hit on row 0 when k exceeds the number of base rows.
+    top_idx = np.full(k, -1, dtype=np.int64)
+    for start in range(0, n, chunk_rows):
+        end = min(start + chunk_rows, n)
+        block = np.asarray(dataset_mm[start:end], dtype=np.float32)
+        sim = (block @ q).astype(np.float64)
+        merged_sim = np.concatenate([top_sim, sim])
+        merged_idx = np.concatenate(
+            [top_idx, np.arange(start, end, dtype=np.int64)]
+        )
+        # Stable sort keeps tie order deterministic: among equal scores the
+        # entry that appears earlier in the merged list is kept first.
+        pick = np.argsort(-merged_sim, kind="stable")[:k]
+        top_sim = merged_sim[pick]
+        top_idx = merged_idx[pick]
+    return top_idx
+
+
+def main():
+    """Compare one groundtruth row against brute-force inner-product top-k.
+
+    Command line:
+    ``base.fbin queries.fbin groundtruth.neighbors.ibin [k] [query_row]``
+    (``k`` defaults to 10, ``query_row`` to 0). Prints the overlap between
+    the brute-force ids and the ids stored in the file, followed by a table
+    of dot products for both lists. Exits with status 2 on bad usage and
+    status 1 on invalid inputs.
+    """
+    if len(sys.argv) < 4:
+        print(
+            "usage: python verify_ip_groundtruth.py "
+            "base.fbin queries.fbin groundtruth.neighbors.ibin [k] [query_row]",
+            file=sys.stderr,
+        )
+        sys.exit(2)
+    base_p, q_p, gt_p = sys.argv[1:4]
+    k = int(sys.argv[4]) if len(sys.argv) > 4 else 10
+    qi = int(sys.argv[5]) if len(sys.argv) > 5 else 0
+
+    base = mmap_fbin(base_p)
+    queries = mmap_fbin(q_p)
+    gt = mmap_ibin(gt_p)
+    if base.shape[1] != queries.shape[1]:
+        print(
+            f"dim mismatch base {base.shape[1]} vs queries {queries.shape[1]}",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+    # Never ask for more neighbors than the groundtruth file stores per row.
+    kk = min(k, gt.shape[1])
+    if kk < 1:
+        # An empty comparison would otherwise fail later in np.nanmin.
+        print(
+            "k must be >= 1 and the groundtruth file must have at least one column",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+    # Reject negative rows explicitly: numpy would silently wrap them around.
+    if qi < 0 or qi >= queries.shape[0] or qi >= gt.shape[0]:
+        print("query_row out of range", file=sys.stderr)
+        sys.exit(1)
+
+    print(
+        f"shapes: base={base.shape} queries={queries.shape} gt={gt.shape} "
+        f"(gt rows should match queries rows; gt cols >= k)"
+    )
+
+    truth = brute_ip_topk_chunked(queries[qi], base, kk)
+    got = np.asarray(gt[qi, :kk], dtype=np.int64)
+    n_base = base.shape[0]
+    bad_got = np.logical_or(got < 0, got >= n_base)
+    if bad_got.any():
+        print(
+            f"warning: {bad_got.sum()} neighbor id(s) out of range [0, {n_base}) "
+            f"in file row {qi} — possible wrong dtype/endian or corrupt header"
+        )
+
+    # Order-insensitive overlap between the exact ids and the stored ids.
+    inter = len(set(truth.tolist()) & set(got.tolist()))
+    print(f"query_row={qi} k={kk} overlap true∩file = {inter}/{kk}")
+    print(f"  brute IP top-{kk} ids: {truth.tolist()}")
+    print(f"  file row ids:        {got.tolist()}")
+
+    qv = np.asarray(queries[qi], dtype=np.float32).ravel()
+    truth_dots = ip_scores_for_indices(base, qv, truth)
+    got_dots = ip_scores_for_indices(base, qv, got)
+    # Sort by dot descending so you see "best first" (same order as true IP ranking)
+    t_order = np.argsort(-truth_dots)
+    g_order = np.argsort(-got_dots)
+    show = min(10, kk)
+    print()
+    print(
+        f"Inner product (dot) scores for query_row={qi} "
+        f"(showing first {show} of {kk}; sorted by dot desc within each list):"
+    )
+    print(f"  {'rank':>4}  {'brute idx':>12}  {'brute dot':>14}  |  {'file idx':>12}  {'file dot':>14}")
+    for r in range(show):
+        ti = t_order[r]
+        gi = g_order[r]
+        print(
+            f"  {r + 1:4d}  {int(truth[ti]):12d}  {truth_dots[ti]:14.6g}  |  "
+            f"{int(got[gi]):12d}  {got_dots[gi]:14.6g}"
+        )
+    if kk > show:
+        print(f"  ... ({kk - show} more rows per column not shown)")
+    print()
+    print(
+        f"  brute: min_dot={np.nanmin(truth_dots):.6g}  max_dot={np.nanmax(truth_dots):.6g}  "
+        f"(true IP top-{kk} should have the k largest dots in the dataset)"
+    )
+    print(
+        f"  file:  min_dot={np.nanmin(got_dots):.6g}  max_dot={np.nanmax(got_dots):.6g}  "
+        f"(if file is IP GT, these should match brute up to ties)"
+    )
+
+    if inter < kk:
+        print(
+            "If not k/k, GT file is not raw IP top-k for this base/queries "
+            "(or rows misaligned)."
+        )
+
+
+if __name__ == "__main__":
+    main()