From e3c5794346bc369e1a5f5f721ea970c6983759c4 Mon Sep 17 00:00:00 2001
From: blaise-muhirwa
Date: Thu, 23 Nov 2023 15:04:36 -0800
Subject: [PATCH] add big bench experiment runner

---
 experiments/Makefile                     |  23 +++
 experiments/big-ann/run_bigann.py        | 222 -----------------------
 experiments/evaluate_faiss.py            |   2 +-
 experiments/pyproject.toml               |   2 +-
 experiments/run-big-bench.py             | 207 +++++++++++++++++++++
 experiments/utils.py                     |  44 -----
 flatnav/Index.h                          |  16 +-
 flatnav/distances/InnerProductDistance.h |   6 +-
 flatnav/distances/SquaredL2Distance.h    |   6 +-
 flatnav_python/install_flatnav.sh        |  39 +++-
 flatnav_python/python_bindings.cpp       |  14 +-
 flatnav_python/test_index.py             |   3 +-
 quantization/ProductQuantization.h       |  20 +-
 13 files changed, 308 insertions(+), 296 deletions(-)
 create mode 100644 experiments/Makefile
 delete mode 100644 experiments/big-ann/run_bigann.py
 create mode 100644 experiments/run-big-bench.py
 delete mode 100644 experiments/utils.py

diff --git a/experiments/Makefile b/experiments/Makefile
new file mode 100644
index 0000000..45fb57b
--- /dev/null
+++ b/experiments/Makefile
@@ -0,0 +1,23 @@
+
+
+# Run the yandex-deep benchmark and log everything to a logs.txt file
+yandex-deep-bench: install-lib
+	poetry run python run-big-bench.py \
+		--dataset path/to/yandex-deep.350.fbin \
+		--queries path/to/yandex-deep-queries.fbin \
+		--gtruth path/to/yandex-ground-truth \
+		--metric l2 \
+		> logs.txt 2>&1
+
+
+# Install all dependencies including flatnav
+install-lib: generate-wheel
+	poetry add ../flatnav_python/dist/*.whl
+	poetry install --no-root
+
+# This will generate the wheel for flatnav and put it in
+# ../flatnav_python/dist
+generate-wheel:
+	pwd
+	cd ../flatnav_python && ./install_flatnav.sh
+
\ No newline at end of file
diff --git a/experiments/big-ann/run_bigann.py b/experiments/big-ann/run_bigann.py
deleted file mode 100644
index 7db5c61..0000000
--- a/experiments/big-ann/run_bigann.py
+++ /dev/null
@@ -1,222 +0,0 @@
-import json
-import numpy as np
-from typing import Optional, Tuple, List
-import numpy as np
-from dvclive import Live
-import faiss
-import os
-import logging
-import platform, socket, psutil
-import argparse
-
-
-ENVIRONMENT_INFO = {
-    "load_before_experiment": os.getloadavg()[2],
-    "platform": platform.platform(),
-    "platform_version": platform.version(),
-    "platform_release": platform.release(),
-    "architecture": platform.machine(),
-    "processor": platform.processor(),
-    "hostname": socket.gethostname(),
-    "ram_gb": round(psutil.virtual_memory().total / (1024.0**3)),
-    "num_cores": psutil.cpu_count(logical=True),
-}
-
-
-DATASETS = {
-    "mnist-784-euclidean": (
-        "mnist-784-euclidean.train.npy",
-        "mnist-784-euclidean.test.npy",
-        "mnist-784-euclidean.gtruth.npy",
-    ),
-    "glove-25-angular": (
-        "glove-25-angular.train.npy",
-        "glove-25-angular.test.npy",
-        "glove-25-angular.gtruth.npy",
-    ),
-    "glove-100-angular": (
-        "glove-100-angular.train.npy",
-        "glove-100-angular.test.npy",
-        "glove-100-angular.gtruth.npy",
-    ),
-    "glove-200-angular": (
-        "glove-200-angular.train.npy",
-        "glove-200-angular.test.npy",
-        "glove-200-angular.gtruth.npy",
-    ),
-    "sift-128-euclidean": (
-        "sift-128-euclidean.train.npy",
-        "sift-128-euclidean.test.npy",
-        "sift-128-euclidean.gtruth.npy",
-    ),
-}
-
-
-def load_benchmark_dataset(dataset_name: Optional[None]) -> Tuple[np.ndarray]:
-    """
-    This assumes that we have a /data/ already present.
-    This data directory can be generated by running
-        $ /bin/download_anns_datasets.sh
-
-    This directory will be expected to have the following files:
-        - /.train.npy
-        - /.test.npy
-        - /.gtruth.npy
-    """
-    dataset_name = dataset_name.lower()
-    if not dataset_name in DATASETS.keys():
-        raise AssertionError(
-            f"{dataset_name=} not in the list of supported datasets."
-            "Consider adding it to the list of checking if you misspelled the name."
-        )
-
-    train_file, queries_file, gtruth_file = DATASETS[dataset_name]
-    base_dir = os.path.join(os.getcwd(), "..", "data", dataset_name)
-
-    return (
-        np.load(os.path.join(base_dir, train_file)),
-        np.load(os.path.join(base_dir, queries_file)),
-        np.load(os.path.join(base_dir, gtruth_file)),
-    )
-
-
-def compute_recall(index, queries: np.ndarray, ground_truth: np.ndarray, k=100):
-    """
-    Compute recall for given queries, ground truth, and a Faiss index.
-
-    Args:
-        - index: The Faiss index to search.
-        - queries: The query vectors.
-        - ground_truth: The ground truth indices for each query.
-        - k: Number of neighbors to search.
-
-    Returns:
-        Mean recall over all queries.
-    """
-    _, top_k_indices = index.search(queries, k)
-
-    # Convert each ground truth list to a set for faster lookup
-    ground_truth_sets = [set(gt) for gt in ground_truth]
-
-    mean_recall = 0
-
-    for idx, k_neighbors in enumerate(top_k_indices):
-        query_recall = sum(
-            1 for neighbor in k_neighbors if neighbor in ground_truth_sets[idx]
-        )
-        mean_recall += query_recall / k
-
-    recall = mean_recall / len(queries)
-    return recall
-
-
-def train_hnsw_index(
-    data,
-    pq_m,
-    num_node_links,
-    ef_construction: Optional[int] = 128,
-    ef_search: Optional[int] = 128,
-    serialize=False,
-):
-    """
-    Train a HNSW index topped with product quantization
-    Args:
-        - pq_m: Number of subquantizers for PQ. This should exactly divide the
-            dataset dimensions.
-        - num_node_links: Maximum number of links to keep for each node in the graph
-        - serialize: Serialize so we can get a sense of how large the index binary is.
-
-    Returns:
-        Index
-
-    Helpful link on correct usage: https://github.com/facebookresearch/faiss/issues/1621
-    """
-    # configure the index
-    dim = data.shape[1]  # data dimension
-
-    # Create the HNSW index
-    index = faiss.IndexHNSWPQ(dim, pq_m, num_node_links)
-    index.hnsw.efConstruction = ef_construction
-    index.hnsw.efSearchh = ef_search
-
-    logging.info("Training index...")
-    index.train(data)
-
-    # Add vectors to the index
-    index.add(data)
-
-    if serialize:
-        # Serialize the index to disk
-        buffer = faiss.serialize_index(index)
-        index_size = len(buffer)
-
-        logging.info(f"Index size: {index_size} bytes")
-
-    return index
-
-def main(
-    train_dataset: np.ndarray,
-    queries: np.ndarray,
-    gtruth: np.ndarray,
-    ef_cons_params: List[int],
-    ef_search_params: List[int],
-    num_node_links: List[int],
-    pq_m: Optional[int] = 8,
-):
-    with Live() as live:
-        for param_key, param_val in ENVIRONMENT_INFO.items():
-            live.log_param(param_key, param_val)
-
-        for node_links in num_node_links:
-            for ef_cons in ef_cons_params:
-                for ef_search in ef_search_params:
-                    live.log_param("node_links", node_links)
-                    live.log_param("ef_construction", ef_cons)
-                    live.log_param("ef_search", ef_search)
-
-                    index = train_hnsw_index(
-                        data=train_dataset,
-                        pq_m=pq_m,
-                        num_node_links=node_links,
-                        ef_construction=ef_cons,
-                        ef_search=ef_search,
-                    )
-                    recall = compute_recall(
-                        index=index, queries=queries, ground_truth=gtruth
-                    )
-                    live.log_metric("Recall@100", recall)
-                    logging.info(
-                        f"Recall@100: {recall}, node_links={node_links}, ef_cons={ef_cons}, ef_search={ef_search}"
-                    )
-
-                    live.next_step()
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.DEBUG)
-    parser = argparse.ArgumentParser(description="")
-    parser.add_argument(
-        "--datasets", required=True, nargs="+", help="ANNS benchmark dataset to run on."
-    )
-    parser.add_argument(
-        "--log_metrics", required=False, default=False, help="Log metrics to MLFlow."
- ) - - args = parser.parse_args() - - ef_constructions = [32, 64, 128] - ef_searches = [32, 64, 128] - num_node_links = [8, 16, 32, 64] - - for dataset in args.datasets: - train_data, queries, ground_truth = load_benchmark_dataset(dataset_name=dataset) - - main( - train_dataset=train_data, - queries=queries, - gtruth=ground_truth, - ef_cons_params=ef_constructions, - ef_search_params=ef_searches, - num_node_links=num_node_links, - ) - diff --git a/experiments/evaluate_faiss.py b/experiments/evaluate_faiss.py index 7db5c61..5508720 100644 --- a/experiments/evaluate_faiss.py +++ b/experiments/evaluate_faiss.py @@ -154,6 +154,7 @@ def train_hnsw_index( return index + def main( train_dataset: np.ndarray, queries: np.ndarray, @@ -219,4 +220,3 @@ def main( ef_search_params=ef_searches, num_node_links=num_node_links, ) - diff --git a/experiments/pyproject.toml b/experiments/pyproject.toml index 8b3c60e..da29bed 100644 --- a/experiments/pyproject.toml +++ b/experiments/pyproject.toml @@ -12,7 +12,7 @@ dvc = "^3.28.0" black = "^23.11.0" numpy = "^1.26.1" faiss-cpu = "^1.7.4" -flatnav = {path = "../flatnav_python/dist/flatnav-0.0.1-cp310-cp310-linux_x86_64.whl"} +flatnav = {path = "../flatnav_python/dist/flatnav-0.0.1-cp311-cp311-macosx_13_0_arm64.whl"} [build-system] diff --git a/experiments/run-big-bench.py b/experiments/run-big-bench.py new file mode 100644 index 0000000..dddb5e4 --- /dev/null +++ b/experiments/run-big-bench.py @@ -0,0 +1,207 @@ +import time +from typing import Union +import json +import numpy as np +from typing import Optional, Tuple, List +import numpy as np +from dvclive import Live +import os +import logging +import platform, socket, psutil +import argparse +import flatnav + + +ENVIRONMENT_INFO = { + "load_before_experiment": os.getloadavg()[2], + "platform": platform.platform(), + "platform_version": platform.version(), + "platform_release": platform.release(), + "architecture": platform.machine(), + "processor": platform.processor(), + "hostname": socket.gethostname(), + "ram_gb": round(psutil.virtual_memory().total / (1024.0**3)), + "num_cores": psutil.cpu_count(logical=True), +} + + +def load_benchmark_dataset( + train_dataset_path: str, queries_path: str, gtruth_path: str +) -> Tuple[np.ndarray]: + def verify_paths_exist(paths: List[str]) -> None: + for path in paths: + if not os.path.exists(path): + raise ValueError(f"Invalid file path: {path}") + + verify_paths_exist([train_dataset_path, queries_path, gtruth_path]) + + train_dataset = np.fromfile( + train_dataset_path, + dtype=np.float32 if train_dataset_path.endswith("fbin") else np.uint8, + ) + queries_dataset = np.fromfile( + queries_path, dtype=np.float32 if queries_path.endswith("fbin") else np.uint8 + ) + gtruth_dataset = np.fromfile( + gtruth_path, + dtype=np.float32 if gtruth_path.endswith("fbin") else np.uint8, + ) + + return train_dataset, queries_dataset, gtruth_dataset + + +def compute_metrics( + index: Union[flatnav.index.L2Index, flatnav.index.IPIndex], + queries: np.ndarray, + ground_truth: np.ndarray, + ef_search: int, + k=100, +) -> Tuple[float, float]: + """ + Compute recall and QPS for given queries, ground truth, and a FlaNav index. + + Args: + - index: The Faiss index to search. + - queries: The query vectors. + - ground_truth: The ground truth indices for each query. + - k: Number of neighbors to search. + + Returns: + Mean recall over all queries. 
+
+
+def train_flatnav_index(
+    train_dataset: np.ndarray,
+    distance_type: str,
+    dim: int,
+    dataset_size: int,
+    max_edges_per_node: int,
+    ef_construction: int,
+) -> Union[flatnav.index.L2Index, flatnav.index.IPIndex]:
+    index = flatnav.index.index_factory(
+        distance_type=distance_type,
+        dim=dim,
+        dataset_size=dataset_size,
+        max_edges_per_node=max_edges_per_node,
+        verbose=True,
+    )
+
+    # Train the index.
+    start = time.time()
+    index.add(data=train_dataset, ef_construction=ef_construction)
+    end = time.time()
+
+    logging.info(f"Indexing time = {end - start} seconds")
+
+    return index
+
+
+def main(
+    train_dataset: np.ndarray,
+    queries: np.ndarray,
+    gtruth: np.ndarray,
+    ef_cons_params: List[int],
+    ef_search_params: List[int],
+    num_node_links: List[int],
+    distance_type: str,
+):
+    dataset_size = train_dataset.shape[0]
+    dim = train_dataset.shape[1]
+
+    for node_links in num_node_links:
+        for ef_cons in ef_cons_params:
+            for ef_search in ef_search_params:
+                index = train_flatnav_index(
+                    train_dataset=train_dataset,
+                    max_edges_per_node=node_links,
+                    ef_construction=ef_cons,
+                    dataset_size=dataset_size,
+                    dim=dim,
+                    distance_type=distance_type,
+                )
+
+                recall, qps = compute_metrics(
+                    index=index,
+                    queries=queries,
+                    ground_truth=gtruth,
+                    ef_search=ef_search,
+                )
+
+                logging.info(
+                    f"Recall@100: {recall}, QPS={qps}, node_links={node_links},"
+                    f" ef_cons={ef_cons}, ef_search={ef_search}"
+                )
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+    parser = argparse.ArgumentParser(
+        description="Benchmark Flatnav on Big ANN datasets."
+    )
+    parser.add_argument(
+        "--dataset",
+        required=True,
+        help="Path to a single ANNS benchmark dataset to run on.",
+    )
+    parser.add_argument(
+        "--queries", required=True, help="Path to a single queries file."
+    )
+    parser.add_argument(
+        "--gtruth",
+        required=True,
+        help="Path to a single ground truth file to evaluate on.",
+    )
+    parser.add_argument(
+        "--metric",
+        required=True,
+        default="l2",
+        help="Distance type. Options include `l2` and `angular`.",
+    )
+    parser.add_argument(
+        "--log_metrics", required=False, default=False, help="Log metrics to DVC."
+    )
+
+    args = parser.parse_args()
+
+    ef_construction_params = [32, 64, 128]
+    ef_search_params = [32, 64, 128, 256]
+    num_node_links = [8, 16, 32, 64]
+
+    train_data, queries, ground_truth = load_benchmark_dataset(
+        train_dataset_path=args.dataset,
+        queries_path=args.queries,
+        gtruth_path=args.gtruth,
+    )
+
+    main(
+        train_dataset=train_data,
+        queries=queries,
+        gtruth=ground_truth,
+        ef_cons_params=ef_construction_params,
+        ef_search_params=ef_search_params,
+        num_node_links=num_node_links,
+        distance_type=args.metric.lower(),
+    )
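+
+# Example invocation (the dataset paths below are placeholders; see the
+# yandex-deep-bench target in experiments/Makefile):
+#
+#   poetry run python run-big-bench.py \
+#       --dataset path/to/yandex-deep.350.fbin \
+#       --queries path/to/yandex-deep-queries.fbin \
+#       --gtruth path/to/yandex-ground-truth \
+#       --metric l2 > logs.txt 2>&1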
+ ) + + args = parser.parse_args() + + ef_construction_params = [32, 64, 128] + ef_search_params = [32, 64, 128, 256] + num_node_links = [8, 16, 32, 64] + + train_data, queries, ground_truth = load_benchmark_dataset( + train_dataset_path=args.dataset, + queries_path=args.queries, + gtruth_path=args.gtruth, + ) + + main( + train_dataset=train_data, + queries=queries, + gtruth=ground_truth, + ef_cons_params=ef_construction_params, + ef_search_params=ef_search_params, + num_node_links=num_node_links, + distance_type=args.metric.lower(), + ) diff --git a/experiments/utils.py b/experiments/utils.py deleted file mode 100644 index d0cac00..0000000 --- a/experiments/utils.py +++ /dev/null @@ -1,44 +0,0 @@ -import platform, socket, psutil -import os -import numpy as np - -ENVIRONMENT_INFO = { - "load_before_experiment": os.getloadavg()[2], - "platform": platform.platform(), - "platform_version": platform.version(), - "platform_release": platform.release(), - "architecture": platform.machine(), - "processor": platform.processor(), - "hostname": socket.gethostname(), - "ram_gb": round(psutil.virtual_memory().total / (1024.0**3)), - "num_cores": psutil.cpu_count(logical=True), -} - -def compute_recall(index, queries: np.ndarray, ground_truth: np.ndarray, k=100): - """ - Compute recall for given queries, ground truth, and an index. - - Args: - - index: The Faiss index to search. - - queries: The query vectors. - - ground_truth: The ground truth indices for each query. - - k: Number of neighbors to search. - - Returns: - Mean recall over all queries. - """ - _, top_k_indices = index.search(queries, k) - - # Convert each ground truth list to a set for faster lookup - ground_truth_sets = [set(gt) for gt in ground_truth] - - mean_recall = 0 - - for idx, k_neighbors in enumerate(top_k_indices): - query_recall = sum( - 1 for neighbor in k_neighbors if neighbor in ground_truth_sets[idx] - ) - mean_recall += query_recall / k - - recall = mean_recall / len(queries) - return recall \ No newline at end of file diff --git a/flatnav/Index.h b/flatnav/Index.h index 6f9e766..07b757c 100644 --- a/flatnav/Index.h +++ b/flatnav/Index.h @@ -201,14 +201,14 @@ template class Index { inline size_t dataDimension() const { return _distance->dimension(); } void printIndexParams() const { - std::cout << "\nIndex Parameters" << std::endl; - std::cout << "-----------------------------" << std::endl; - std::cout << "max_edges_per_node (M): " << _M << std::endl; - std::cout << "data_size_bytes: " << _data_size_bytes << std::endl; - std::cout << "node_size_bytes: " << _node_size_bytes << std::endl; - std::cout << "max_node_count: " << _max_node_count << std::endl; - std::cout << "cur_num_nodes: " << _cur_num_nodes << std::endl; - std::cout << "visited_nodes size: " << _visited_nodes.size() << std::endl; + std::cout << "\nIndex Parameters" << std::flush; + std::cout << "-----------------------------" << std::flush; + std::cout << "max_edges_per_node (M): " << _M << std::flush; + std::cout << "data_size_bytes: " << _data_size_bytes << std::flush; + std::cout << "node_size_bytes: " << _node_size_bytes << std::flush; + std::cout << "max_node_count: " << _max_node_count << std::flush; + std::cout << "cur_num_nodes: " << _cur_num_nodes << std::flush; + std::cout << "visited_nodes size: " << _visited_nodes.size() << std::flush; _distance->printParams(); } diff --git a/flatnav/distances/InnerProductDistance.h b/flatnav/distances/InnerProductDistance.h index a182e66..457b634 100644 --- a/flatnav/distances/InnerProductDistance.h +++ 
@@ -64,9 +64,9 @@ class InnerProductDistance : public DistanceInterface<InnerProductDistance> {
   }
 
   void printParamsImpl() {
-    std::cout << "\nInnerProductDistance Parameters" << std::endl;
-    std::cout << "-----------------------------" << std::endl;
-    std::cout << "Dimension: " << _dimension << std::endl;
+    std::cout << "\nInnerProductDistance Parameters\n";
+    std::cout << "-----------------------------\n";
+    std::cout << "Dimension: " << _dimension << std::flush;
   }
 };
 
diff --git a/flatnav/distances/SquaredL2Distance.h b/flatnav/distances/SquaredL2Distance.h
index 8056262..dfcf956 100644
--- a/flatnav/distances/SquaredL2Distance.h
+++ b/flatnav/distances/SquaredL2Distance.h
@@ -69,9 +69,9 @@ class SquaredL2Distance : public DistanceInterface<SquaredL2Distance> {
   }
 
   void printParamsImpl() {
-    std::cout << "\nSquaredL2Distance Parameters" << std::endl;
-    std::cout << "-----------------------------" << std::endl;
-    std::cout << "Dimension: " << _dimension << std::endl;
+    std::cout << "\nSquaredL2Distance Parameters\n";
+    std::cout << "-----------------------------\n";
+    std::cout << "Dimension: " << _dimension << std::flush;
   }
 };
 
diff --git a/flatnav_python/install_flatnav.sh b/flatnav_python/install_flatnav.sh
index 0a2b087..d707dbd 100755
--- a/flatnav_python/install_flatnav.sh
+++ b/flatnav_python/install_flatnav.sh
@@ -2,7 +2,44 @@
 
 set -ex
 
-poetry install --no-root
+function check_poetry_installed() {
+  if ! command -v poetry &> /dev/null; then
+    echo "Poetry not found. Installing it now..."
+
+    curl -sSL https://install.python-poetry.org | python3 -
+
+    # Check the shell and append poetry to PATH
+    SHELL_NAME=$(basename "$SHELL")
+    # For newer poetry versions, this might be different.
+    # On ubuntu x86-64, for instance, I found this to be instead
+    # $HOME/.local/share/pypoetry/venv/bin
+    POETRY_PATH="$HOME/.poetry/bin"
+
+    if [[ "$SHELL_NAME" == "zsh" ]]; then
+      echo "Detected zsh shell."
+      echo "export PATH=\"$POETRY_PATH:\$PATH\"" >> $HOME/.zshrc
+      source $HOME/.zshrc
+
+    elif [[ "$SHELL_NAME" == "bash" ]]; then
+      echo "Detected bash shell."
+      echo "export PATH=\"$POETRY_PATH:\$PATH\"" >> $HOME/.bashrc
+      source $HOME/.bashrc
+
+    else
+      echo "Unsupported shell for poetry installation: $SHELL_NAME"
+      exit 1
+    fi
+  fi
+}
+
+
+# Make sure we are in this directory
+cd "$(dirname "$0")"
+
+# Install poetry if not yet installed
+check_poetry_installed
+
+poetry lock && poetry install --no-root
 
 # Activate the poetry environment
 POETRY_ENV=$(poetry env info --path)
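Once install_flatnav.sh has built and installed the wheel, a quick smoke test
of the bindings looks like this. This is a minimal sketch using only the
index_factory / add / search calls that run-big-bench.py and test_index.py
already rely on; the shapes, parameters, and the (distances, indices) return
order are assumptions inferred from those scripts:

    import numpy as np
    import flatnav

    # Build a small L2 index over random float32 vectors.
    data = np.random.rand(1000, 32).astype(np.float32)
    index = flatnav.index.index_factory(
        distance_type="l2",
        dim=32,
        dataset_size=1000,
        max_edges_per_node=16,
        verbose=False,
    )
    index.add(data=data, ef_construction=64)

    # Query with the first ten vectors; expect K neighbors per query.
    distances, indices = index.search(queries=data[:10], ef_search=64, K=5)
    assert indices.shape == (10, 5)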
diff --git a/flatnav_python/python_bindings.cpp b/flatnav_python/python_bindings.cpp
index 2476305..702199d 100644
--- a/flatnav_python/python_bindings.cpp
+++ b/flatnav_python/python_bindings.cpp
@@ -34,7 +34,12 @@ template <typename dist_t, typename label_t> class PyIndex {
 
   explicit PyIndex(std::unique_ptr<Index<dist_t, label_t>> index)
       : _dim(index->dataDimension()), label_id(0), _verbose(false),
-        _index(index.get()) {}
+        _index(index.get()) {
+
+    if (_verbose) {
+      _index->printIndexParams();
+    }
+  }
 
   PyIndex(std::shared_ptr<DistanceInterface<dist_t>> distance, int dim,
           int dataset_size, int max_edges_per_node, bool verbose = false)
@@ -42,7 +47,12 @@ template <typename dist_t, typename label_t> class PyIndex {
       _index(new Index<dist_t, label_t>(
           /* dist = */ std::move(distance),
           /* dataset_size = */ dataset_size,
-          /* max_edges_per_node = */ max_edges_per_node)) {}
+          /* max_edges_per_node = */ max_edges_per_node)) {
+
+    if (_verbose) {
+      _index->printIndexParams();
+    }
+  }
 
   Index<dist_t, label_t> *getIndex() { return _index; }
 
diff --git a/flatnav_python/test_index.py b/flatnav_python/test_index.py
index c2f32d5..35b9890 100644
--- a/flatnav_python/test_index.py
+++ b/flatnav_python/test_index.py
@@ -11,7 +11,8 @@
 
 
 def generate_random_data(dataset_length: int, dim: int) -> np.ndarray:
-    return np.random.rand(dataset_length, dim)
+    # return np.random.rand(dataset_length, dim)
+    return np.random.randint(0, 256, size=(dataset_length, dim), dtype=np.uint8)
 
 
 def get_ann_benchmark_dataset(dataset_name):
diff --git a/quantization/ProductQuantization.h b/quantization/ProductQuantization.h
index 2e45507..544fd4d 100644
--- a/quantization/ProductQuantization.h
+++ b/quantization/ProductQuantization.h
@@ -419,18 +419,18 @@ class ProductQuantizer : public flatnav::DistanceInterface<ProductQuantizer> {
   }
 
   void printParamsImpl() const {
-    std::cout << "\nProduct Quantizer Parameters" << std::endl;
-    std::cout << "-----------------------------" << std::endl;
-    std::cout << "Number of subquantizers (M): " << _num_subquantizers
-              << std::endl;
-    std::cout << "Number of bits per index: " << _num_bits << std::endl;
-    std::cout << "Subvector dimension: " << _subvector_dim << std::endl;
-    std::cout << "Subquantizer centroids count: " << _subq_centroids_count
-              << std::endl;
-    std::cout << "Code size: " << _code_size << std::endl;
-    std::cout << "Is trained: " << _is_trained << std::endl;
-    std::cout << "Train type: " << _train_type << std::endl;
-    std::cout << "\n" << std::endl;
+    std::cout << "\nProduct Quantizer Parameters\n";
+    std::cout << "-----------------------------\n";
+    std::cout << "Number of subquantizers (M): " << _num_subquantizers
+              << "\n";
+    std::cout << "Number of bits per index: " << _num_bits << "\n";
+    std::cout << "Subvector dimension: " << _subvector_dim << "\n";
+    std::cout << "Subquantizer centroids count: " << _subq_centroids_count
+              << "\n";
+    std::cout << "Code size: " << _code_size << "\n";
+    std::cout << "Is trained: " << _is_trained << "\n";
+    std::cout << "Train type: " << _train_type << "\n";
+    std::cout << "\n" << std::flush;
   }
 
   inline uint32_t getNumSubquantizers() const { return _num_subquantizers; }
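A note on the ground-truth files: run-big-bench.py currently reads them with
the same header convention as the vector files. In the Big ANN benchmarks
suite the ground-truth format is typically different (this is an assumption
worth verifying against the actual files in use): a uint32 pair
(num_queries, K), then num_queries * K uint32 neighbor ids, then
num_queries * K float32 distances. A sketch of a reader under that assumption:

    import numpy as np

    def read_ground_truth(path: str) -> np.ndarray:
        """Return the (num_queries, K) array of true neighbor ids."""
        with open(path, "rb") as f:
            num_queries, k = np.fromfile(f, dtype=np.uint32, count=2)
            ids = np.fromfile(f, dtype=np.uint32, count=num_queries * k)
        # The trailing float32 distances are not needed for recall.
        return ids.reshape(num_queries, k)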