From 53d11d86ee83e7a334b9466eadb3e363f6fb0138 Mon Sep 17 00:00:00 2001
From: blaise-muhirwa <blaise@groundlight.ai>
Date: Mon, 15 Jan 2024 19:31:05 +0000
Subject: [PATCH 1/2] improve experiment runner

---
 experiments/Makefile                          |   1 +
 experiments/run-big-bench.py                  | 103 ++++++++++++++----
 flatnav_python/python_bindings.cpp            |  26 +++--
 .../unit_tests/test_parallel_insertions.py    |   4 +-
 4 files changed, 97 insertions(+), 37 deletions(-)

diff --git a/experiments/Makefile b/experiments/Makefile
index bff71ee..13669e8 100644
--- a/experiments/Makefile
+++ b/experiments/Makefile
@@ -8,6 +8,7 @@ yandex-deep-bench:
 		--gtruth /media/scratch/yandex-deep/ground_truth.10K.bin \
 		--metric l2 > logs.txt 2>&1
 
+# Other available options include: --index-type (hnsw, flatnav)
 sift-bench: 
 	poetry run python run-big-bench.py \
 		--dataset /root/data/sift-128-euclidean/sift-128-euclidean.train.npy \
diff --git a/experiments/run-big-bench.py b/experiments/run-big-bench.py
index ce1de15..22c1ed8 100644
--- a/experiments/run-big-bench.py
+++ b/experiments/run-big-bench.py
@@ -116,14 +116,14 @@ def load_ground_truth(path: str) -> Tuple[np.ndarray, np.ndarray, int, int]:
 
 
 def compute_metrics(
-    index: Union[flatnav.index.L2Index, flatnav.index.IPIndex],
+    index: Union[flatnav.index.L2Index, flatnav.index.IPIndex, hnswlib.Index],
     queries: np.ndarray,
     ground_truth: np.ndarray,
     ef_search: int,
     k=100,
 ) -> Tuple[float, float]:
     """
-    Compute recall and QPS for given queries, ground truth, and a FlaNav index.
+    Compute recall and QPS for given queries, ground truth for the given index(FlatNav or HNSW).
 
     Args:
         - index: A FlatNav index to search.
@@ -136,9 +136,18 @@ def compute_metrics(
         QPS over all queries
 
     """
-    start = time.time()
-    _, top_k_indices = index.search(queries=queries, ef_search=ef_search, K=k)
-    end = time.time()
+    if type(index) in (flatnav.index.L2Index, flatnav.index.IPIndex):
+        start = time.time()
+        _, top_k_indices = index.search(
+            queries=queries, ef_search=ef_search, K=k, num_initializations=300
+        )
+        end = time.time()
+    else:
+        index.set_ef(ef_search)
+        start = time.time()
+        # Search for HNSW return (ids, distances) instead of (distances, ids)
+        top_k_indices, _ = index.knn_query(data=queries, k=k)
+        end = time.time()
 
     querying_time = end - start
     qps = len(queries) / querying_time
@@ -159,34 +168,72 @@ def compute_metrics(
     return recall, qps
 
 
-def train_flatnav_index(
+def create_and_train_hnsw_index(
+    data: np.ndarray,
+    space: str,
+    dim: int,
+    dataset_size: int,
+    ef_construction: int,
+    max_edges_per_node: int,
+    num_threads,
+) -> hnswlib.Index:
+    hnsw_index = hnswlib.Index(space=space, dim=dim)
+    hnsw_index.init_index(
+        max_elements=dataset_size, ef_construction=ef_construction, M=max_edges_per_node
+    )
+    hnsw_index.set_num_threads(num_threads)
+
+    start = time.time()
+    hnsw_index.add_items(data=data, ids=np.arange(dataset_size))
+    end = time.time()
+    logging.info(f"Indexing time = {end - start} seconds")
+
+    return hnsw_index
+
+
+def train_index(
     train_dataset: np.ndarray,
     distance_type: str,
     dim: int,
     dataset_size: int,
     max_edges_per_node: int,
     ef_construction: int,
+    index_type: str = "flatnav",
     use_hnsw_base_layer: bool = False,
     hnsw_base_layer_filename: Optional[str] = None,
     num_build_threads: int = 1,
-) -> Union[flatnav.index.L2Index, flatnav.index.IPIndex]:
+) -> Union[flatnav.index.L2Index, flatnav.index.IPIndex, hnswlib.Index]:
+    if index_type == "hnsw":
+        # We use "angular" instead of "ip", so here we are just converting.
+        _distance_type = distance_type if distance_type == "l2" else "ip"
+        # HNSWlib will have M * 2 edges in the base layer.
+        # So if we want to use M=32, we need to set M=16 here.
+        hnsw_index = create_and_train_hnsw_index(
+            data=train_dataset,
+            space=_distance_type,
+            dim=dim,
+            dataset_size=dataset_size,
+            ef_construction=ef_construction,
+            max_edges_per_node=max_edges_per_node // 2,
+            num_threads=num_build_threads,
+        )
+
+        return hnsw_index
+
     if use_hnsw_base_layer:
         if not hnsw_base_layer_filename:
             raise ValueError("Must provide a filename for the HNSW base layer graph.")
 
-        # build the HNSW index to use as the base layer
-        # We use "angular" instead of "ip", so here we are just converting.
         _distance_type = distance_type if distance_type == "l2" else "ip"
-        hnsw_index = hnswlib.Index(space=_distance_type, dim=dim)
-
-        # HNSWlib will have M * 2 edges in the base layer.
-        # So if we want to use M=32, we need to set M=16 here.
-        hnsw_index.init_index(
-            max_elements=dataset_size,
+        hnsw_index = create_and_train_hnsw_index(
+            data=train_dataset,
+            space=_distance_type,
+            dim=dim,
+            dataset_size=dataset_size,
             ef_construction=ef_construction,
-            M=max_edges_per_node // 2,
+            max_edges_per_node=max_edges_per_node // 2,
+            num_threads=num_build_threads,
         )
-        hnsw_index.add_items(data=train_dataset, ids=np.arange(dataset_size))
 
         # Now extract the base layer's graph and save it to a file.
         # This will be a Matrix Market file that we use to construct the Flatnav index.
@@ -211,13 +258,13 @@ def train_flatnav_index(
             max_edges_per_node=max_edges_per_node,
             verbose=False,
         )
-        index.num_threads = num_build_threads
-
-        logging.info(f"Using {index.num_threads} threads for index construction.")
+        index.set_num_threads(num_build_threads)
 
         # Train the index.
         start = time.time()
-        index.add(data=train_dataset, ef_construction=ef_construction)
+        index.add(
+            data=train_dataset, ef_construction=ef_construction, num_initializations=300
+        )
         end = time.time()
 
         logging.info(f"Indexing time = {end - start} seconds")
@@ -233,6 +280,7 @@ def main(
     ef_search_params: List[int],
     num_node_links: List[int],
     distance_type: str,
+    index_type: str = "flatnav",
     use_hnsw_base_layer: bool = False,
     hnsw_base_layer_filename: Optional[str] = None,
     reordering_strategies: List[str] | None = None,
@@ -245,7 +293,8 @@ def main(
     for node_links in num_node_links:
         for ef_cons in ef_cons_params:
             for ef_search in ef_search_params:
-                index = train_flatnav_index(
+                index = train_index(
+                    index_type=index_type,
                     train_dataset=train_dataset,
                     max_edges_per_node=node_links,
                     ef_construction=ef_cons,
@@ -268,7 +317,7 @@ def main(
                     index.reorder(strategies=reordering_strategies)
 
                 if num_search_threads > 1:
-                    index.num_threads = num_search_threads
+                    index.set_num_threads(num_search_threads)
 
                 recall, qps = compute_metrics(
                     index=index,
@@ -288,6 +337,13 @@ def parse_arguments() -> argparse.Namespace:
         description="Benchmark Flatnav on Big ANN datasets."
     )
 
+    parser.add_argument(
+        "--index-type",
+        required=False,
+        default="flatnav",
+        help="Type of index to benchmark. Options include `flatnav` and `hnsw`.",
+    )
+
     parser.add_argument(
         "--use-hnsw-base-layer",
         action="store_true",
@@ -398,6 +454,7 @@ def parse_arguments() -> argparse.Namespace:
         ef_search_params=args.ef_search,
         num_node_links=args.num_node_links,
         distance_type=args.metric.lower(),
+        index_type=args.index_type.lower(),
         use_hnsw_base_layer=args.use_hnsw_base_layer,
         hnsw_base_layer_filename=args.hnsw_base_layer_filename,
         reordering_strategies=args.reordering_strategies,
diff --git a/flatnav_python/python_bindings.cpp b/flatnav_python/python_bindings.cpp
index 75b6495..3fa294d 100644
--- a/flatnav_python/python_bindings.cpp
+++ b/flatnav_python/python_bindings.cpp
@@ -312,22 +312,24 @@ void bindIndexMethods(
           "Perform graph re-ordering based on the given sequence of "
           "re-ordering strategies. "
           "Supported re-ordering strategies include `gorder` and `rcm`.")
-      .def_property(
-          "num_threads",
-          [](IndexType &index_type) -> uint32_t {
-            auto *index = index_type.getIndex();
-            return index->getNumThreads();
-          },
+      .def(
+          "set_num_threads",
           [](IndexType &index_type, uint32_t num_threads) {
-            if (num_threads == 0 ||
-                num_threads > std::thread::hardware_concurrency()) {
-              throw std::invalid_argument("Invalid number of threads.");
-            }
             auto *index = index_type.getIndex();
             index->setNumThreads(num_threads);
           },
-          "Configure the desired number of threads. This is useful for "
-          "constructing the graph or performing KNN search in parallel.")
+          py::arg("num_threads"),
+          "Set the number of threads to use for constructing the graph and/or "
+          "performing KNN search.")
+      .def_property_readonly(
+          "num_threads",
+          [](IndexType &index_type) {
+            auto *index = index_type.getIndex();
+            return index->getNumThreads();
+          },
+          "Returns the number of threads used for "
+          "constructing the graph and/or performing KNN "
+          "search.")
       .def_property_readonly(
           "max_edges_per_node",
           [](IndexType &index_type) {
diff --git a/flatnav_python/unit_tests/test_parallel_insertions.py b/flatnav_python/unit_tests/test_parallel_insertions.py
index 2354f17..71ec4ba 100644
--- a/flatnav_python/unit_tests/test_parallel_insertions.py
+++ b/flatnav_python/unit_tests/test_parallel_insertions.py
@@ -25,7 +25,7 @@ def test_parallel_insertions_yield_similar_recall():
 
     assert index.max_edges_per_node == 16
 
-    index.num_threads = os.cpu_count()
+    index.set_num_threads(os.cpu_count())
 
     print(f"Using {index.num_threads} threads for parallel index construction.")
 
@@ -44,7 +44,7 @@ def test_parallel_insertions_yield_similar_recall():
     )
 
     # This is not necessary since by default FlatNav uses a single thread.
-    single_threaded_index.num_threads = 1
+    single_threaded_index.set_num_threads(1)
 
     start = time.time()
     single_threaded_index.add(data=training_set, ef_construction=100)

From d073f11e825e92bfeb68978b4c9aa7a54d29d731 Mon Sep 17 00:00:00 2001
From: blaise-muhirwa <blaise@groundlight.ai>
Date: Thu, 18 Jan 2024 01:30:43 +0000
Subject: [PATCH 2/2] address pr comments

---
 experiments/run-big-bench.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/experiments/run-big-bench.py b/experiments/run-big-bench.py
index 22c1ed8..f262463 100644
--- a/experiments/run-big-bench.py
+++ b/experiments/run-big-bench.py
@@ -136,7 +136,8 @@ def compute_metrics(
         QPS over all queries
 
     """
-    if type(index) in (flatnav.index.L2Index, flatnav.index.IPIndex):
+    is_flatnav_index = type(index) in (flatnav.index.L2Index, flatnav.index.IPIndex)
+    if is_flatnav_index:
         start = time.time()
         _, top_k_indices = index.search(
             queries=queries, ef_search=ef_search, K=k, num_initializations=300
@@ -339,7 +340,6 @@ def parse_arguments() -> argparse.Namespace:
 
     parser.add_argument(
         "--index-type",
-        required=False,
         default="flatnav",
         help="Type of index to benchmark. Options include `flatnav` and `hnsw`.",
     )
@@ -351,14 +351,12 @@ def parse_arguments() -> argparse.Namespace:
     )
     parser.add_argument(
         "--hnsw-base-layer-filename",
-        required=False,
         default=None,
         help="Filename to save the HNSW base layer graph to. Please use the .mtx extension for clarity.",
     )
 
     parser.add_argument(
         "--num-node-links",
-        required=False,
         nargs="+",
         type=int,
         default=[16, 32],
@@ -367,7 +365,6 @@ def parse_arguments() -> argparse.Namespace:
 
     parser.add_argument(
         "--ef-construction",
-        required=False,
         nargs="+",
         type=int,
         default=[100, 200, 300, 400, 500],
@@ -376,7 +373,6 @@ def parse_arguments() -> argparse.Namespace:
 
     parser.add_argument(
         "--ef-search",
-        required=False,
         nargs="+",
         type=int,
         default=[100, 200, 300, 400, 500, 1000, 2000, 3000, 4000],