From 53d11d86ee83e7a334b9466eadb3e363f6fb0138 Mon Sep 17 00:00:00 2001 From: blaise-muhirwa Date: Mon, 15 Jan 2024 19:31:05 +0000 Subject: [PATCH 1/2] improve experiment runner --- experiments/Makefile | 1 + experiments/run-big-bench.py | 103 ++++++++++++++---- flatnav_python/python_bindings.cpp | 26 +++-- .../unit_tests/test_parallel_insertions.py | 4 +- 4 files changed, 97 insertions(+), 37 deletions(-) diff --git a/experiments/Makefile b/experiments/Makefile index bff71ee..13669e8 100644 --- a/experiments/Makefile +++ b/experiments/Makefile @@ -8,6 +8,7 @@ yandex-deep-bench: --gtruth /media/scratch/yandex-deep/ground_truth.10K.bin \ --metric l2 > logs.txt 2>&1 +# Other available options include: --index-type (hnsw, flatnav) sift-bench: poetry run python run-big-bench.py \ --dataset /root/data/sift-128-euclidean/sift-128-euclidean.train.npy \ diff --git a/experiments/run-big-bench.py b/experiments/run-big-bench.py index ce1de15..22c1ed8 100644 --- a/experiments/run-big-bench.py +++ b/experiments/run-big-bench.py @@ -116,14 +116,14 @@ def load_ground_truth(path: str) -> Tuple[np.ndarray, np.ndarray, int, int]: def compute_metrics( - index: Union[flatnav.index.L2Index, flatnav.index.IPIndex], + index: Union[flatnav.index.L2Index, flatnav.index.IPIndex, hnswlib.Index], queries: np.ndarray, ground_truth: np.ndarray, ef_search: int, k=100, ) -> Tuple[float, float]: """ - Compute recall and QPS for given queries, ground truth, and a FlaNav index. + Compute recall and QPS for given queries, ground truth for the given index(FlatNav or HNSW). Args: - index: A FlatNav index to search. @@ -136,9 +136,18 @@ def compute_metrics( QPS over all queries """ - start = time.time() - _, top_k_indices = index.search(queries=queries, ef_search=ef_search, K=k) - end = time.time() + if type(index) in (flatnav.index.L2Index, flatnav.index.IPIndex): + start = time.time() + _, top_k_indices = index.search( + queries=queries, ef_search=ef_search, K=k, num_initializations=300 + ) + end = time.time() + else: + index.set_ef(ef_search) + start = time.time() + # Search for HNSW return (ids, distances) instead of (distances, ids) + top_k_indices, _ = index.knn_query(data=queries, k=k) + end = time.time() querying_time = end - start qps = len(queries) / querying_time @@ -159,34 +168,72 @@ def compute_metrics( return recall, qps -def train_flatnav_index( +def create_and_train_hnsw_index( + data: np.ndarray, + space: str, + dim: int, + dataset_size: int, + ef_construction: int, + max_edges_per_node: int, + num_threads, +) -> hnswlib.Index: + hnsw_index = hnswlib.Index(space=space, dim=dim) + hnsw_index.init_index( + max_elements=dataset_size, ef_construction=ef_construction, M=max_edges_per_node + ) + hnsw_index.set_num_threads(num_threads) + + start = time.time() + hnsw_index.add_items(data=data, ids=np.arange(dataset_size)) + end = time.time() + logging.info(f"Indexing time = {end - start} seconds") + + return hnsw_index + + +def train_index( train_dataset: np.ndarray, distance_type: str, dim: int, dataset_size: int, max_edges_per_node: int, ef_construction: int, + index_type: str = "flatnav", use_hnsw_base_layer: bool = False, hnsw_base_layer_filename: Optional[str] = None, num_build_threads: int = 1, -) -> Union[flatnav.index.L2Index, flatnav.index.IPIndex]: +) -> Union[flatnav.index.L2Index, flatnav.index.IPIndex, hnswlib.Index]: + if index_type == "hnsw": + # We use "angular" instead of "ip", so here we are just converting. + _distance_type = distance_type if distance_type == "l2" else "ip" + # HNSWlib will have M * 2 edges in the base layer. + # So if we want to use M=32, we need to set M=16 here. + hnsw_index = create_and_train_hnsw_index( + data=train_dataset, + space=_distance_type, + dim=dim, + dataset_size=dataset_size, + ef_construction=ef_construction, + max_edges_per_node=max_edges_per_node // 2, + num_threads=num_build_threads, + ) + + return hnsw_index + if use_hnsw_base_layer: if not hnsw_base_layer_filename: raise ValueError("Must provide a filename for the HNSW base layer graph.") - # build the HNSW index to use as the base layer - # We use "angular" instead of "ip", so here we are just converting. _distance_type = distance_type if distance_type == "l2" else "ip" - hnsw_index = hnswlib.Index(space=_distance_type, dim=dim) - - # HNSWlib will have M * 2 edges in the base layer. - # So if we want to use M=32, we need to set M=16 here. - hnsw_index.init_index( - max_elements=dataset_size, + hnsw_index = create_and_train_hnsw_index( + data=train_dataset, + space=_distance_type, + dim=dim, + dataset_size=dataset_size, ef_construction=ef_construction, - M=max_edges_per_node // 2, + max_edges_per_node=max_edges_per_node // 2, + num_threads=num_build_threads, ) - hnsw_index.add_items(data=train_dataset, ids=np.arange(dataset_size)) # Now extract the base layer's graph and save it to a file. # This will be a Matrix Market file that we use to construct the Flatnav index. @@ -211,13 +258,13 @@ def train_flatnav_index( max_edges_per_node=max_edges_per_node, verbose=False, ) - index.num_threads = num_build_threads - - logging.info(f"Using {index.num_threads} threads for index construction.") + index.set_num_threads(num_build_threads) # Train the index. start = time.time() - index.add(data=train_dataset, ef_construction=ef_construction) + index.add( + data=train_dataset, ef_construction=ef_construction, num_initializations=300 + ) end = time.time() logging.info(f"Indexing time = {end - start} seconds") @@ -233,6 +280,7 @@ def main( ef_search_params: List[int], num_node_links: List[int], distance_type: str, + index_type: str = "flatnav", use_hnsw_base_layer: bool = False, hnsw_base_layer_filename: Optional[str] = None, reordering_strategies: List[str] | None = None, @@ -245,7 +293,8 @@ def main( for node_links in num_node_links: for ef_cons in ef_cons_params: for ef_search in ef_search_params: - index = train_flatnav_index( + index = train_index( + index_type=index_type, train_dataset=train_dataset, max_edges_per_node=node_links, ef_construction=ef_cons, @@ -268,7 +317,7 @@ def main( index.reorder(strategies=reordering_strategies) if num_search_threads > 1: - index.num_threads = num_search_threads + index.set_num_threads(num_search_threads) recall, qps = compute_metrics( index=index, @@ -288,6 +337,13 @@ def parse_arguments() -> argparse.Namespace: description="Benchmark Flatnav on Big ANN datasets." ) + parser.add_argument( + "--index-type", + required=False, + default="flatnav", + help="Type of index to benchmark. Options include `flatnav` and `hnsw`.", + ) + parser.add_argument( "--use-hnsw-base-layer", action="store_true", @@ -398,6 +454,7 @@ def parse_arguments() -> argparse.Namespace: ef_search_params=args.ef_search, num_node_links=args.num_node_links, distance_type=args.metric.lower(), + index_type=args.index_type.lower(), use_hnsw_base_layer=args.use_hnsw_base_layer, hnsw_base_layer_filename=args.hnsw_base_layer_filename, reordering_strategies=args.reordering_strategies, diff --git a/flatnav_python/python_bindings.cpp b/flatnav_python/python_bindings.cpp index 75b6495..3fa294d 100644 --- a/flatnav_python/python_bindings.cpp +++ b/flatnav_python/python_bindings.cpp @@ -312,22 +312,24 @@ void bindIndexMethods( "Perform graph re-ordering based on the given sequence of " "re-ordering strategies. " "Supported re-ordering strategies include `gorder` and `rcm`.") - .def_property( - "num_threads", - [](IndexType &index_type) -> uint32_t { - auto *index = index_type.getIndex(); - return index->getNumThreads(); - }, + .def( + "set_num_threads", [](IndexType &index_type, uint32_t num_threads) { - if (num_threads == 0 || - num_threads > std::thread::hardware_concurrency()) { - throw std::invalid_argument("Invalid number of threads."); - } auto *index = index_type.getIndex(); index->setNumThreads(num_threads); }, - "Configure the desired number of threads. This is useful for " - "constructing the graph or performing KNN search in parallel.") + py::arg("num_threads"), + "Set the number of threads to use for constructing the graph and/or " + "performing KNN search.") + .def_property_readonly( + "num_threads", + [](IndexType &index_type) { + auto *index = index_type.getIndex(); + return index->getNumThreads(); + }, + "Returns the number of threads used for " + "constructing the graph and/or performing KNN " + "search.") .def_property_readonly( "max_edges_per_node", [](IndexType &index_type) { diff --git a/flatnav_python/unit_tests/test_parallel_insertions.py b/flatnav_python/unit_tests/test_parallel_insertions.py index 2354f17..71ec4ba 100644 --- a/flatnav_python/unit_tests/test_parallel_insertions.py +++ b/flatnav_python/unit_tests/test_parallel_insertions.py @@ -25,7 +25,7 @@ def test_parallel_insertions_yield_similar_recall(): assert index.max_edges_per_node == 16 - index.num_threads = os.cpu_count() + index.set_num_threads(os.cpu_count()) print(f"Using {index.num_threads} threads for parallel index construction.") @@ -44,7 +44,7 @@ def test_parallel_insertions_yield_similar_recall(): ) # This is not necessary since by default FlatNav uses a single thread. - single_threaded_index.num_threads = 1 + single_threaded_index.set_num_threads(1) start = time.time() single_threaded_index.add(data=training_set, ef_construction=100) From d073f11e825e92bfeb68978b4c9aa7a54d29d731 Mon Sep 17 00:00:00 2001 From: blaise-muhirwa Date: Thu, 18 Jan 2024 01:30:43 +0000 Subject: [PATCH 2/2] address pr comments --- experiments/run-big-bench.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/experiments/run-big-bench.py b/experiments/run-big-bench.py index 22c1ed8..f262463 100644 --- a/experiments/run-big-bench.py +++ b/experiments/run-big-bench.py @@ -136,7 +136,8 @@ def compute_metrics( QPS over all queries """ - if type(index) in (flatnav.index.L2Index, flatnav.index.IPIndex): + is_flatnav_index = type(index) in (flatnav.index.L2Index, flatnav.index.IPIndex) + if is_flatnav_index: start = time.time() _, top_k_indices = index.search( queries=queries, ef_search=ef_search, K=k, num_initializations=300 @@ -339,7 +340,6 @@ def parse_arguments() -> argparse.Namespace: parser.add_argument( "--index-type", - required=False, default="flatnav", help="Type of index to benchmark. Options include `flatnav` and `hnsw`.", ) @@ -351,14 +351,12 @@ def parse_arguments() -> argparse.Namespace: ) parser.add_argument( "--hnsw-base-layer-filename", - required=False, default=None, help="Filename to save the HNSW base layer graph to. Please use the .mtx extension for clarity.", ) parser.add_argument( "--num-node-links", - required=False, nargs="+", type=int, default=[16, 32], @@ -367,7 +365,6 @@ def parse_arguments() -> argparse.Namespace: parser.add_argument( "--ef-construction", - required=False, nargs="+", type=int, default=[100, 200, 300, 400, 500], @@ -376,7 +373,6 @@ def parse_arguments() -> argparse.Namespace: parser.add_argument( "--ef-search", - required=False, nargs="+", type=int, default=[100, 200, 300, 400, 500, 1000, 2000, 3000, 4000],