From e3c5794346bc369e1a5f5f721ea970c6983759c4 Mon Sep 17 00:00:00 2001
From: blaise-muhirwa
Date: Thu, 23 Nov 2023 15:04:36 -0800
Subject: [PATCH] add big bench experiment runner

---
 experiments/Makefile                     |  23 +++
 experiments/big-ann/run_bigann.py        | 222 -----------------------
 experiments/evaluate_faiss.py            |   2 +-
 experiments/pyproject.toml               |   2 +-
 experiments/run-big-bench.py             | 207 +++++++++++++++++++++
 experiments/utils.py                     |  44 -----
 flatnav/Index.h                          |  16 +-
 flatnav/distances/InnerProductDistance.h |   6 +-
 flatnav/distances/SquaredL2Distance.h    |   6 +-
 flatnav_python/install_flatnav.sh        |  39 +++-
 flatnav_python/python_bindings.cpp       |  14 +-
 flatnav_python/test_index.py             |   3 +-
 quantization/ProductQuantization.h       |  20 +-
 13 files changed, 308 insertions(+), 296 deletions(-)
 create mode 100644 experiments/Makefile
 delete mode 100644 experiments/big-ann/run_bigann.py
 create mode 100644 experiments/run-big-bench.py
 delete mode 100644 experiments/utils.py

diff --git a/experiments/Makefile b/experiments/Makefile
new file mode 100644
index 0000000..45fb57b
--- /dev/null
+++ b/experiments/Makefile
@@ -0,0 +1,23 @@
+
+
+# Run the yandex-deep benchmark and log everything to a logs.txt file
+yandex-deep-bench: install-lib
+	poetry run python run-big-bench.py \
+		--dataset path/to/yandex-deep.350.fbin \
+		--queries path/to/yandex-deep-queries.fbin \
+		--gtruth path/to/yandex-ground-truth \
+		--metric l2 \
+		> logs.txt 2>&1
+
+
+# Install all dependencies including flatnav
+install-lib: generate-wheel
+	poetry add ../flatnav_python/dist/*.whl
+	poetry install --no-root
+
+# This will generate the wheel for flatnav and put it in
+# ../flatnav_python/dist
+generate-wheel:
+	pwd
+	cd ../flatnav_python && ./install_flatnav.sh
+
\ No newline at end of file
diff --git a/experiments/big-ann/run_bigann.py b/experiments/big-ann/run_bigann.py
deleted file mode 100644
index 7db5c61..0000000
--- a/experiments/big-ann/run_bigann.py
+++ /dev/null
@@ -1,222 +0,0 @@
-import json
-import numpy as np
-from typing import Optional, Tuple, List
-import numpy as np
-from dvclive import Live
-import faiss
-import os
-import logging
-import platform, socket, psutil
-import argparse
-
-
-ENVIRONMENT_INFO = {
-    "load_before_experiment": os.getloadavg()[2],
-    "platform": platform.platform(),
-    "platform_version": platform.version(),
-    "platform_release": platform.release(),
-    "architecture": platform.machine(),
-    "processor": platform.processor(),
-    "hostname": socket.gethostname(),
-    "ram_gb": round(psutil.virtual_memory().total / (1024.0**3)),
-    "num_cores": psutil.cpu_count(logical=True),
-}
-
-
-DATASETS = {
-    "mnist-784-euclidean": (
-        "mnist-784-euclidean.train.npy",
-        "mnist-784-euclidean.test.npy",
-        "mnist-784-euclidean.gtruth.npy",
-    ),
-    "glove-25-angular": (
-        "glove-25-angular.train.npy",
-        "glove-25-angular.test.npy",
-        "glove-25-angular.gtruth.npy",
-    ),
-    "glove-100-angular": (
-        "glove-100-angular.train.npy",
-        "glove-100-angular.test.npy",
-        "glove-100-angular.gtruth.npy",
-    ),
-    "glove-200-angular": (
-        "glove-200-angular.train.npy",
-        "glove-200-angular.test.npy",
-        "glove-200-angular.gtruth.npy",
-    ),
-    "sift-128-euclidean": (
-        "sift-128-euclidean.train.npy",
-        "sift-128-euclidean.test.npy",
-        "sift-128-euclidean.gtruth.npy",
-    ),
-}
-
-
-def load_benchmark_dataset(dataset_name: Optional[None]) -> Tuple[np.ndarray]:
-    """
-    This assumes that we have a /data/ already present.
-    This data directory can be generated by running
-        $ /bin/download_anns_datasets.sh
-
-    This directory will be expected to have the following files:
-        - /.train.npy
-        - /.test.npy
-        - /.gtruth.npy
-    """
-    dataset_name = dataset_name.lower()
-    if not dataset_name in DATASETS.keys():
-        raise AssertionError(
-            f"{dataset_name=} not in the list of supported datasets."
-            "Consider adding it to the list of checking if you misspelled the name."
-        )
-
-    train_file, queries_file, gtruth_file = DATASETS[dataset_name]
-    base_dir = os.path.join(os.getcwd(), "..", "data", dataset_name)
-
-    return (
-        np.load(os.path.join(base_dir, train_file)),
-        np.load(os.path.join(base_dir, queries_file)),
-        np.load(os.path.join(base_dir, gtruth_file)),
-    )
-
-
-def compute_recall(index, queries: np.ndarray, ground_truth: np.ndarray, k=100):
-    """
-    Compute recall for given queries, ground truth, and a Faiss index.
-
-    Args:
-        - index: The Faiss index to search.
-        - queries: The query vectors.
-        - ground_truth: The ground truth indices for each query.
-        - k: Number of neighbors to search.
-
-    Returns:
-        Mean recall over all queries.
-    """
-    _, top_k_indices = index.search(queries, k)
-
-    # Convert each ground truth list to a set for faster lookup
-    ground_truth_sets = [set(gt) for gt in ground_truth]
-
-    mean_recall = 0
-
-    for idx, k_neighbors in enumerate(top_k_indices):
-        query_recall = sum(
-            1 for neighbor in k_neighbors if neighbor in ground_truth_sets[idx]
-        )
-        mean_recall += query_recall / k
-
-    recall = mean_recall / len(queries)
-    return recall
-
-
-def train_hnsw_index(
-    data,
-    pq_m,
-    num_node_links,
-    ef_construction: Optional[int] = 128,
-    ef_search: Optional[int] = 128,
-    serialize=False,
-):
-    """
-    Train a HNSW index topped with product quantization
-    Args:
-        - pq_m: Number of subquantizers for PQ. This should exactly divide the
-            dataset dimensions.
-        - num_node_links: Maximum number of links to keep for each node in the graph
-        - serialize: Serialize so we can get a sense of how large the index binary is.
-
-    Returns:
-        Index
-
-    Helpful link on correct usage: https://github.com/facebookresearch/faiss/issues/1621
-    """
-    # configure the index
-    dim = data.shape[1]  # data dimension
-
-    # Create the HNSW index
-    index = faiss.IndexHNSWPQ(dim, pq_m, num_node_links)
-    index.hnsw.efConstruction = ef_construction
-    index.hnsw.efSearchh = ef_search
-
-    logging.info("Training index...")
-    index.train(data)
-
-    # Add vectors to the index
-    index.add(data)
-
-    if serialize:
-        # Serialize the index to disk
-        buffer = faiss.serialize_index(index)
-        index_size = len(buffer)
-
-        logging.info(f"Index size: {index_size} bytes")
-
-    return index
-
-def main(
-    train_dataset: np.ndarray,
-    queries: np.ndarray,
-    gtruth: np.ndarray,
-    ef_cons_params: List[int],
-    ef_search_params: List[int],
-    num_node_links: List[int],
-    pq_m: Optional[int] = 8,
-):
-    with Live() as live:
-        for param_key, param_val in ENVIRONMENT_INFO.items():
-            live.log_param(param_key, param_val)
-
-        for node_links in num_node_links:
-            for ef_cons in ef_cons_params:
-                for ef_search in ef_search_params:
-                    live.log_param("node_links", node_links)
-                    live.log_param("ef_construction", ef_cons)
-                    live.log_param("ef_search", ef_search)
-
-                    index = train_hnsw_index(
-                        data=train_dataset,
-                        pq_m=pq_m,
-                        num_node_links=node_links,
-                        ef_construction=ef_cons,
-                        ef_search=ef_search,
-                    )
-                    recall = compute_recall(
-                        index=index, queries=queries, ground_truth=gtruth
-                    )
-                    live.log_metric("Recall@100", recall)
-                    logging.info(
-                        f"Recall@100: {recall}, node_links={node_links}, ef_cons={ef_cons}, ef_search={ef_search}"
-                    )
-
-                    live.next_step()
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.DEBUG)
-    parser = argparse.ArgumentParser(description="")
-    parser.add_argument(
-        "--datasets", required=True, nargs="+", help="ANNS benchmark dataset to run on."
-    )
-    parser.add_argument(
-        "--log_metrics", required=False, default=False, help="Log metrics to MLFlow."
- ) - - args = parser.parse_args() - - ef_constructions = [32, 64, 128] - ef_searches = [32, 64, 128] - num_node_links = [8, 16, 32, 64] - - for dataset in args.datasets: - train_data, queries, ground_truth = load_benchmark_dataset(dataset_name=dataset) - - main( - train_dataset=train_data, - queries=queries, - gtruth=ground_truth, - ef_cons_params=ef_constructions, - ef_search_params=ef_searches, - num_node_links=num_node_links, - ) - diff --git a/experiments/evaluate_faiss.py b/experiments/evaluate_faiss.py index 7db5c61..5508720 100644 --- a/experiments/evaluate_faiss.py +++ b/experiments/evaluate_faiss.py @@ -154,6 +154,7 @@ def train_hnsw_index( return index + def main( train_dataset: np.ndarray, queries: np.ndarray, @@ -219,4 +220,3 @@ def main( ef_search_params=ef_searches, num_node_links=num_node_links, ) - diff --git a/experiments/pyproject.toml b/experiments/pyproject.toml index 8b3c60e..da29bed 100644 --- a/experiments/pyproject.toml +++ b/experiments/pyproject.toml @@ -12,7 +12,7 @@ dvc = "^3.28.0" black = "^23.11.0" numpy = "^1.26.1" faiss-cpu = "^1.7.4" -flatnav = {path = "../flatnav_python/dist/flatnav-0.0.1-cp310-cp310-linux_x86_64.whl"} +flatnav = {path = "../flatnav_python/dist/flatnav-0.0.1-cp311-cp311-macosx_13_0_arm64.whl"} [build-system] diff --git a/experiments/run-big-bench.py b/experiments/run-big-bench.py new file mode 100644 index 0000000..dddb5e4 --- /dev/null +++ b/experiments/run-big-bench.py @@ -0,0 +1,207 @@ +import time +from typing import Union +import json +import numpy as np +from typing import Optional, Tuple, List +import numpy as np +from dvclive import Live +import os +import logging +import platform, socket, psutil +import argparse +import flatnav + + +ENVIRONMENT_INFO = { + "load_before_experiment": os.getloadavg()[2], + "platform": platform.platform(), + "platform_version": platform.version(), + "platform_release": platform.release(), + "architecture": platform.machine(), + "processor": platform.processor(), + "hostname": socket.gethostname(), + "ram_gb": round(psutil.virtual_memory().total / (1024.0**3)), + "num_cores": psutil.cpu_count(logical=True), +} + + +def load_benchmark_dataset( + train_dataset_path: str, queries_path: str, gtruth_path: str +) -> Tuple[np.ndarray]: + def verify_paths_exist(paths: List[str]) -> None: + for path in paths: + if not os.path.exists(path): + raise ValueError(f"Invalid file path: {path}") + + verify_paths_exist([train_dataset_path, queries_path, gtruth_path]) + + train_dataset = np.fromfile( + train_dataset_path, + dtype=np.float32 if train_dataset_path.endswith("fbin") else np.uint8, + ) + queries_dataset = np.fromfile( + queries_path, dtype=np.float32 if queries_path.endswith("fbin") else np.uint8 + ) + gtruth_dataset = np.fromfile( + gtruth_path, + dtype=np.float32 if gtruth_path.endswith("fbin") else np.uint8, + ) + + return train_dataset, queries_dataset, gtruth_dataset + + +def compute_metrics( + index: Union[flatnav.index.L2Index, flatnav.index.IPIndex], + queries: np.ndarray, + ground_truth: np.ndarray, + ef_search: int, + k=100, +) -> Tuple[float, float]: + """ + Compute recall and QPS for given queries, ground truth, and a FlaNav index. + + Args: + - index: The Faiss index to search. + - queries: The query vectors. + - ground_truth: The ground truth indices for each query. + - k: Number of neighbors to search. + + Returns: + Mean recall over all queries. 
+
+
+def train_flatnav_index(
+    train_dataset: np.ndarray,
+    distance_type: str,
+    dim: int,
+    dataset_size: int,
+    max_edges_per_node: int,
+    ef_construction: int,
+) -> Union[flatnav.index.L2Index, flatnav.index.IPIndex]:
+    index = flatnav.index.index_factory(
+        distance_type=distance_type,
+        dim=dim,
+        dataset_size=dataset_size,
+        max_edges_per_node=max_edges_per_node,
+        verbose=True,
+    )
+
+    # Train the index.
+    start = time.time()
+    index.add(data=train_dataset, ef_construction=ef_construction)
+    end = time.time()
+
+    logging.info(f"Indexing time = {end - start} seconds")
+
+    return index
+
+
+def main(
+    train_dataset: np.ndarray,
+    queries: np.ndarray,
+    gtruth: np.ndarray,
+    ef_cons_params: List[int],
+    ef_search_params: List[int],
+    num_node_links: List[int],
+    distance_type: str,
+):
+    dataset_size = train_dataset.shape[0]
+    dim = train_dataset.shape[1]
+
+    for node_links in num_node_links:
+        for ef_cons in ef_cons_params:
+            for ef_search in ef_search_params:
+                index = train_flatnav_index(
+                    train_dataset=train_dataset,
+                    max_edges_per_node=node_links,
+                    ef_construction=ef_cons,
+                    dataset_size=dataset_size,
+                    dim=dim,
+                    distance_type=distance_type,
+                )
+
+                recall, qps = compute_metrics(
+                    index=index,
+                    queries=queries,
+                    ground_truth=gtruth,
+                    ef_search=ef_search,
+                )
+
+                logging.info(
+                    f"Recall@100: {recall}, QPS={qps}, node_links={node_links},"
+                    f" ef_cons={ef_cons}, ef_search={ef_search}"
+                )
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+    parser = argparse.ArgumentParser(
+        description="Benchmark Flatnav on Big ANN datasets."
+    )
+    parser.add_argument(
+        "--dataset",
+        required=True,
+        help="Path to a single ANNS benchmark dataset to run on.",
+    )
+    parser.add_argument(
+        "--queries", required=True, help="Path to a single queries file."
+    )
+    parser.add_argument(
+        "--gtruth",
+        required=True,
+        help="Path to a single ground truth file to evaluate on.",
+    )
+    parser.add_argument(
+        "--metric",
+        required=True,
+        default="l2",
+        help="Distance type. Options include `l2` and `angular`.",
+    )
+    parser.add_argument(
+        "--log_metrics", required=False, default=False, help="Log metrics to DVC."
+    )
+
+    args = parser.parse_args()
+
+    ef_construction_params = [32, 64, 128]
+    ef_search_params = [32, 64, 128, 256]
+    num_node_links = [8, 16, 32, 64]
+
+    train_data, queries, ground_truth = load_benchmark_dataset(
+        train_dataset_path=args.dataset,
+        queries_path=args.queries,
+        gtruth_path=args.gtruth,
+    )
+
+    main(
+        train_dataset=train_data,
+        queries=queries,
+        gtruth=ground_truth,
+        ef_cons_params=ef_construction_params,
+        ef_search_params=ef_search_params,
+        num_node_links=num_node_links,
+        distance_type=args.metric.lower(),
+    )
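+
+# Example invocation (the dataset paths below are placeholders; see the
+# yandex-deep-bench target in experiments/Makefile):
+#
+#   poetry run python run-big-bench.py \
+#       --dataset path/to/yandex-deep.350.fbin \
+#       --queries path/to/yandex-deep-queries.fbin \
+#       --gtruth path/to/yandex-ground-truth \
+#       --metric l2 > logs.txt 2>&1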
+ ) + + args = parser.parse_args() + + ef_construction_params = [32, 64, 128] + ef_search_params = [32, 64, 128, 256] + num_node_links = [8, 16, 32, 64] + + train_data, queries, ground_truth = load_benchmark_dataset( + train_dataset_path=args.dataset, + queries_path=args.queries, + gtruth_path=args.gtruth, + ) + + main( + train_dataset=train_data, + queries=queries, + gtruth=ground_truth, + ef_cons_params=ef_construction_params, + ef_search_params=ef_search_params, + num_node_links=num_node_links, + distance_type=args.metric.lower(), + ) diff --git a/experiments/utils.py b/experiments/utils.py deleted file mode 100644 index d0cac00..0000000 --- a/experiments/utils.py +++ /dev/null @@ -1,44 +0,0 @@ -import platform, socket, psutil -import os -import numpy as np - -ENVIRONMENT_INFO = { - "load_before_experiment": os.getloadavg()[2], - "platform": platform.platform(), - "platform_version": platform.version(), - "platform_release": platform.release(), - "architecture": platform.machine(), - "processor": platform.processor(), - "hostname": socket.gethostname(), - "ram_gb": round(psutil.virtual_memory().total / (1024.0**3)), - "num_cores": psutil.cpu_count(logical=True), -} - -def compute_recall(index, queries: np.ndarray, ground_truth: np.ndarray, k=100): - """ - Compute recall for given queries, ground truth, and an index. - - Args: - - index: The Faiss index to search. - - queries: The query vectors. - - ground_truth: The ground truth indices for each query. - - k: Number of neighbors to search. - - Returns: - Mean recall over all queries. - """ - _, top_k_indices = index.search(queries, k) - - # Convert each ground truth list to a set for faster lookup - ground_truth_sets = [set(gt) for gt in ground_truth] - - mean_recall = 0 - - for idx, k_neighbors in enumerate(top_k_indices): - query_recall = sum( - 1 for neighbor in k_neighbors if neighbor in ground_truth_sets[idx] - ) - mean_recall += query_recall / k - - recall = mean_recall / len(queries) - return recall \ No newline at end of file diff --git a/flatnav/Index.h b/flatnav/Index.h index 6f9e766..07b757c 100644 --- a/flatnav/Index.h +++ b/flatnav/Index.h @@ -201,14 +201,14 @@ template class Index { inline size_t dataDimension() const { return _distance->dimension(); } void printIndexParams() const { - std::cout << "\nIndex Parameters" << std::endl; - std::cout << "-----------------------------" << std::endl; - std::cout << "max_edges_per_node (M): " << _M << std::endl; - std::cout << "data_size_bytes: " << _data_size_bytes << std::endl; - std::cout << "node_size_bytes: " << _node_size_bytes << std::endl; - std::cout << "max_node_count: " << _max_node_count << std::endl; - std::cout << "cur_num_nodes: " << _cur_num_nodes << std::endl; - std::cout << "visited_nodes size: " << _visited_nodes.size() << std::endl; + std::cout << "\nIndex Parameters" << std::flush; + std::cout << "-----------------------------" << std::flush; + std::cout << "max_edges_per_node (M): " << _M << std::flush; + std::cout << "data_size_bytes: " << _data_size_bytes << std::flush; + std::cout << "node_size_bytes: " << _node_size_bytes << std::flush; + std::cout << "max_node_count: " << _max_node_count << std::flush; + std::cout << "cur_num_nodes: " << _cur_num_nodes << std::flush; + std::cout << "visited_nodes size: " << _visited_nodes.size() << std::flush; _distance->printParams(); } diff --git a/flatnav/distances/InnerProductDistance.h b/flatnav/distances/InnerProductDistance.h index a182e66..457b634 100644 --- a/flatnav/distances/InnerProductDistance.h +++ 
@@ -64,9 +64,9 @@ class InnerProductDistance : public DistanceInterface<InnerProductDistance> {
   }
 
   void printParamsImpl() {
-    std::cout << "\nInnerProductDistance Parameters" << std::endl;
-    std::cout << "-----------------------------" << std::endl;
-    std::cout << "Dimension: " << _dimension << std::endl;
+    std::cout << "\nInnerProductDistance Parameters\n";
+    std::cout << "-----------------------------\n";
+    std::cout << "Dimension: " << _dimension << std::flush;
   }
 };
 
diff --git a/flatnav/distances/SquaredL2Distance.h b/flatnav/distances/SquaredL2Distance.h
index 8056262..dfcf956 100644
--- a/flatnav/distances/SquaredL2Distance.h
+++ b/flatnav/distances/SquaredL2Distance.h
@@ -69,9 +69,9 @@ class SquaredL2Distance : public DistanceInterface<SquaredL2Distance> {
   }
 
   void printParamsImpl() {
-    std::cout << "\nSquaredL2Distance Parameters" << std::endl;
-    std::cout << "-----------------------------" << std::endl;
-    std::cout << "Dimension: " << _dimension << std::endl;
+    std::cout << "\nSquaredL2Distance Parameters\n";
+    std::cout << "-----------------------------\n";
+    std::cout << "Dimension: " << _dimension << std::flush;
   }
 };
 
diff --git a/flatnav_python/install_flatnav.sh b/flatnav_python/install_flatnav.sh
index 0a2b087..d707dbd 100755
--- a/flatnav_python/install_flatnav.sh
+++ b/flatnav_python/install_flatnav.sh
@@ -2,7 +2,44 @@
 
 set -ex
 
-poetry install --no-root
+function check_poetry_installed() {
+  if ! command -v poetry &> /dev/null; then
+    echo "Poetry not found. Installing it now..."
+
+    curl -sSL https://install.python-poetry.org | python3 -
+
+    # Check the shell and append poetry to PATH
+    SHELL_NAME=$(basename "$SHELL")
+    # For newer poetry versions, this might be different.
+    # On ubuntu x86-64, for instance, I found this to be instead
+    # $HOME/.local/share/pypoetry/venv/bin
+    POETRY_PATH="$HOME/.poetry/bin"
+
+    if [[ "$SHELL_NAME" == "zsh" ]]; then
+      echo "Detected zsh shell."
+      echo "export PATH=\"$POETRY_PATH:\$PATH\"" >> $HOME/.zshrc
+      source $HOME/.zshrc
+
+    elif [[ "$SHELL_NAME" == "bash" ]]; then
+      echo "Detected bash shell."
+      echo "export PATH=\"$POETRY_PATH:\$PATH\"" >> $HOME/.bashrc
+      source $HOME/.bashrc
+
+    else
+      echo "Unsupported shell for poetry installation: $SHELL_NAME"
+      exit 1
+    fi
+  fi
+}
+
+
+# Make sure we are in this directory
+cd "$(dirname "$0")"
+
+# Install poetry if not yet installed
+check_poetry_installed
+
+poetry lock && poetry install --no-root
 
 # Activate the poetry environment
 POETRY_ENV=$(poetry env info --path)
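Once install_flatnav.sh has built and installed the wheel, a quick smoke test
of the bindings looks like this. This is a minimal sketch using only the
index_factory / add / search calls that run-big-bench.py and test_index.py
already rely on; the shapes, parameters, and the (distances, indices) return
order are assumptions inferred from those scripts:

    import numpy as np
    import flatnav

    # Build a small L2 index over random float32 vectors.
    data = np.random.rand(1000, 32).astype(np.float32)
    index = flatnav.index.index_factory(
        distance_type="l2",
        dim=32,
        dataset_size=1000,
        max_edges_per_node=16,
        verbose=False,
    )
    index.add(data=data, ef_construction=64)

    # Query with the first ten vectors; expect K neighbors per query.
    distances, indices = index.search(queries=data[:10], ef_search=64, K=5)
    assert indices.shape == (10, 5)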
diff --git a/flatnav_python/python_bindings.cpp b/flatnav_python/python_bindings.cpp
index 2476305..702199d 100644
--- a/flatnav_python/python_bindings.cpp
+++ b/flatnav_python/python_bindings.cpp
@@ -34,7 +34,12 @@ template <typename dist_t, typename label_t> class PyIndex {
 
   explicit PyIndex(std::unique_ptr<Index<dist_t, label_t>> index)
       : _dim(index->dataDimension()), label_id(0), _verbose(false),
-        _index(index.get()) {}
+        _index(index.get()) {
+
+    if (_verbose) {
+      _index->printIndexParams();
+    }
+  }
 
   PyIndex(std::shared_ptr<DistanceInterface<dist_t>> distance, int dim,
           int dataset_size, int max_edges_per_node, bool verbose = false)
@@ -42,7 +47,12 @@ template <typename dist_t, typename label_t> class PyIndex {
       _index(new Index<dist_t, label_t>(
           /* dist = */ std::move(distance),
           /* dataset_size = */ dataset_size,
-          /* max_edges_per_node = */ max_edges_per_node)) {}
+          /* max_edges_per_node = */ max_edges_per_node)) {
+
+    if (_verbose) {
+      _index->printIndexParams();
+    }
+  }
 
   Index<dist_t, label_t> *getIndex() { return _index; }
 
diff --git a/flatnav_python/test_index.py b/flatnav_python/test_index.py
index c2f32d5..35b9890 100644
--- a/flatnav_python/test_index.py
+++ b/flatnav_python/test_index.py
@@ -11,7 +11,8 @@
 
 
 def generate_random_data(dataset_length: int, dim: int) -> np.ndarray:
-    return np.random.rand(dataset_length, dim)
+    # return np.random.rand(dataset_length, dim)
+    return np.random.randint(0, 256, size=(dataset_length, dim), dtype=np.uint8)
 
 
 def get_ann_benchmark_dataset(dataset_name):
diff --git a/quantization/ProductQuantization.h b/quantization/ProductQuantization.h
index 2e45507..544fd4d 100644
--- a/quantization/ProductQuantization.h
+++ b/quantization/ProductQuantization.h
@@ -419,18 +419,18 @@ class ProductQuantizer : public flatnav::DistanceInterface<ProductQuantizer> {
   }
 
   void printParamsImpl() const {
-    std::cout << "\nProduct Quantizer Parameters" << std::endl;
-    std::cout << "-----------------------------" << std::endl;
-    std::cout << "Number of subquantizers (M): " << _num_subquantizers
-              << std::endl;
-    std::cout << "Number of bits per index: " << _num_bits << std::endl;
-    std::cout << "Subvector dimension: " << _subvector_dim << std::endl;
-    std::cout << "Subquantizer centroids count: " << _subq_centroids_count
-              << std::endl;
-    std::cout << "Code size: " << _code_size << std::endl;
-    std::cout << "Is trained: " << _is_trained << std::endl;
-    std::cout << "Train type: " << _train_type << std::endl;
-    std::cout << "\n" << std::endl;
+    std::cout << "\nProduct Quantizer Parameters\n";
+    std::cout << "-----------------------------\n";
+    std::cout << "Number of subquantizers (M): " << _num_subquantizers
+              << "\n";
+    std::cout << "Number of bits per index: " << _num_bits << "\n";
+    std::cout << "Subvector dimension: " << _subvector_dim << "\n";
+    std::cout << "Subquantizer centroids count: " << _subq_centroids_count
+              << "\n";
+    std::cout << "Code size: " << _code_size << "\n";
+    std::cout << "Is trained: " << _is_trained << "\n";
+    std::cout << "Train type: " << _train_type << "\n";
+    std::cout << "\n" << std::flush;
   }
 
   inline uint32_t getNumSubquantizers() const { return _num_subquantizers; }
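A note on the ground-truth files: run-big-bench.py currently reads them with
the same header convention as the vector files. In the Big ANN benchmarks
suite the ground-truth format is typically different (this is an assumption
worth verifying against the actual files in use): a uint32 pair
(num_queries, K), then num_queries * K uint32 neighbor ids, then
num_queries * K float32 distances. A sketch of a reader under that assumption:

    import numpy as np

    def read_ground_truth(path: str) -> np.ndarray:
        """Return the (num_queries, K) array of true neighbor ids."""
        with open(path, "rb") as f:
            num_queries, k = np.fromfile(f, dtype=np.uint32, count=2)
            ids = np.fromfile(f, dtype=np.uint32, count=num_queries * k)
        # The trailing float32 distances are not needed for recall.
        return ids.reshape(num_queries, k)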