From f9aebaf2baf85d928fea8adcd3ebef6719913e2a Mon Sep 17 00:00:00 2001 From: blaise-muhirwa Date: Tue, 21 Nov 2023 05:47:22 +0000 Subject: [PATCH] add python tests --- .gitignore | 1 + CMakeLists.txt | 3 +- bin/run_anns.sh | 20 +- flatnav_python/bindings.cpp | 251 ---------------------- flatnav_python/pyproject.toml | 3 + flatnav_python/python_bindings.cpp | 332 +++++++++++++++++------------ flatnav_python/setup.py | 28 +-- flatnav_python/test_index.py | 210 ++++++++++++++++-- tools/query.cpp | 224 ------------------- tools/query_npy.cpp | 2 +- 10 files changed, 410 insertions(+), 664 deletions(-) delete mode 100644 flatnav_python/bindings.cpp delete mode 100644 tools/query.cpp diff --git a/.gitignore b/.gitignore index 2b50cb0..6ceb4e4 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ build flatnav_python/flatnav.egg-info/ flatnav_python/poetry.lock flatnav_python/dist +flatnav_python/__pycache__ # other files diff --git a/CMakeLists.txt b/CMakeLists.txt index b713529..f501cc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -218,7 +218,6 @@ set(HEADERS ${PROJECT_SOURCE_DIR}/flatnav/distances/inner_products_from_hnswlib.h ${PROJECT_SOURCE_DIR}/flatnav/distances/SquaredL2Distance.h ${PROJECT_SOURCE_DIR}/flatnav/distances/SquaredL2DistanceSpecializations.h - ${PROJECT_SOURCE_DIR}/flatnav/distances/SQDistance.h ${PROJECT_SOURCE_DIR}/flatnav/util/ExplicitSet.h ${PROJECT_SOURCE_DIR}/flatnav/util/GorderPriorityQueue.h ${PROJECT_SOURCE_DIR}/flatnav/util/reordering.h @@ -238,7 +237,7 @@ set_target_properties(FLAT_NAV_LIB PROPERTIES LINKER_LANGUAGE CXX) if(BUILD_EXAMPLES) message(STATUS "Building examples for Flatnav") - foreach(CONSTRUCT_EXEC construct_npy query query_npy cereal_tests) + foreach(CONSTRUCT_EXEC construct_npy query_npy cereal_tests) add_executable(${CONSTRUCT_EXEC} ${PROJECT_SOURCE_DIR}/tools/${CONSTRUCT_EXEC}.cpp ${HEADERS}) add_dependencies(${CONSTRUCT_EXEC} FLAT_NAV_LIB) diff --git a/bin/run_anns.sh b/bin/run_anns.sh index 9678674..0ae3719 100755 --- a/bin/run_anns.sh +++ b/bin/run_anns.sh @@ -5,13 +5,13 @@ if [ -f mnist_784.index ]; then rm mnist_784.index fi -if [ -f sift_128.index ]; then - rm sift_128.index -fi +# if [ -f sift_128.index ]; then +# rm sift_128.index +# fi -if [ -f glove_25.index ]; then - rm glove_25.index -fi +# if [ -f glove_25.index ]; then +# rm glove_25.index +# fi # if [ -f gist_960.index ]; then # rm gist_960.index @@ -22,10 +22,10 @@ fi # fi # Build the index for MNIST -# build/construct_npy 1 0 data/mnist-784-euclidean/mnist-784-euclidean.train.npy 16 128 mnist_784.index +build/construct_npy 0 0 data/mnist-784-euclidean/mnist-784-euclidean.train.npy 16 128 mnist_784.index # # Query MNIST -# build/query_npy 0 mnist_784.index data/mnist-784-euclidean/mnist-784-euclidean.test.npy data/mnist-784-euclidean/mnist-784-euclidean.gtruth.npy 256 100 0 1 +build/query_npy 0 mnist_784.index data/mnist-784-euclidean/mnist-784-euclidean.test.npy data/mnist-784-euclidean/mnist-784-euclidean.gtruth.npy 256 100 0 0 # # Query MNIST with reordering # build/query_npy 0 mnist_784.index data/mnist/mnist-784-euclidean.test.npy data/mnist/mnist-784-euclidean.gtruth.npy 256,512 100 1 @@ -37,10 +37,10 @@ fi # build/query_npy 0 sift_128.index data/sift/sift-128-euclidean.test.npy data/sift/sift-128-euclidean.gtruth.npy 256,512 100 0 # Build the index for GloVe -build/construct_npy 1 1 data/glove/glove-25-angular.train.npy 16 128 glove_25.index +# build/construct_npy 1 1 data/glove/glove-25-angular.train.npy 16 128 glove_25.index # Query GloVe without reordering 
-build/query_npy 1 glove_25.index data/glove/glove-25-angular.test.npy data/glove/glove-25-angular.gtruth.npy 128,256 100 0 1 +# build/query_npy 1 glove_25.index data/glove/glove-25-angular.test.npy data/glove/glove-25-angular.gtruth.npy 128,256 100 0 1 # # Query GloVe with reordering # build/query_npy 1 glove_25.index data/glove/glove-25-angular.test.npy data/glove/glove-25-angular.gtruth.npy 256,512 100 1 diff --git a/flatnav_python/bindings.cpp b/flatnav_python/bindings.cpp deleted file mode 100644 index 0338fba..0000000 --- a/flatnav_python/bindings.cpp +++ /dev/null @@ -1,251 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -using flatnav::DistanceInterface; -using flatnav::Index; -using flatnav::InnerProductDistance; -using flatnav::SquaredL2Distance; - -namespace py = pybind11; - -template class PythonIndex { - const uint32_t NUM_LOG_STEPS = 1000; -private: - int _dim, label_id; - bool _verbose; - Index *_index; - -public: - typedef std::pair, py::array_t> - DistancesLabelsPair; - - explicit PythonIndex(std::unique_ptr> index) - : _dim(index->dataDimension()), label_id(0), _verbose(false), - _index(index.get()) {} - - PythonIndex(std::shared_ptr> distance, int dim, - int dataset_size, int max_edges_per_node, bool verbose = false) - : _dim(dim), label_id(0), _verbose(verbose), - _index(new Index( - /* dist = */ std::move(distance), - /* dataset_size = */ dataset_size, - /* max_edges_per_node = */ max_edges_per_node)) {} - - Index *getIndex() { return _index; } - - ~PythonIndex() { delete _index; } - - static std::unique_ptr> - loadIndex(const std::string &filename) { - auto index = Index::loadIndex(/* filename = */ filename); - return std::make_unique>(std::move(index)); - } - - void - add(const py::array_t &data, - int ef_construction, py::object labels = py::none()) { - // py::array_t means that - // the functions expects either a Numpy array of floats or a castable type - // to that type. If the given type can't be casted, pybind11 will throw an - // error. - - auto num_vectors = data.shape(0); - auto data_dim = data.shape(1); - if (data.ndim() != 2 || data_dim != _dim) { - throw std::invalid_argument("Data has incorrect dimensions."); - } - - std::clog << "[num-vectors] = " << num_vectors << std::flush; - std::clog << "[data_dim] = " << data_dim << std::flush; - if (labels.is_none()) { - for (size_t vec_index = 0; vec_index < num_vectors; vec_index++) { - this->_index->add(/* data = */ (void *)data.data(vec_index), - /* label = */ label_id, - /* ef_construction = */ ef_construction); - if (_verbose && vec_index % NUM_LOG_STEPS == 0) { - std::clog << "." << std::flush; - } - label_id++; - } - std::clog << std::endl; - return; - } - - // Use the provided labels now - py::array_t node_labels( - labels); - if (node_labels.ndim() != 1 || node_labels.shape(0) != num_vectors) { - throw std::invalid_argument("Labels have incorrect dimensions."); - } - - for (size_t vec_index = 0; vec_index < num_vectors; vec_index++) { - label_t label_id = *node_labels.data(vec_index); - this->_index->add(/* data = */ (void *)data.data(vec_index), - /* label = */ label_id, - /* ef_construction = */ ef_construction); - - if (_verbose && vec_index % NUM_LOG_STEPS == 0) { - std::clog << "." 
<< std::flush; - } - } - std::clog << std::endl; - } - - DistancesLabelsPair - search(const py::array_t - queries, - int K, int ef_search) { - size_t num_queries = queries.shape(0); - size_t queries_dim = queries.shape(1); - - if (queries.ndim() != 2 || queries_dim != _dim) { - throw std::invalid_argument("Queries have incorrect dimensions."); - } - - label_t *results = new label_t[num_queries * K]; - float *distances = new float[num_queries * K]; - - for (size_t query_index = 0; query_index < num_queries; query_index++) { - std::vector> top_k = this->_index->search( - /* query = */ (const void *)queries.data(query_index), /* K = */ K, - /* ef_search = */ ef_search); - - for (size_t i = 0; i < top_k.size(); i++) { - distances[query_index * K + i] = top_k[i].first; - results[query_index * K + i] = top_k[i].second; - } - } - - // Allows to transfer ownership to Python - py::capsule free_results_when_done( - results, [](void *ptr) { delete (label_t *)ptr; }); - py::capsule free_distances_when_done( - distances, [](void *ptr) { delete (float *)ptr; }); - - py::array_t labels = - py::array_t({num_queries, (size_t)K}, // shape of the array - {K * sizeof(label_t), sizeof(label_t)}, // strides - results, // data pointer - free_results_when_done // capsule - ); - - py::array_t dists = py::array_t( - {num_queries, (size_t)K}, {K * sizeof(float), sizeof(float)}, distances, - free_distances_when_done); - - return {dists, labels}; - } -}; - -using L2FlatNavIndex = PythonIndex; -using InnerProductFlatNavIndex = PythonIndex; - -template -void bindIndexMethods(py::class_ &index_class) { - index_class - .def( - "save", - [](IndexType &index_type, const std::string &filename) { - auto index = index_type.getIndex(); - index->saveIndex(/* filename = */ filename); - }, - py::arg("filename"), - "Save a FlatNav index at the given file location.") - .def_static("load", &IndexType::loadIndex, py::arg("filename"), - "Load a FlatNav index from a given file location") - .def("add", &IndexType::add, py::arg("data"), py::arg("ef_construction"), - py::arg("labels") = py::none(), - "Add vectors(data) to the index with the given `ef_construction` " - "parameter and optional labels. `ef_construction` determines how " - "many " - "vertices are visited while inserting every vector in the " - "underlying graph structure.") - .def("search", &IndexType::search, py::arg("queries"), py::arg("K"), - py::arg("ef_search"), - "Return top `K` closest data points for every query in the " - "provided `queries`. The results are returned as a Tuple of " - "distances and label ID's. 
The `ef_search` parameter determines how " - "many neighbors are visited while finding the closest neighbors " - "for every query.") - .def( - "reorder", - [](IndexType &index_type, const std::string &algorithm) { - auto index = index_type.getIndex(); - auto alg = algorithm; - std::transform(alg.begin(), alg.end(), alg.begin(), - [](unsigned char c) { return std::tolower(c); }); - if (alg == "gorder") { - index->reorderGOrder(); - } else if (alg == "rcm") { - index->reorderRCM(); - } else { - throw std::invalid_argument( - "`" + algorithm + - "` is not a supported graph re-ordering algorithm."); - } - }, - py::arg("algorithm"), - "Perform graph re-ordering based on the given re-ordering strategy.") - .def_property_readonly( - "max_edges_per_node", - [](IndexType &index_type) { - return index_type.getIndex()->maxEdgesPerNode(); - }, - "Maximum number of edges(links) per node in the underlying NSW graph " - "data structure."); -} - -py::object createIndex(const std::string &distance_type, int dim, - int dataset_size, int max_edges_per_node, - bool verbose = false) { - auto dist_type = distance_type; - std::transform(dist_type.begin(), dist_type.end(), dist_type.begin(), - [](unsigned char c) { return std::tolower(c); }); - - if (dist_type == "l2") { - auto distance = std::make_shared(/* dim = */ dim); - return py::cast(new L2FlatNavIndex(std::move(distance), dim, dataset_size, - max_edges_per_node, verbose)); - } else if (dist_type == "angular") { - auto distance = std::make_shared(/* dim = */ dim); - return py::cast(new InnerProductFlatNavIndex( - std::move(distance), dim, dataset_size, max_edges_per_node, verbose)); - } - throw std::invalid_argument("Invalid distance type: `" + dist_type + - "` during index construction. Valid options " - "include `l2` and `angular`."); -} - -void defineIndexSubmodule(py::module_ &index_submodule) { - index_submodule.def("index_factory", &createIndex, py::arg("distance_type"), - py::arg("dim"), py::arg("dataset_size"), - py::arg("max_edges_per_node"), py::arg("verbose") = false, - "Creates a FlatNav index given the corresponding " - "parameters. 
The `distance_type` argument determines the " - "kind of index created (either L2Index or IPIndex)"); - - py::class_ l2_index_class(index_submodule, "L2Index"); - bindIndexMethods(l2_index_class); - - py::class_ ip_index_class(index_submodule, - "IPIndex"); - bindIndexMethods(ip_index_class); -} - -PYBIND11_MODULE(flatnav, module) { - auto index_submodule = module.def_submodule("index"); - - defineIndexSubmodule(index_submodule); -} \ No newline at end of file diff --git a/flatnav_python/pyproject.toml b/flatnav_python/pyproject.toml index e1cb0fb..9539524 100644 --- a/flatnav_python/pyproject.toml +++ b/flatnav_python/pyproject.toml @@ -21,6 +21,9 @@ setuptools = "68.2.2" black = "^23.11.0" pytest = "^7.4.3" numpy = "^1.26.2" +h5py = "^3.10.0" +requests = "^2.31.0" + [build-system] diff --git a/flatnav_python/python_bindings.cpp b/flatnav_python/python_bindings.cpp index 793bcb9..2476305 100644 --- a/flatnav_python/python_bindings.cpp +++ b/flatnav_python/python_bindings.cpp @@ -1,192 +1,250 @@ #include -#include -#include -#include - +#include +#include +#include #include #include - -#include +#include +#include +#include #include #include +#include #include -using namespace flatnav; +using flatnav::DistanceInterface; using flatnav::Index; using flatnav::InnerProductDistance; using flatnav::SquaredL2Distance; -using flatnav::quantization::ProductQuantizer; namespace py = pybind11; template class PyIndex { + const uint32_t NUM_LOG_STEPS = 10000; + private: + int _dim, label_id; + bool _verbose; Index *_index; - std::unique_ptr> _distance; - size_t _dim; - int _added; +public: + typedef std::pair, py::array_t> + DistancesLabelsPair; - void setIndexMetric(std::string &metric) { - std::transform(metric.begin(), metric.end(), metric.begin(), - [](unsigned char c) { return std::tolower(c); }); + explicit PyIndex(std::unique_ptr> index) + : _dim(index->dataDimension()), label_id(0), _verbose(false), + _index(index.get()) {} - if (metric == "l2") { - _distance = std::make_unique(/* dim = */ _dim); - } else if (metric == "angular") { - _distance = std::make_unique(/* dim = */ _dim); - } - throw std::invalid_argument("Invalid metric `" + metric + - "` used during index construction."); - } + PyIndex(std::shared_ptr> distance, int dim, + int dataset_size, int max_edges_per_node, bool verbose = false) + : _dim(dim), label_id(0), _verbose(verbose), + _index(new Index( + /* dist = */ std::move(distance), + /* dataset_size = */ dataset_size, + /* max_edges_per_node = */ max_edges_per_node)) {} -public: - PyIndex(std::string metric_type, size_t dim, int N, int M) - : _dim(dim), _added(0) { - setIndexMetric(metric_type); - _index = new Index( - /* dist = */ std::move(_distance), /* dataset_size = */ N, - /* max_edges_per_node = */ M); - } + Index *getIndex() { return _index; } - PyIndex(std::string filename) { - _index = new Index(/* in = */ filename); + ~PyIndex() { delete _index; } + + static std::unique_ptr> + loadIndex(const std::string &filename) { + auto index = Index::loadIndex(/* filename = */ filename); + return std::make_unique>(std::move(index)); } - void add(py::array_t data, - int ef_construction, py::object labels_obj = py::none()) { + void + add(const py::array_t &data, + int ef_construction, py::object labels = py::none()) { + // py::array_t means that + // the functions expects either a Numpy array of floats or a castable type + // to that type. If the given type can't be casted, pybind11 will throw an + // error. 
+ + auto num_vectors = data.shape(0); + auto data_dim = data.shape(1); + if (data.ndim() != 2 || data_dim != _dim) { + throw std::invalid_argument("Data has incorrect dimensions."); + } + if (labels.is_none()) { + for (size_t vec_index = 0; vec_index < num_vectors; vec_index++) { + this->_index->add(/* data = */ (void *)data.data(vec_index), + /* label = */ label_id, + /* ef_construction = */ ef_construction); + if (_verbose && vec_index % NUM_LOG_STEPS == 0) { + std::clog << "." << std::flush; + } + label_id++; + } + std::clog << std::endl; + return; + } - if (data.n_dim() != 2 || data.shape(1) != _dim) { - throw std::invalid_argument("Data has incorrect _dimensions"); + // Use the provided labels now + py::array_t node_labels( + labels); + if (node_labels.ndim() != 1 || node_labels.shape(0) != num_vectors) { + throw std::invalid_argument("Labels have incorrect dimensions."); } - if (labels_obj.is_none()) { - for (size_t n = 0; n < data.shape(0); n++) { - this->index->add((void *)data.data(n), _added, ef_construction); - _added++; - } - } else { - py::array_t labels( - labels_obj); - if (labels.n_dim() != 1 || labels.shape(0) != data.shape(0)) { - throw std::invalid_argument("Labels have incorrect _dimensions"); - } + for (size_t vec_index = 0; vec_index < num_vectors; vec_index++) { + label_t label_id = *node_labels.data(vec_index); + this->_index->add(/* data = */ (void *)data.data(vec_index), + /* label = */ label_id, + /* ef_construction = */ ef_construction); - for (size_t n = 0; n < data.shape(0); n++) { - label_t l = *labels.data(n); - this->index->add((void *)data.data(n), l, ef_construction); - _added++; + if (_verbose && vec_index % NUM_LOG_STEPS == 0) { + std::clog << "." << std::flush; } } + std::clog << std::endl; } - py::array_t - search(py::array_t queries, + DistancesLabelsPair + search(const py::array_t + queries, int K, int ef_search) { - if (queries.n_dim() != 2 || queries.shape(1) != _dim) { - throw std::invalid_argument("Queries have incorrect _dimensions"); - } size_t num_queries = queries.shape(0); + size_t queries_dim = queries.shape(1); + + if (queries.ndim() != 2 || queries_dim != _dim) { + throw std::invalid_argument("Queries have incorrect dimensions."); + } label_t *results = new label_t[num_queries * K]; + float *distances = new float[num_queries * K]; + + for (size_t query_index = 0; query_index < num_queries; query_index++) { + std::vector> top_k = this->_index->search( + /* query = */ (const void *)queries.data(query_index), /* K = */ K, + /* ef_search = */ ef_search); - for (size_t q = 0; q < num_queries; q++) { - std::vector> topK = - this->index->search(queries.data(q), K, ef_search); - for (size_t i = 0; i < topK.size(); i++) { - results[q * K + i] = topK[i].second; + for (size_t i = 0; i < top_k.size(); i++) { + distances[query_index * K + i] = top_k[i].first; + results[query_index * K + i] = top_k[i].second; } } - py::capsule free_when_done(results, [](void *ptr) { delete ptr; }); + // Allows to transfer ownership to Python + py::capsule free_results_when_done( + results, [](void *ptr) { delete (label_t *)ptr; }); + py::capsule free_distances_when_done( + distances, [](void *ptr) { delete (float *)ptr; }); - return py::array_t({num_queries, (size_t)K}, - {K * sizeof(label_t), sizeof(label_t)}, results, - free_when_done); - } - - void reorder(std::string alg) { - std::transform(alg.begin(), alg.end(), std::tolower); - - if (alg == "gorder") { - this->index->reorder_gorder(); - } else if (alg == "rcm") { - this->index->reorder_rcm(); - } else { - throw 
std::invalid_argument( - "'" + alg + "' is not a supported graph re-ordering algorithm."); - } - } + py::array_t labels = + py::array_t({num_queries, (size_t)K}, // shape of the array + {K * sizeof(label_t), sizeof(label_t)}, // strides + results, // data pointer + free_results_when_done // capsule + ); - void save(std::string filename) { this->index->save(filename); } + py::array_t dists = py::array_t( + {num_queries, (size_t)K}, {K * sizeof(float), sizeof(float)}, distances, + free_distances_when_done); - ~PyIndex() { - delete index; - delete space; + return {dists, labels}; } }; -template -double ComputeRecall(py::array_t results, - py::array_t gtruths) { - double avg_recall = 0.0; - for (size_t q = 0; q < results.shape(0); q++) { - double recall = 0.0; - const label_t *result = results.data(q); - const label_t *topk = gtruths.data(q); - for (size_t i = 0; i < results.shape(1); i++) { - for (size_t j = 0; j < results.shape(1); j++) { - if (result[i] == topk[j]) { - recall += 1.0; - break; - } - } - } - avg_recall += recall; - } - - return avg_recall /= (results.shape(0) * results.shape(1)); +using L2FlatNavIndex = PyIndex; +using InnerProductFlatNavIndex = PyIndex; + +template +void bindIndexMethods(py::class_ &index_class) { + index_class + .def( + "save", + [](IndexType &index_type, const std::string &filename) { + auto index = index_type.getIndex(); + index->saveIndex(/* filename = */ filename); + }, + py::arg("filename"), + "Save a FlatNav index at the given file location.") + .def_static("load", &IndexType::loadIndex, py::arg("filename"), + "Load a FlatNav index from a given file location") + .def("add", &IndexType::add, py::arg("data"), py::arg("ef_construction"), + py::arg("labels") = py::none(), + "Add vectors(data) to the index with the given `ef_construction` " + "parameter and optional labels. `ef_construction` determines how " + "many " + "vertices are visited while inserting every vector in the " + "underlying graph structure.") + .def("search", &IndexType::search, py::arg("queries"), py::arg("K"), + py::arg("ef_search"), + "Return top `K` closest data points for every query in the " + "provided `queries`. The results are returned as a Tuple of " + "distances and label ID's. The `ef_search` parameter determines how " + "many neighbors are visited while finding the closest neighbors " + "for every query.") + .def( + "reorder", + [](IndexType &index_type, const std::string &algorithm) { + auto index = index_type.getIndex(); + auto alg = algorithm; + std::transform(alg.begin(), alg.end(), alg.begin(), + [](unsigned char c) { return std::tolower(c); }); + if (alg == "gorder") { + index->reorderGOrder(); + } else if (alg == "rcm") { + index->reorderRCM(); + } else { + throw std::invalid_argument( + "`" + algorithm + + "` is not a supported graph re-ordering algorithm."); + } + }, + py::arg("algorithm"), + "Perform graph re-ordering based on the given re-ordering strategy. 
" + "Supported re-ordering algorithms include `gorder` and `rcm`.") + .def_property_readonly( + "max_edges_per_node", + [](IndexType &index_type) { + return index_type.getIndex()->maxEdgesPerNode(); + }, + "Maximum number of edges(links) per node in the underlying NSW graph " + "data structure."); } -using L2FloatPyIndex = PyIndex; - -PYBIND11_MODULE(flatnav, m) { - py::class_(m, "Index") - .def(py::init(), py::arg("metric"), - py::arg("_dim"), py::arg("N"), py::arg("M")) - .def(py::init(), py::arg("save_loc")) - .def("add", &L2FloatPyIndex::add, py::arg("data"), - py::arg("ef_construction"), py::arg("labels") = py::none()) - .def("search", &L2FloatPyIndex::search, py::arg("queries"), py::arg("K"), - py::arg("ef_search")) - .def("reorder", &L2FloatPyIndex::reorder, py::arg("alg")) - .def("save", &L2FloatPyIndex::save, py::arg("filename")); +py::object createIndex(const std::string &distance_type, int dim, + int dataset_size, int max_edges_per_node, + bool verbose = false) { + auto dist_type = distance_type; + std::transform(dist_type.begin(), dist_type.end(), dist_type.begin(), + [](unsigned char c) { return std::tolower(c); }); + + if (dist_type == "l2") { + auto distance = std::make_shared(/* dim = */ dim); + return py::cast(new L2FlatNavIndex(std::move(distance), dim, dataset_size, + max_edges_per_node, verbose)); + } else if (dist_type == "angular") { + auto distance = std::make_shared(/* dim = */ dim); + return py::cast(new InnerProductFlatNavIndex( + std::move(distance), dim, dataset_size, max_edges_per_node, verbose)); + } + throw std::invalid_argument("Invalid distance type: `" + dist_type + + "` during index construction. Valid options " + "include `l2` and `angular`."); } -#include -#include - -namespace py = pybind11; - -class Index { -public: - int _m; - explicit Index(int num) : _m(num) {} - - int add(int j) { return _m + j; } -}; - void defineIndexSubmodule(py::module_ &index_submodule) { - py::class_(index_submodule, "Index") - .def(py::init(), py::arg("num"), - "Initializes a naive quantizer (int8) object.") - .def("add", &Index::add, py::arg("j"), - "Quantizes input vectors based by clipping the bit width."); + index_submodule.def("index_factory", &createIndex, py::arg("distance_type"), + py::arg("dim"), py::arg("dataset_size"), + py::arg("max_edges_per_node"), py::arg("verbose") = false, + "Creates a FlatNav index given the corresponding " + "parameters. 
The `distance_type` argument determines the " + "kind of index created (either L2Index or IPIndex)"); + + py::class_ l2_index_class(index_submodule, "L2Index"); + bindIndexMethods(l2_index_class); + + py::class_ ip_index_class(index_submodule, + "IPIndex"); + bindIndexMethods(ip_index_class); } -PYBIND11_MODULE(flatnav, module_) { +PYBIND11_MODULE(flatnav, module) { + auto index_submodule = module.def_submodule("index"); - auto index_submodule = module_.def_submodule("index"); defineIndexSubmodule(index_submodule); -} +} \ No newline at end of file diff --git a/flatnav_python/setup.py b/flatnav_python/setup.py index 85bc91c..c93e53e 100644 --- a/flatnav_python/setup.py +++ b/flatnav_python/setup.py @@ -1,21 +1,11 @@ -# import toml import os - -# Available at setup time due to pyproject.toml from pybind11.setup_helpers import Pybind11Extension, build_ext from setuptools import setup -# def parse_version_from_pyproject() -> str: -# with open("pyproject.toml") as f: -# pyproject = toml.load(f) -# return pyproject["tool"]["poetry"]["version"] - -# raise RuntimeError("Unable to find version string.") - __version__ = "0.0.1" CURRENT_DIR = os.getcwd() -SOURCE_PATH = os.path.join(CURRENT_DIR, "bindings.cpp") +SOURCE_PATH = os.path.join(CURRENT_DIR, "python_bindings.cpp") ext_modules = [ @@ -28,9 +18,16 @@ os.path.join(CURRENT_DIR, ".."), os.path.join(CURRENT_DIR, "..", "external", "cereal", "include"), ], - # Ignoring the `Wno-sign-compare` which warns you when you compare int with something like - # uint64_t. - extra_compile_args=["-Wno-sign-compare", "-fopenmp"], + extra_compile_args=[ + "-fopenmp", # Enable OpenMP + "-Ofast", # Use the fastest optimization + "-fpic", # Position-independent code + "-w", # Suppress all warnings (note: this overrides -Wall) + "-ffast-math", # Enable fast math optimizations + "-funroll-loops", # Unroll loops + "-ftree-vectorize", # Vectorize where possible + ], + extra_link_args=["-fopenmp"], # Link OpenMP when linking the extension ) ] @@ -44,9 +41,6 @@ description="Graph kNN with reordering.", long_description="", ext_modules=ext_modules, - # extras_require={"test": "pytest"}, - # Currently, build_ext only provides an optional "highest supported C++ - # level" feature, but in the future it may provide more features. 
cmdclass={"build_ext": build_ext}, zip_safe=False, python_requires=">=3.7", diff --git a/flatnav_python/test_index.py b/flatnav_python/test_index.py index 76f0d4f..77f7542 100644 --- a/flatnav_python/test_index.py +++ b/flatnav_python/test_index.py @@ -1,16 +1,81 @@ import flatnav from flatnav.index import index_factory from flatnav.index import L2Index, IPIndex -from typing import Union -import pytest +from typing import Union, Optional import numpy as np -import time +import time +import tempfile +import h5py +import requests +import os def generate_random_data(dataset_length: int, dim: int) -> np.ndarray: return np.random.rand(dataset_length, dim) +def get_ann_benchmark_dataset(dataset_name): + base_uri = "http://ann-benchmarks.com" + dataset_uri = f"{base_uri}/{dataset_name}.hdf5" + + with tempfile.TemporaryDirectory() as tmp: + response = requests.get(dataset_uri) + loc = os.path.join(tmp, dataset_name) + + with open(loc, "wb") as f: + f.write(response.content) + data = h5py.File(loc, "r") + + training_set = data["train"] + queries = data["test"] + true_neighbors = data["neighbors"] + distances = data["distances"] + + return ( + np.array(training_set), + np.array(queries), + np.array(true_neighbors), + np.array(distances), + ) + + +def compute_recall( + index, queries: np.ndarray, ground_truth: np.ndarray, ef_search: int, k: int = 100 +): + """ + Compute recall for given queries, ground truth, and a FlatNav index. + + Args: + - index: The Faiss index to search. + - queries: The query vectors. + - ground_truth: The ground truth indices for each query. + - k: Number of neighbors to search. + + Returns: + Mean recall over all queries. + """ + start = time.time() + _, top_k_indices = index.search(queries=queries, ef_search=ef_search, K=k) + end = time.time() + + duration = (end - start) / len(queries) + print(f"Querying time: {duration * 1000} milliseconds") + + # Convert each ground truth list to a set for faster lookup + ground_truth_sets = [set(gt) for gt in ground_truth] + + mean_recall = 0 + + for idx, k_neighbors in enumerate(top_k_indices): + query_recall = sum( + 1 for neighbor in k_neighbors if neighbor in ground_truth_sets[idx] + ) + mean_recall += query_recall / k + + recall = mean_recall / len(queries) + return recall + + def create_index( distance_type: str, dim: int, dataset_size: int, max_edges_per_node: int ) -> Union[L2Index, IPIndex]: @@ -19,20 +84,18 @@ def create_index( dim=dim, dataset_size=dataset_size, max_edges_per_node=max_edges_per_node, - verbose=True + verbose=True, ) - if not ( - isinstance(index, flatnav.index.L2Index) - or isinstance(index, flatnav.index.IPIndex) - ): + if not (isinstance(index, L2Index) or isinstance(index, IPIndex)): raise RuntimeError("Invalid index.") return index -def test_flatnav_l2_index(): - dataset_to_index = generate_random_data(dataset_length=60_000, dim=784) - queries = generate_random_data(dataset_length=10_000, dim=784) +def test_flatnav_l2_index_random_dataset(): + dataset_to_index = generate_random_data(dataset_length=30_000, dim=784) + queries = generate_random_data(dataset_length=5_000, dim=784) + ground_truth = np.random.randint(low=0, high=50, size=(5_000, 100)) index = create_index( distance_type="l2", dim=dataset_to_index.shape[1], @@ -43,22 +106,125 @@ def test_flatnav_l2_index(): assert hasattr(index, "max_edges_per_node") assert index.max_edges_per_node == 32 - start = time.time() - index.add(data=dataset_to_index, ef_construction=64) - end = time.time() + run_test( + index=index, + ef_construction=64, + 
ef_search=32, + training_set=dataset_to_index, + queries=queries, + ground_truth=ground_truth, + ) + + +def test_flatnav_l2_index_mnist_dataset(): + training_set, queries, ground_truth, _ = get_ann_benchmark_dataset( + dataset_name="mnist-784-euclidean" + ) + + index = create_index( + distance_type="l2", + dim=training_set.shape[1], + dataset_size=training_set.shape[0], + max_edges_per_node=16, + ) + + assert hasattr(index, "max_edges_per_node") + assert index.max_edges_per_node == 16 + + run_test( + index=index, + ef_construction=128, + ef_search=256, + training_set=training_set, + queries=queries, + ground_truth=ground_truth, + assert_recall_threshold=True, + recall_threshold=0.97, + ) + + +def test_flatnav_ip_index_random_dataset(): + dataset_to_index = generate_random_data(dataset_length=30_000, dim=225) + queries = generate_random_data(dataset_length=5_000, dim=225) + ground_truth = np.random.randint(low=0, high=50, size=(5_000, 100)) - print(f"Indexing time = {end - start}") + index = create_index( + distance_type="angular", + dim=dataset_to_index.shape[1], + dataset_size=len(dataset_to_index), + max_edges_per_node=16, + ) + assert hasattr(index, "max_edges_per_node") + assert index.max_edges_per_node == 16 + + run_test( + index=index, + ef_construction=64, + ef_search=32, + training_set=dataset_to_index, + queries=queries, + ground_truth=ground_truth, + ) + +def test_flatnav_index_with_reordering(): + training_set, queries, ground_truth, _ = get_ann_benchmark_dataset( + dataset_name="mnist-784-euclidean" + ) + + index = create_index( + distance_type="l2", + dim=training_set.shape[1], + dataset_size=training_set.shape[0], + max_edges_per_node=16, + ) + + assert hasattr(index, "max_edges_per_node") + assert index.max_edges_per_node == 16 + + run_test( + index=index, + ef_construction=128, + ef_search=256, + training_set=training_set, + queries=queries, + ground_truth=ground_truth, + assert_recall_threshold=True, + recall_threshold=0.97, + use_reordering=True, + reordering_algorithm="gorder" + ) + + +def run_test( + index: Union[L2Index, IPIndex], + ef_construction: int, + ef_search: int, + training_set: np.ndarray, + queries: np.ndarray, + ground_truth: np.ndarray, + use_reordering: bool = False, + reordering_algorithm: Optional[str] = None, + assert_recall_threshold: bool = False, + recall_threshold: Optional[float] = None, +): start = time.time() - distances, node_ids = index.search(queries=queries, ef_search=64, K=100) + index.add(data=training_set, ef_construction=ef_construction) end = time.time() - print(f"Querying time = {end - start}") - assert distances.shape == node_ids.shape + print(f"Indexing time = {end - start} seconds") + + if use_reordering: + if not reordering_algorithm: + raise RuntimeError("Re-ordering algorithm must be provided.") + index.reorder(algorithm=reordering_algorithm) + recall = compute_recall( + index=index, queries=queries, ground_truth=ground_truth, ef_search=ef_search + ) -""" -Indexing time = 693.3694415092468 -Querying time = 48.112215518951416 -""" \ No newline at end of file + if assert_recall_threshold: + if not recall_threshold: + raise RuntimeError("Recall threshold must be provided.") + assert recall >= recall_threshold diff --git a/tools/query.cpp b/tools/query.cpp deleted file mode 100644 index 1d52ba0..0000000 --- a/tools/query.cpp +++ /dev/null @@ -1,224 +0,0 @@ -#include "cnpy.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using flatnav::Index; -using 
flatnav::SquaredL2Distance; - -std::shared_ptr> -buildIndex(float *data, uint32_t dim, uint64_t N, uint32_t max_edges, - uint32_t ef_construction) { - auto distance = std::make_unique(dim); - auto index = std::make_shared>( - /* dist = */ std::move(distance), /* dataset_size = */ N, - /* max_edges = */ max_edges); - - auto start = std::chrono::high_resolution_clock::now(); - - for (int label = 0; label < N; label++) { - float *element = data + (dim * label); - index->add(/* data = */ (void *)element, /* label = */ label, - /* ef_construction */ ef_construction); - if (label % 100000 == 0) - std::clog << "." << std::flush; - } - std::clog << std::endl; - - auto stop = std::chrono::high_resolution_clock::now(); - auto duration = - std::chrono::duration_cast(stop - start); - std::clog << "Build time: " << (float)duration.count() << " milliseconds" - << std::endl; - return index; -} - -int main(int argc, char **argv) { - - if (argc < 6) { - std::clog << "Usage: " << std::endl; - std::clog << "query "; - std::clog << " [--nq num_queries] [--reorder_id reorder_id] [--ef_profile " - "ef_profile] [--num_profile num_profile]" - << std::endl; - std::clog << "Positional arguments:" << std::endl; - std::clog << "\t index: Filename for the training data (float32 index)." - << std::endl; - std::clog << "\t space: Integer distance ID: 0 for L2 distance, 1 for " - "inner product (angular distance)." - << std::endl; - std::clog << "\t queries: Filename for queries (float32 file)." - << std::endl; - std::clog << "\t gtruth: Filename for ground truth (int32 file)." - << std::endl; - - std::clog << "\t k: Number of neighbors to return." << std::endl; - - std::clog << "Optional arguments:" << std::endl; - std::clog << "\t [--nq num_queries]: (Optional, default 0) Number of " - "queries to use. If 0, uses all queries." - << std::endl; - std::clog << "\t [--reorder_id reorder_id]: (Optional, default 0) Which " - "reordering algorithm to use? 0:none 1:gorder 2:indegsort " - "3:outdegsort 4:RCM 5:hubsort 6:hubcluster 7:DBG 8:corder " - "91:profiled_gorder 94:profiled_rcm 41:RCM+gorder" - << std::endl; - std::clog << "\t [--ef_profile ef_profile]: (Optional, default 100) " - "ef_search parameter to use for profiling." - << std::endl; - std::clog << "\t [--num_profile num_profile]: (Optional, default 1000) " - "Number of queries to use for profiling." - << std::endl; - return -1; - } - - // Optional arguments. 
- int num_queries = 10000; - bool reorder = false; - int reorder_ID = 0; - int ef_profile = 100; - int num_profile = 1000; - - std::string train_file = - "data/mnist-784-euclidean/mnist-784-euclidean.train.npy"; - std::string queries_file = - "data/mnist-784-euclidean/mnist-784-euclidean.test.npy"; - std::string groundtruth_file = - "data/mnist-784-euclidean/mnist-784-euclidean.gtruth.npy"; - - for (int i = 0; i < argc; ++i) { - if (std::strcmp("--nq", argv[i]) == 0) { - if ((i + 1) < argc) { - num_queries = std::stoi(argv[i + 1]); - } else { - std::cerr << "Invalid argument for optional parameter --nq" - << std::endl; - return -1; - } - } - if (std::strcmp("--reorder_id", argv[i]) == 0) { - if ((i + 1) < argc) { - reorder_ID = std::stoi(argv[i + 1]); - } else { - std::cerr << "Invalid argument for optional parameter --reorder_id" - << std::endl; - return -1; - } - } - if (std::strcmp("--ef_profile", argv[i]) == 0) { - if ((i + 1) < argc) { - ef_profile = std::stoi(argv[i + 1]); - } else { - std::cerr << "Invalid argument for optional parameter --ef_profile" - << std::endl; - return -1; - } - } - if (std::strcmp("--num_profile", argv[i]) == 0) { - if ((i + 1) < argc) { - num_profile = std::stoi(argv[i + 1]); - } else { - std::cerr << "Invalid argument for optional parameter --num_profile" - << std::endl; - return -1; - } - } - } - // Positional arguments. - std::string indexfilename(train_file); // Index filename. - int space_ID = 0; // Space ID for querying. - - // Load queries. - std::clog << "[INFO] Loading queries." << std::endl; - cnpy::NpyArray queries_array = cnpy::npy_load(queries_file); - float *queries = queries_array.data(); - - // Load ground truth. - std::clog << "[INFO] Loading ground truth." << std::endl; - cnpy::NpyArray gtruth_array = cnpy::npy_load(groundtruth_file); - uint32_t *gtruth = gtruth_array.data(); - - // EF search vector. - std::vector ef_searches{100}; - - // Number of search results. - int k = 100; - - std::clog << "[INFO] Loading training data." << std::endl; - cnpy::NpyArray train_data_array = cnpy::npy_load(train_file); - float *data = train_data_array.data(); - - std::clog << "[INFO] Building index from " << indexfilename << std::endl; - - uint32_t dim = 784; - auto index = buildIndex(/* data = */ data, /* dim = */ dim, /* N = */ 60000, - /* max_edges = */ 16, /* ef_construction = */ 200); - - // Do reordering, if necessary. - if (num_profile > num_queries) { - std::clog << "Warning: Number of profiling queries (" << num_profile - << ") is greater than number of queries (" << num_queries << ")!" - << std::endl; - num_profile = num_queries; - } - if (reorder) { - std::clog << "Using GORDER" << std::endl; - std::clog << "Reordering: " << std::endl; - auto start_r = std::chrono::high_resolution_clock::now(); - index->reorder_gorder(); - auto stop_r = std::chrono::high_resolution_clock::now(); - auto duration_r = - std::chrono::duration_cast(stop_r - start_r); - std::clog << "Reorder time: " << (float)(duration_r.count()) / (1000.0) - << " seconds" << std::endl; - } else { - std::clog << "No reordering" << std::endl; - } - - int num_gtruth_entries = 100; - - // Now, finally, do the actual search. 
- std::cout << "recall, mean_latency_ms" << std::endl; - for (int &ef_search : ef_searches) { - double mean_recall = 0; - - auto start_q = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < num_queries; i++) { - float *query = queries + dim * i; - uint32_t *g = gtruth + num_gtruth_entries * i; - - std::vector> result = index->search( - /* query = */ query, /* K = */ k, /* ef_search = */ ef_search); - - double recall = 0; - for (int j = 0; j < k; j++) { - for (int l = 0; l < k; l++) { - if (static_cast(result[j].second) == g[l]) { - recall += 1; - break; - } - } - } - recall /= k; - mean_recall = mean_recall + recall; - } - auto stop_q = std::chrono::high_resolution_clock::now(); - auto duration_q = - std::chrono::duration_cast(stop_q - start_q); - std::cout << "[INFO] recall: " << mean_recall / num_queries << std::endl; - std::cout << "[INFO] mean_latency_ms: " - << (float)(duration_q.count()) / num_queries << std::endl; - } - - return 0; -} \ No newline at end of file diff --git a/tools/query_npy.cpp b/tools/query_npy.cpp index fb4a4d7..fad19f8 100644 --- a/tools/query_npy.cpp +++ b/tools/query_npy.cpp @@ -34,7 +34,7 @@ void run(float *queries, int *gtruth, const std::string &index_filename, if (reorder) { std::clog << "[INFO] Gorder Reordering: " << std::endl; auto start_r = std::chrono::high_resolution_clock::now(); - index->reorder_gorder(); + index->reorderGOrder(); auto stop_r = std::chrono::high_resolution_clock::now(); auto duration_r = std::chrono::duration_cast(stop_r - start_r);