Skip to content

Commit

Permalink
Switch to uintptr_t's to pass pointers to/from C++. (#35) (#1)
Browse files Browse the repository at this point in the history
This is motivated by a similar change in mattress. It eliminates the need to
rely on pybind11 in downstream packages; as long as they can take a uintptr_t,
they can use knncolle. We add a knncolle_py.h header with type definitions for
correct casting of the uintptr_t; this can be obtained via include().

A side-effect of this change is that we need to implement destructors for the
pointed-to objects for each uintptr_t. This involves writing __del__ methods
for GenericIndex instances as well as creating a new Builder class. While we're
here, we move more logic into GenericIndex to reduce repetition.
  • Loading branch information
LTLA authored Dec 13, 2024
1 parent 3a633e9 commit 607c430
Show file tree
Hide file tree
Showing 26 changed files with 292 additions and 148 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
recursive-include src/knncolle/include *
1 change: 1 addition & 0 deletions lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ pybind11_add_module(knncolle_py
)

target_include_directories(knncolle_py PRIVATE "${ASSORTHEAD_INCLUDE_DIR}")
target_include_directories(knncolle_py PRIVATE "../src/knncolle/include")

set_property(TARGET knncolle_py PROPERTY CXX_STANDARD 17)

Expand Down
21 changes: 13 additions & 8 deletions lib/src/annoy.cpp
Original file line number Diff line number Diff line change
@@ -1,38 +1,43 @@
#include "def.h"
#include "knncolle_py.h"
#include "pybind11/pybind11.h"

#include <memory>
#include <stdexcept>

// Turn off manual vectorization always, to avoid small inconsistencies in
// distance calculations across otherwise-compliant machines.
#define NO_MANUAL_VECTORIZATION 1

#include "knncolle_annoy/knncolle_annoy.hpp"

BuilderPointer create_annoy_builder(int num_trees, double search_mult, std::string distance) {
uintptr_t create_annoy_builder(int num_trees, double search_mult, std::string distance) {
knncolle_annoy::AnnoyOptions opt;
opt.num_trees = num_trees;
opt.search_mult = search_mult;
auto tmp = std::make_unique<knncolle_py::WrappedBuilder>();

if (distance == "Manhattan") {
return BuilderPointer(new knncolle_annoy::AnnoyBuilder<Annoy::Manhattan, SimpleMatrix, double>(opt));
tmp->ptr.reset(new knncolle_annoy::AnnoyBuilder<Annoy::Manhattan, knncolle_py::SimpleMatrix, knncolle_py::Distance>(opt));

} else if (distance == "Euclidean") {
return BuilderPointer(new knncolle_annoy::AnnoyBuilder<Annoy::Euclidean, SimpleMatrix, double>(opt));
tmp->ptr.reset(new knncolle_annoy::AnnoyBuilder<Annoy::Euclidean, knncolle_py::SimpleMatrix, knncolle_py::Distance>(opt));

} else if (distance == "Cosine") {
return BuilderPointer(
new knncolle::L2NormalizedBuilder<SimpleMatrix, double>(
tmp->ptr.reset(
new knncolle::L2NormalizedBuilder<knncolle_py::SimpleMatrix, knncolle_py::Distance>(
new knncolle_annoy::AnnoyBuilder<
Annoy::Euclidean,
knncolle::L2NormalizedMatrix<SimpleMatrix>,
knncolle::L2NormalizedMatrix<knncolle_py::SimpleMatrix>,
double
>(opt)
)
);

} else {
throw std::runtime_error("unknown distance type '" + distance + "'");
return BuilderPointer();
}

return reinterpret_cast<uintptr_t>(static_cast<void*>(tmp.release()));
}

void init_annoy(pybind11::module& m) {
Expand Down
19 changes: 0 additions & 19 deletions lib/src/def.h

This file was deleted.

20 changes: 13 additions & 7 deletions lib/src/exhaustive.cpp
Original file line number Diff line number Diff line change
@@ -1,28 +1,34 @@
#include "def.h"
#include "knncolle_py.h"
#include "pybind11/pybind11.h"

BuilderPointer create_exhaustive_builder(std::string distance) {
#include <memory>
#include <stdexcept>

uintptr_t create_exhaustive_builder(std::string distance) {
auto tmp = std::make_unique<knncolle_py::WrappedBuilder>();

if (distance == "Manhattan") {
return BuilderPointer(new knncolle::BruteforceBuilder<knncolle::ManhattanDistance, SimpleMatrix, double>);
tmp->ptr.reset(new knncolle::BruteforceBuilder<knncolle::ManhattanDistance, knncolle_py::SimpleMatrix, knncolle_py::Distance>);

} else if (distance == "Euclidean") {
return BuilderPointer(new knncolle::BruteforceBuilder<knncolle::EuclideanDistance, SimpleMatrix, double>);
tmp->ptr.reset(new knncolle::BruteforceBuilder<knncolle::EuclideanDistance, knncolle_py::SimpleMatrix, knncolle_py::Distance>);

} else if (distance == "Cosine") {
return BuilderPointer(
tmp->ptr.reset(
new knncolle::L2NormalizedBuilder(
new knncolle::BruteforceBuilder<
knncolle::EuclideanDistance,
knncolle::L2NormalizedMatrix<SimpleMatrix>,
knncolle::L2NormalizedMatrix<knncolle_py::SimpleMatrix>,
double
>
)
);

} else {
throw std::runtime_error("unknown distance type '" + distance + "'");
return BuilderPointer();
}

return reinterpret_cast<uintptr_t>(static_cast<void*>(tmp.release()));
}

void init_exhaustive(pybind11::module& m) {
Expand Down
51 changes: 38 additions & 13 deletions lib/src/generics.cpp
Original file line number Diff line number Diff line change
@@ -1,25 +1,44 @@
#include "def.h"
#include "knncolle_py.h"

#include "pybind11/pybind11.h"
#include "pybind11/numpy.h"
#include "pybind11/stl.h"

#include <algorithm>
#include <cstdint>
#include <optional>
#include <memory>
#include <stdexcept>
#include <vector>

typedef pybind11::array_t<double, pybind11::array::f_style | pybind11::array::forcecast> DataMatrix;

PrebuiltPointer generic_build(const BuilderPointer& builder, const DataMatrix& data) {
void free_builder(uintptr_t builder_ptr) {
delete knncolle_py::cast_builder(builder_ptr);
}

uintptr_t generic_build(uintptr_t builder_ptr, const DataMatrix& data) {
auto buffer = data.request();
uint32_t NR = buffer.shape[0], NC = buffer.shape[1];
return PrebuiltPointer(builder->build_raw(SimpleMatrix(NR, NC, static_cast<const double*>(buffer.ptr))));

auto builder = knncolle_py::cast_builder(builder_ptr);
auto tmp = std::make_unique<knncolle_py::WrappedPrebuilt>();
tmp->ptr.reset(builder->ptr->build_raw(knncolle_py::SimpleMatrix(NR, NC, static_cast<const double*>(buffer.ptr))));

return reinterpret_cast<uintptr_t>(static_cast<void*>(tmp.release()));
}

uint32_t generic_num_obs(const PrebuiltPointer& prebuilt) {
void free_prebuilt(uintptr_t prebuilt_ptr) {
delete knncolle_py::cast_prebuilt(prebuilt_ptr);
}

uint32_t generic_num_obs(uintptr_t prebuilt_ptr) {
const auto& prebuilt = knncolle_py::cast_prebuilt(prebuilt_ptr)->ptr;
return prebuilt->num_observations();
}

uint32_t generic_num_dims(const PrebuiltPointer& prebuilt) {
uint32_t generic_num_dims(uintptr_t prebuilt_ptr) {
const auto& prebuilt = knncolle_py::cast_prebuilt(prebuilt_ptr)->ptr;
return prebuilt->num_dimensions();
}

Expand Down Expand Up @@ -54,7 +73,7 @@ typedef pybind11::array_t<uint32_t, pybind11::array::f_style | pybind11::array::
typedef pybind11::array_t<uint32_t, pybind11::array::f_style | pybind11::array::forcecast> ChosenVector;

pybind11::object generic_find_knn(
const PrebuiltPointer& prebuilt,
uintptr_t prebuilt_ptr,
const NeighborVector& num_neighbors,
bool force_variable_neighbors,
std::optional<ChosenVector> chosen,
Expand All @@ -63,6 +82,7 @@ pybind11::object generic_find_knn(
bool report_index,
bool report_distance)
{
const auto& prebuilt = knncolle_py::cast_prebuilt(prebuilt_ptr)->ptr;
uint32_t nobs = prebuilt->num_observations();

// Checking if we have to handle subsets.
Expand Down Expand Up @@ -206,7 +226,7 @@ pybind11::object generic_find_knn(
}

pybind11::object generic_query_knn(
const PrebuiltPointer& prebuilt,
uintptr_t prebuilt_ptr,
const DataMatrix& query,
const NeighborVector& num_neighbors,
bool force_variable_neighbors,
Expand All @@ -215,18 +235,19 @@ pybind11::object generic_query_knn(
bool report_index,
bool report_distance)
{
int nobs = prebuilt->num_observations();
size_t ndim = prebuilt->num_dimensions();
const auto& prebuilt = knncolle_py::cast_prebuilt(prebuilt_ptr)->ptr;
uint32_t nobs = prebuilt->num_observations();
uint32_t ndim = prebuilt->num_dimensions();

auto buf_info = query.request();
uint32_t nquery = buf_info.shape[1];
const double* query_ptr = static_cast<const double*>(buf_info.ptr);
if (static_cast<size_t>(buf_info.shape[0]) != ndim) {
if (static_cast<uint32_t>(buf_info.shape[0]) != ndim) {
throw std::runtime_error("mismatch in dimensionality between index and 'query'");
}

// Checking that 'k' is valid.
auto sanitize_k = [&](int k) -> int {
auto sanitize_k = [&](uint32_t k) -> int {
if (k <= nobs) {
return k;
}
Expand Down Expand Up @@ -354,13 +375,14 @@ pybind11::object generic_query_knn(
typedef pybind11::array_t<double, pybind11::array::f_style | pybind11::array::forcecast> ThresholdVector;

pybind11::object generic_find_all(
const PrebuiltPointer& prebuilt,
uintptr_t prebuilt_ptr,
std::optional<ChosenVector> chosen,
const ThresholdVector& thresholds,
int num_threads,
bool report_index,
bool report_distance)
{
const auto& prebuilt = knncolle_py::cast_prebuilt(prebuilt_ptr)->ptr;
uint32_t nobs = prebuilt->num_observations();

uint32_t num_output = nobs;
Expand Down Expand Up @@ -438,13 +460,14 @@ pybind11::object generic_find_all(
}

pybind11::object generic_query_all(
const PrebuiltPointer& prebuilt,
uintptr_t prebuilt_ptr,
const DataMatrix& query,
const ThresholdVector& thresholds,
int num_threads,
bool report_index,
bool report_distance)
{
const auto& prebuilt = knncolle_py::cast_prebuilt(prebuilt_ptr)->ptr;
size_t ndim = prebuilt->num_dimensions();

auto buf_info = query.request();
Expand Down Expand Up @@ -522,7 +545,9 @@ pybind11::object generic_query_all(
*********************************/

void init_generics(pybind11::module& m) {
m.def("free_builder", &free_builder);
m.def("generic_build", &generic_build);
m.def("free_prebuilt", &free_prebuilt);
m.def("generic_num_obs", &generic_num_obs);
m.def("generic_num_dims", &generic_num_dims);
m.def("generic_find_knn", &generic_find_knn);
Expand Down
18 changes: 10 additions & 8 deletions lib/src/hnsw.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "def.h"
#include "knncolle_py.h"
#include "pybind11/pybind11.h"

// Turn off manual vectorization always, to avoid small inconsistencies in
Expand All @@ -7,35 +7,37 @@

#include "knncolle_hnsw/knncolle_hnsw.hpp"

BuilderPointer create_hnsw_builder(int nlinks, int ef_construct, int ef_search, std::string distance) {
uintptr_t create_hnsw_builder(int nlinks, int ef_construct, int ef_search, std::string distance) {
knncolle_hnsw::HnswOptions<uint32_t, float> opt;
opt.num_links = nlinks;
opt.ef_construction = ef_construct;
opt.ef_search = ef_search;
auto tmp = std::make_unique<knncolle_py::WrappedBuilder>();

if (distance == "Manhattan") {
opt.distance_options.create = [&](int dim) -> hnswlib::SpaceInterface<float>* {
return new knncolle_hnsw::ManhattanDistance<float>(dim);
};
return BuilderPointer(new knncolle_hnsw::HnswBuilder<SimpleMatrix, double>(opt));
tmp->ptr.reset(new knncolle_hnsw::HnswBuilder<knncolle_py::SimpleMatrix, knncolle_py::Distance>(opt));

} else if (distance == "Euclidean") {
return BuilderPointer(new knncolle_hnsw::HnswBuilder<SimpleMatrix, double>(opt));
tmp->ptr.reset(new knncolle_hnsw::HnswBuilder<knncolle_py::SimpleMatrix, knncolle_py::Distance>(opt));

} else if (distance == "Cosine") {
return BuilderPointer(
new knncolle::L2NormalizedBuilder<SimpleMatrix, double>(
tmp->ptr.reset(
new knncolle::L2NormalizedBuilder<knncolle_py::SimpleMatrix, knncolle_py::Distance>(
new knncolle_hnsw::HnswBuilder<
knncolle::L2NormalizedMatrix<SimpleMatrix>,
knncolle::L2NormalizedMatrix<knncolle_py::SimpleMatrix>,
double
>(opt)
)
);

} else {
throw std::runtime_error("unknown distance type '" + distance + "'");
return BuilderPointer();
}

return reinterpret_cast<uintptr_t>(static_cast<void*>(tmp.release()));
}

void init_hnsw(pybind11::module& m) {
Expand Down
4 changes: 0 additions & 4 deletions lib/src/init.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#include "def.h"
#include "pybind11/pybind11.h"
#include "pybind11/numpy.h"
#include "pybind11/stl.h"
Expand All @@ -17,7 +16,4 @@ PYBIND11_MODULE(lib_knncolle, m) {
init_hnsw(m);
init_kmknn(m);
init_vptree(m);

pybind11::class_<Builder, BuilderPointer>(m, "Builder");
pybind11::class_<Prebuilt, PrebuiltPointer>(m, "Prebuilt");
}
22 changes: 14 additions & 8 deletions lib/src/kmknn.cpp
Original file line number Diff line number Diff line change
@@ -1,28 +1,34 @@
#include "def.h"
#include "knncolle_py.h"
#include "pybind11/pybind11.h"

BuilderPointer create_kmknn_builder(std::string distance) {
#include <memory>
#include <stdexcept>

uintptr_t create_kmknn_builder(std::string distance) {
auto tmp = std::make_unique<knncolle_py::WrappedBuilder>();

if (distance == "Manhattan") {
return BuilderPointer(new knncolle::KmknnBuilder<knncolle::ManhattanDistance, SimpleMatrix, double>);
tmp->ptr.reset(new knncolle::KmknnBuilder<knncolle::ManhattanDistance, knncolle_py::SimpleMatrix, knncolle_py::Distance>);

} else if (distance == "Euclidean") {
return BuilderPointer(new knncolle::KmknnBuilder<knncolle::EuclideanDistance, SimpleMatrix, double>);
tmp->ptr.reset(new knncolle::KmknnBuilder<knncolle::EuclideanDistance, knncolle_py::SimpleMatrix, knncolle_py::Distance>);

} else if (distance == "Cosine") {
return BuilderPointer(
new knncolle::L2NormalizedBuilder<SimpleMatrix, double>(
tmp->ptr.reset(
new knncolle::L2NormalizedBuilder<knncolle_py::SimpleMatrix, knncolle_py::Distance>(
new knncolle::KmknnBuilder<
knncolle::EuclideanDistance,
knncolle::L2NormalizedMatrix<SimpleMatrix>,
knncolle::L2NormalizedMatrix<knncolle_py::SimpleMatrix>,
double
>
)
);

} else {
throw std::runtime_error("unknown distance type '" + distance + "'");
return BuilderPointer();
}

return reinterpret_cast<uintptr_t>(static_cast<void*>(tmp.release()));
}

void init_kmknn(pybind11::module& m) {
Expand Down
Loading

0 comments on commit 607c430

Please sign in to comment.