Skip to content

Commit 06e56df

Browse files
committed
Added sparse search example [skip ci]
1 parent db43713 commit 06e56df

File tree

3 files changed

+94
-0
lines changed

3 files changed

+94
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Or check out some examples:
3737

3838
- [Embeddings](examples/openai/example.cpp) with OpenAI
3939
- [Binary embeddings](examples/cohere/example.cpp) with Cohere
40+
- [Sparse search](examples/sparse/example.cpp) with Text Embeddings Inference
4041
- [Morgan fingerprints](examples/rdkit/example.cpp) with RDKit
4142
- [Recommendations](examples/disco/example.cpp) with Disco
4243

examples/sparse/CMakeLists.txt

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Build script for the sparse search example.
# Fetches cpr, nlohmann/json and libpqxx at configure time and builds `example`.
cmake_minimum_required(VERSION 3.18)

project(example)

set(CMAKE_CXX_STANDARD 17)
# Fail at configure time instead of silently falling back to an older standard.
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# for libpqxx
# Append instead of overwriting so flags supplied through CXXFLAGS or
# -DCMAKE_CXX_FLAGS=... are preserved.
string(APPEND CMAKE_CXX_FLAGS " -Wno-unknown-attributes")
set(SKIP_BUILD_TEST ON)

include(FetchContent)

FetchContent_Declare(cpr GIT_REPOSITORY https://github.com/libcpr/cpr.git GIT_TAG 1.11.1)
FetchContent_Declare(json GIT_REPOSITORY https://github.com/nlohmann/json.git GIT_TAG v3.11.3)
FetchContent_Declare(libpqxx GIT_REPOSITORY https://github.com/jtv/libpqxx.git GIT_TAG 7.10.0)
FetchContent_MakeAvailable(cpr json libpqxx)

add_executable(example example.cpp)
# The pgvector headers are located two directories above this example.
target_include_directories(example PRIVATE ${CMAKE_SOURCE_DIR}/../../include)
target_link_libraries(example PRIVATE cpr::cpr libpqxx::pqxx nlohmann_json::nlohmann_json)

examples/sparse/example.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
// good resources
2+
// https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/
3+
// https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1
4+
//
5+
// run with
6+
// text-embeddings-router --model-id opensearch-project/opensearch-neural-sparse-encoding-v1 --pooling splade
7+
8+
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include <cpr/cpr.h>
#include <nlohmann/json.hpp>
#include <pgvector/pqxx.hpp>
#include <pqxx/pqxx>
15+
16+
using json = nlohmann::json;
17+
18+
std::vector<pgvector::SparseVector> fetch_embeddings(const std::vector<std::string>& inputs) {
19+
std::string url = "http://localhost:3000/embed_sparse";
20+
json data = {
21+
{"inputs", inputs}
22+
};
23+
24+
cpr::Response r = cpr::Post(
25+
cpr::Url{url},
26+
cpr::Body{data.dump()},
27+
cpr::Header{{"Content-Type", "application/json"}}
28+
);
29+
json response = json::parse(r.text);
30+
31+
std::vector<pgvector::SparseVector> embeddings;
32+
for (auto& item : response) {
33+
std::vector<int> indices;
34+
std::vector<float> values;
35+
for (auto& e : item) {
36+
indices.emplace_back(e["index"]);
37+
values.emplace_back(e["value"]);
38+
}
39+
embeddings.emplace_back(pgvector::SparseVector(30522, indices, values));
40+
}
41+
return embeddings;
42+
}
43+
44+
int main() {
45+
pqxx::connection conn("dbname=pgvector_example");
46+
47+
pqxx::work tx(conn);
48+
tx.exec("CREATE EXTENSION IF NOT EXISTS vector");
49+
tx.exec("DROP TABLE IF EXISTS documents");
50+
tx.exec("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding sparsevec(30522))");
51+
tx.commit();
52+
53+
std::vector<std::string> input = {
54+
"The dog is barking",
55+
"The cat is purring",
56+
"The bear is growling"
57+
};
58+
auto embeddings = fetch_embeddings(input);
59+
60+
for (size_t i = 0; i < input.size(); i++) {
61+
tx.exec("INSERT INTO documents (content, embedding) VALUES ($1, $2)", pqxx::params{input[i], embeddings[i]});
62+
}
63+
tx.commit();
64+
65+
std::string query = "forest";
66+
auto query_embedding = fetch_embeddings({query})[0];
67+
pqxx::result result = tx.exec("SELECT content FROM documents ORDER BY embedding <#> $1 LIMIT 5", pqxx::params{query_embedding});
68+
for (const auto& row : result) {
69+
std::cout << row[0].as<std::string>() << std::endl;
70+
}
71+
72+
return 0;
73+
}

0 commit comments

Comments
 (0)