Added sparse search example [skip ci]

ankane · ankane · commit 06e56df0143c · 2025-01-13T13:42:38.000-08:00
diff --git a/README.md b/README.md
@@ -37,6 +37,7 @@ Or check out some examples:
 
 - [Embeddings](examples/openai/example.cpp) with OpenAI
 - [Binary embeddings](examples/cohere/example.cpp) with Cohere
+- [Sparse search](examples/sparse/example.cpp) with Text Embeddings Inference
 - [Morgan fingerprints](examples/rdkit/example.cpp) with RDKit
 - [Recommendations](examples/disco/example.cpp) with Disco
 
diff --git a/examples/sparse/CMakeLists.txt b/examples/sparse/CMakeLists.txt
@@ -0,0 +1,20 @@
+# Standalone build script for the sparse search example (example.cpp).
+cmake_minimum_required(VERSION 3.18)
+
+project(example)
+
+# Require C++17 and fail at configure time if the compiler cannot provide it,
+# instead of silently falling back to an older standard.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# for libpqxx
+# Append to CMAKE_CXX_FLAGS so flags passed on the command line or by a
+# toolchain file are preserved rather than overwritten.
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-attributes")
+set(SKIP_BUILD_TEST ON)
+
+include(FetchContent)
+
+# Third-party dependencies, pinned to release tags and fetched at configure time.
+FetchContent_Declare(cpr GIT_REPOSITORY https://github.com/libcpr/cpr.git GIT_TAG 1.11.1)
+FetchContent_Declare(json GIT_REPOSITORY https://github.com/nlohmann/json.git GIT_TAG v3.11.3)
+FetchContent_Declare(libpqxx GIT_REPOSITORY https://github.com/jtv/libpqxx.git GIT_TAG 7.10.0)
+FetchContent_MakeAvailable(cpr json libpqxx)
+
+add_executable(example example.cpp)
+# The include directory is located two levels above this directory.
+# CMAKE_CURRENT_SOURCE_DIR keeps the path correct even if this directory is
+# pulled in with add_subdirectory() from another project.
+target_include_directories(example PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
+target_link_libraries(example PRIVATE cpr::cpr libpqxx::pqxx nlohmann_json::nlohmann_json)
diff --git a/examples/sparse/example.cpp b/examples/sparse/example.cpp
@@ -0,0 +1,73 @@
+// good resources
+// https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/
+// https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1
+//
+// run with
+// text-embeddings-router --model-id opensearch-project/opensearch-neural-sparse-encoding-v1 --pooling splade
+
+#include <cstdint>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <cpr/cpr.h>
+#include <nlohmann/json.hpp>
+#include <pgvector/pqxx.hpp>
+#include <pqxx/pqxx>
+
+using json = nlohmann::json;
+
+// Requests sparse embeddings for `inputs` from the embedding server at
+// http://localhost:3000/embed_sparse and converts each one to a
+// pgvector::SparseVector.
+//
+// The response body is parsed as JSON and iterated as a sequence with one
+// entry per embedding; each entry is itself a sequence of objects carrying an
+// "index" and a "value" member.
+//
+// Throws std::runtime_error if the server does not answer with HTTP 200.
+// nlohmann::json exceptions propagate if the body is not valid JSON or an
+// element lacks "index"/"value".
+std::vector<pgvector::SparseVector> fetch_embeddings(const std::vector<std::string>& inputs) {
+    // must match the sparsevec(30522) column the embeddings are stored in
+    constexpr int dimensions = 30522;
+
+    const std::string url = "http://localhost:3000/embed_sparse";
+    const json data = {
+        {"inputs", inputs}
+    };
+
+    const cpr::Response r = cpr::Post(
+        cpr::Url{url},
+        cpr::Body{data.dump()},
+        cpr::Header{{"Content-Type", "application/json"}}
+    );
+    // fail loudly instead of trying to parse an error response as embeddings
+    if (r.status_code != 200) {
+        throw std::runtime_error("Bad status: " + std::to_string(r.status_code));
+    }
+    const json response = json::parse(r.text);
+
+    std::vector<pgvector::SparseVector> embeddings;
+    embeddings.reserve(response.size());
+    for (const auto& item : response) {
+        std::vector<int> indices;
+        std::vector<float> values;
+        indices.reserve(item.size());
+        values.reserve(item.size());
+        for (const auto& e : item) {
+            // at() + get<T>() make a missing key or a wrong type an explicit
+            // exception rather than relying on implicit conversions
+            indices.push_back(e.at("index").get<int>());
+            values.push_back(e.at("value").get<float>());
+        }
+        embeddings.emplace_back(dimensions, indices, values);
+    }
+    return embeddings;
+}
+
+// Recreates the documents table, stores three sentences together with their
+// sparse embeddings, then prints the stored contents ordered by the `<#>`
+// operator against the embedding of the query "forest".
+int main() {
+    pqxx::connection conn("dbname=pgvector_example");
+
+    // A pqxx::work is finished once commit() is called and rejects further
+    // statements, so it cannot be reused across the setup, insert and query
+    // phases. A nontransaction runs every statement on its own instead.
+    pqxx::nontransaction tx(conn);
+    tx.exec("CREATE EXTENSION IF NOT EXISTS vector");
+    tx.exec("DROP TABLE IF EXISTS documents");
+    tx.exec("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding sparsevec(30522))");
+
+    const std::vector<std::string> input = {
+        "The dog is barking",
+        "The cat is purring",
+        "The bear is growling"
+    };
+    const auto embeddings = fetch_embeddings(input);
+
+    for (size_t i = 0; i < input.size(); i++) {
+        // at() throws if the server returned fewer embeddings than inputs
+        tx.exec("INSERT INTO documents (content, embedding) VALUES ($1, $2)", pqxx::params{input[i], embeddings.at(i)});
+    }
+
+    const std::string query = "forest";
+    // at(0) throws instead of reading past the end of an empty result
+    const auto query_embedding = fetch_embeddings({query}).at(0);
+    const pqxx::result result = tx.exec("SELECT content FROM documents ORDER BY embedding <#> $1 LIMIT 5", pqxx::params{query_embedding});
+    for (const auto& row : result) {
+        std::cout << row[0].as<std::string>() << '\n';
+    }
+
+    return 0;
+}