Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 18 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,23 +42,24 @@ All the database client supported
| Optional database client | install command |
|--------------------------|---------------------------------------------|
| pymilvus, zilliz_cloud (*default*) | `pip install vectordb-bench` |
| all (*client requirements might conflict with each other*) | `pip install 'vectordb-bench[all]'` |
| qdrant | `pip install 'vectordb-bench[qdrant]'` |
| pinecone | `pip install 'vectordb-bench[pinecone]'` |
| weaviate | `pip install 'vectordb-bench[weaviate]'` |
| elastic, aliyun_elasticsearch| `pip install 'vectordb-bench[elastic]'` |
| pgvector, pgvectorscale, pgdiskann, alloydb | `pip install 'vectordb-bench[pgvector]'` |
| pgvecto.rs | `pip install 'vectordb-bench[pgvecto_rs]'` |
| redis | `pip install 'vectordb-bench[redis]'` |
| memorydb | `pip install 'vectordb-bench[memorydb]'` |
| chromadb | `pip install 'vectordb-bench[chromadb]'` |
| awsopensearch | `pip install 'vectordb-bench[opensearch]'` |
| aliyun_opensearch | `pip install 'vectordb-bench[aliyun_opensearch]'` |
| mongodb | `pip install 'vectordb-bench[mongodb]'` |
| tidb | `pip install 'vectordb-bench[tidb]'` |
| vespa | `pip install 'vectordb-bench[vespa]'` |
| oceanbase | `pip install 'vectordb-bench[oceanbase]'` |
| hologres | `pip install 'vectordb-bench[hologres]'` |
| all (*client requirements might conflict with each other*) | `pip install vectordb-bench[all]` |
| qdrant | `pip install vectordb-bench[qdrant]` |
| pinecone | `pip install vectordb-bench[pinecone]` |
| weaviate | `pip install vectordb-bench[weaviate]` |
| elastic, aliyun_elasticsearch| `pip install vectordb-bench[elastic]` |
| pgvector, pgvectorscale, pgdiskann, alloydb | `pip install vectordb-bench[pgvector]` |
| pgvecto.rs | `pip install vectordb-bench[pgvecto_rs]` |
| redis | `pip install vectordb-bench[redis]` |
| memorydb | `pip install vectordb-bench[memorydb]` |
| chromadb | `pip install vectordb-bench[chromadb]` |
| awsopensearch | `pip install vectordb-bench[opensearch]` |
| aliyun_opensearch | `pip install vectordb-bench[aliyun_opensearch]` |
| mongodb | `pip install vectordb-bench[mongodb]` |
| tidb | `pip install vectordb-bench[tidb]` |
| vespa | `pip install vectordb-bench[vespa]` |
| oceanbase | `pip install vectordb-bench[oceanbase]` |
| hologres | `pip install vectordb-bench[hologres]` |
| tencent_es | `pip install vectordb-bench[tencent_es]` |

### Run

Expand Down
2 changes: 1 addition & 1 deletion install/requirements_py3.11.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ grpcio-tools==1.53.0
qdrant-client
pinecone-client
weaviate-client
elasticsearch
elasticsearch==8.16.0
pgvector
pgvecto_rs[psycopg3]>=0.2.1
sqlalchemy
Expand Down
16 changes: 16 additions & 0 deletions vectordb_bench/backend/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class DB(Enum):
OceanBase = "OceanBase"
S3Vectors = "S3Vectors"
Hologres = "Alibaba Cloud Hologres"
TencentElasticsearch = "TencentElasticsearch"

@property
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901, PLR0915
Expand Down Expand Up @@ -200,6 +201,11 @@ def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901, PLR0915

return Hologres

if self == DB.TencentElasticsearch:
from .tencent_elasticsearch.tencent_elasticsearch import TencentElasticsearch

return TencentElasticsearch

msg = f"Unknown DB: {self.name}"
raise ValueError(msg)

Expand Down Expand Up @@ -351,6 +357,11 @@ def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901, PLR0915

return HologresConfig

if self == DB.TencentElasticsearch:
from .tencent_elasticsearch.config import TencentElasticsearchConfig

return TencentElasticsearchConfig

msg = f"Unknown DB: {self.name}"
raise ValueError(msg)

Expand Down Expand Up @@ -477,6 +488,11 @@ def case_config_cls( # noqa: C901, PLR0911, PLR0912

return HologresIndexConfig

if self == DB.TencentElasticsearch:
from .tencent_elasticsearch.config import TencentElasticsearchIndexConfig

return TencentElasticsearchIndexConfig

# DB.Pinecone, DB.Chroma, DB.Redis
return EmptyDBCaseConfig

Expand Down
1 change: 1 addition & 0 deletions vectordb_bench/backend/clients/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class IndexType(str, Enum):
ES_HNSW_INT8 = "int8_hnsw"
ES_HNSW_INT4 = "int4_hnsw"
ES_HNSW_BBQ = "bbq_hnsw"
TES_VSEARCH = "vsearch"
ES_IVFFlat = "ivfflat"
GPU_IVF_FLAT = "GPU_IVF_FLAT"
GPU_BRUTE_FORCE = "GPU_BRUTE_FORCE"
Expand Down
1 change: 1 addition & 0 deletions vectordb_bench/backend/clients/elastic_cloud/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def __hash__(self) -> int:
self.use_routing,
self.efConstruction,
self.M,
2,
)
)

Expand Down
96 changes: 96 additions & 0 deletions vectordb_bench/backend/clients/tencent_elasticsearch/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import os
from typing import Annotated, Unpack

import click
from pydantic import SecretStr

from vectordb_bench.backend.clients import DB
from vectordb_bench.cli.cli import (
CommonTypedDict,
cli,
click_parameter_decorators_from_typed_dict,
run,
)


class TencentElasticsearchTypedDict(CommonTypedDict):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be great if this CLI tool could support ElasticCloud, AliyunES, and TencentES. Otherwise, no rush—I can handle it in a follow-up PR.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have made the TencentElasticsearch client class directly inherit from the ElasticCloud class.
Sorry, I haven't made changes to the CLI tool because I don't have test resources for ElasticCloud or AliyunES, and I'm unsure if the modifications will work properly. Could you merge my PR first, and then merge the CLI tool changes later?

# Fields of TencentElasticsearchTypedDict. Each field is annotated with the
# click option that populates it on the command line.

# Connection options.
scheme: Annotated[
    str,
    click.option(
        "--scheme",
        type=str,
        help="Protocol in use to connect to the node",
        default="http",
        show_default=True,
    ),
]
host: Annotated[
    str,
    # Help text previously read "shot connection string" (typo for "host").
    click.option("--host", type=str, help="host connection string", required=True),
]
port: Annotated[
    int,
    click.option("--port", type=int, help="Port to connect to", default=9200, show_default=True),
]
user: Annotated[
    str,
    click.option("--user", type=str, help="Db username", required=True),
]
password: Annotated[
    str,
    click.option(
        "--password",
        type=str,
        help="TencentElasticsearch password",
        # Read lazily from the environment so the secret need not be typed on
        # the command line; falls back to an empty string when unset.
        default=lambda: os.environ.get("TES_PASSWORD", ""),
        show_default="$TES_PASSWORD",
    ),
]

# Index-build and search tuning options.
m: Annotated[
    int,
    click.option("--m", type=int, help="HNSW M parameter", default=16, show_default=True),
]
ef_construction: Annotated[
    int,
    click.option(
        "--ef_construction",
        type=int,
        help="HNSW efConstruction parameter",
        default=200,
        show_default=True,
    ),
]
num_candidates: Annotated[
    int,
    click.option(
        "--num_candidates",
        type=int,
        help="Number of candidates to consider during searching",
        default=200,
        show_default=True,
    ),
]


# CLI entry point for benchmarking Tencent Elasticsearch: builds the
# connection and index configs from the parsed options and hands them to
# `run`. Documented with comments rather than a docstring because click would
# otherwise surface a docstring as the command's help text.
@cli.command()
@click_parameter_decorators_from_typed_dict(TencentElasticsearchTypedDict)
def TencentElasticsearch(**parameters: Unpack[TencentElasticsearchTypedDict]):
    from .config import TencentElasticsearchConfig, TencentElasticsearchIndexConfig

    # Where and how to connect.
    connection = TencentElasticsearchConfig(
        db_label=parameters["db_label"],
        scheme=parameters["scheme"],
        host=parameters["host"],
        port=parameters["port"],
        user=parameters["user"],
        password=SecretStr(parameters["password"]),
    )

    # HNSW build parameters plus the search-time candidate count.
    index_settings = TencentElasticsearchIndexConfig(
        M=parameters["m"],
        efConstruction=parameters["ef_construction"],
        num_candidates=parameters["num_candidates"],
    )

    # The full option mapping is forwarded as well, exactly as received.
    run(
        db=DB.TencentElasticsearch,
        db_config=connection,
        db_case_config=index_settings,
        **parameters,
    )
92 changes: 92 additions & 0 deletions vectordb_bench/backend/clients/tencent_elasticsearch/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from enum import Enum

from pydantic import BaseModel, SecretStr

from ..api import DBCaseConfig, DBConfig, IndexType, MetricType


# Connection settings for a Tencent Elasticsearch node: where to connect
# (scheme/host/port) and the basic-auth credentials to use.
class TencentElasticsearchConfig(DBConfig, BaseModel):
    #: Protocol in use to connect to the node
    scheme: str = "http"
    host: str = ""
    port: int = 9200
    user: str = "elastic"
    password: SecretStr

    def to_dict(self) -> dict:
        """Return the settings as a mapping with ``hosts`` and ``basic_auth`` keys.

        ``hosts`` is a one-element list describing the node; ``basic_auth`` is
        a ``(user, password)`` tuple with the password unwrapped to plain text.
        """
        node = {"scheme": self.scheme, "host": self.host, "port": self.port}
        credentials = (self.user, self.password.get_secret_value())
        return {"hosts": [node], "basic_auth": credentials}


class ESElementType(str, Enum):
    """Element types selectable for the ``element_type`` of the dense vector field."""

    float = "float"  # 4 byte
    byte = "byte"  # 1 byte, -128 to 127


class TencentElasticsearchIndexConfig(BaseModel, DBCaseConfig):
    """Index and search settings for a Tencent Elasticsearch benchmark case.

    Equality and hashing consider only ``index``, ``number_of_shards``,
    ``number_of_replicas``, ``use_routing``, ``efConstruction`` and ``M``;
    the remaining fields do not take part in the comparison.
    """

    element_type: ESElementType = ESElementType.float
    index: IndexType = IndexType.TES_VSEARCH
    number_of_shards: int = 1
    number_of_replicas: int = 0
    refresh_interval: str = "3s"
    merge_max_thread_count: int = 8
    use_rescore: bool = False
    oversample_ratio: float = 2.0
    use_routing: bool = False
    use_force_merge: bool = True

    metric_type: MetricType | None = None
    efConstruction: int | None = None
    M: int | None = None
    num_candidates: int | None = None

    def __eq__(self, obj: object) -> bool:
        """Compare the fields that identify an index configuration.

        Any object exposing the compared attributes is accepted. For an
        object that lacks one of them (e.g. ``None``), ``NotImplemented`` is
        returned so Python falls back to its default comparison instead of
        raising ``AttributeError``.
        """
        try:
            return (
                self.index == obj.index
                and self.number_of_shards == obj.number_of_shards
                and self.number_of_replicas == obj.number_of_replicas
                and self.use_routing == obj.use_routing
                and self.efConstruction == obj.efConstruction
                and self.M == obj.M
            )
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        """Hash the same fields that ``__eq__`` compares."""
        return hash(
            (
                self.index,
                self.number_of_shards,
                self.number_of_replicas,
                self.use_routing,
                self.efConstruction,
                self.M,
                # NOTE(review): constant mixed into the hash; presumably a
                # salt/version marker - confirm its purpose before changing.
                2,
            )
        )

    def parse_metric(self) -> str:
        """Map ``metric_type`` to a similarity name.

        Returns:
            ``"l2_norm"`` for ``MetricType.L2``, ``"dot_product"`` for
            ``MetricType.IP``, and ``"cosine"`` for anything else, including
            an unset (``None``) metric.
        """
        if self.metric_type == MetricType.L2:
            return "l2_norm"
        if self.metric_type == MetricType.IP:
            return "dot_product"
        return "cosine"

    def index_param(self) -> dict:
        """Return the ``dense_vector`` field definition for this configuration."""
        return {
            "type": "dense_vector",
            "index": True,
            "element_type": self.element_type.value,
            "similarity": self.parse_metric(),
            "index_options": {
                "type": self.index.value,
                "index": "hnsw",
                "m": self.M,
                "ef_construction": self.efConstruction,
            },
        }

    def search_param(self) -> dict:
        """Return the search-time parameters (``num_candidates`` only)."""
        return {
            "num_candidates": self.num_candidates,
        }
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import logging
import time
from contextlib import contextmanager

from vectordb_bench.backend.filter import Filter, FilterOp

from ..elastic_cloud.elastic_cloud import ElasticCloud
from .config import TencentElasticsearchIndexConfig

# Raise the log threshold of the Elasticsearch client libraries to WARNING.
for logger in ("elasticsearch", "elastic_transport"):
    logging.getLogger(logger).setLevel(logging.WARNING)

# Module-level logger for this client.
log = logging.getLogger(__name__)


# Seconds slept after the index refresh and between force-merge status polls.
SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC = 30


class TencentElasticsearch(ElasticCloud):
    """Tencent Elasticsearch client, inheriting its behaviour from ElasticCloud.

    Only connection setup (``init``) and the post-insert ``optimize`` step are
    overridden here.
    """

    supported_filter_types: list[FilterOp] = [
        FilterOp.NonFilter,
        FilterOp.NumGE,
        FilterOp.StrEqual,
    ]

    @contextmanager
    def init(self) -> None:
        """Connect to Elasticsearch for the duration of the ``with`` block.

        ``self.client`` is set on entry and removed again on exit. The
        teardown runs in a ``finally`` clause so the attribute is also removed
        when the body of the ``with`` block raises.
        """
        # Imported lazily so the module can be loaded without the client installed.
        from elasticsearch import Elasticsearch

        self.client = Elasticsearch(**self.db_config, request_timeout=1800)

        try:
            yield
        finally:
            self.client = None
            del self.client

    def optimize(self, data_size: int | None = None):
        """optimize will be called between insertion and search in performance cases.

        Refreshes the index, waits, and - when ``use_force_merge`` is enabled -
        starts a force merge down to a single segment and polls the task until
        it reports completion.

        Args:
            data_size: Accepted for interface compatibility; not used here.
        """
        assert self.client is not None, "should self.init() first"
        self.client.indices.refresh(index=self.indice)
        time.sleep(SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC)
        if self.case_config.use_force_merge:
            # Start the merge asynchronously and keep the task id so progress
            # can be polled instead of holding one long request open.
            force_merge_task_id = self.client.indices.forcemerge(
                index=self.indice,
                max_num_segments=1,
                wait_for_completion=False,
            )["task"]
            log.info(f"Elasticsearch force merge task id: {force_merge_task_id}")
            while True:
                time.sleep(SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC)
                task_status = self.client.tasks.get(task_id=force_merge_task_id)
                if task_status["completed"]:
                    return
2 changes: 2 additions & 0 deletions vectordb_bench/cli/vectordbbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from ..backend.clients.qdrant_local.cli import QdrantLocal
from ..backend.clients.redis.cli import Redis
from ..backend.clients.s3_vectors.cli import S3Vectors
from ..backend.clients.tencent_elasticsearch.cli import TencentElasticsearch
from ..backend.clients.test.cli import Test
from ..backend.clients.tidb.cli import TiDB
from ..backend.clients.vespa.cli import Vespa
Expand Down Expand Up @@ -50,6 +51,7 @@
cli.add_command(QdrantLocal)
cli.add_command(BatchCli)
cli.add_command(S3Vectors)
cli.add_command(TencentElasticsearch)


if __name__ == "__main__":
Expand Down
Loading