Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ All the database client supported
| hologres | `pip install vectordb-bench[hologres]` |
| tencent_es | `pip install vectordb-bench[tencent_es]` |
| alisql | `pip install 'vectordb-bench[alisql]'` |
| doris | `pip install vectordb-bench[doris]` |

### Run

Expand Down Expand Up @@ -321,6 +322,42 @@ Options:
--help Show this message and exit.
```

### Run Doris from command line

Doris supports ANN indexes of type HNSW starting from version 4.0.x.

```shell
NUM_PER_BATCH=1000000 vectordbbench doris --http-port=8030 --port=9030 --db-name=vector_test --case-type=Performance768D1M --stream-load-rows-per-batch=500000
```

Use the `--session-var` flag if you want to test Doris with customized session variables. For example:
```shell
NUM_PER_BATCH=1000000 vectordbbench doris --http-port=8030 --port=9030 --db-name=vector_test --case-type=Performance768D1M --stream-load-rows-per-batch=500000 --session-var enable_profile=True
```

More options:

```text
--m INTEGER hnsw m
--ef-construction INTEGER hnsw ef-construction
--username TEXT Username [default: root; required]
--password TEXT Password [default: ""]
--host TEXT Db host [default: 127.0.0.1; required]
--port INTEGER Query Port [default: 9030; required]
--http-port INTEGER Http Port [default: 8030; required]
--db-name TEXT Db name [default: test; required]
--ssl / --no-ssl Enable or disable SSL, for Doris Serverless
SSL must be enabled [default: no-ssl]
--index-prop TEXT Extra index PROPERTY as key=value
(repeatable)
--session-var TEXT Session variable key=value applied to each
SQL session (repeatable)
--stream-load-rows-per-batch INTEGER
Rows per single stream load request; default
uses NUM_PER_BATCH
--no-index Create table without ANN index
```

#### Using a configuration file.

The vectordbbench command can optionally read some or all the options from a yaml formatted configuration file.
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ vespa = [ "pyvespa" ]
lancedb = [ "lancedb" ]
oceanbase = [ "mysql-connector-python" ]
alisql = [ "mysql-connector-python" ]
doris = [ "doris-vector-search" ]

[project.urls]
"repository" = "https://github.com/zilliztech/VectorDBBench"
Expand Down
15 changes: 15 additions & 0 deletions vectordb_bench/backend/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class DB(Enum):
Hologres = "Alibaba Cloud Hologres"
TencentElasticsearch = "TencentElasticsearch"
AliSQL = "AlibabaCloudRDSMySQL"
Doris = "Doris"

@property
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901, PLR0915
Expand Down Expand Up @@ -177,6 +178,11 @@ def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901, PLR0915

return TiDB

if self == DB.Doris:
from .doris.doris import Doris

return Doris

if self == DB.Test:
from .test.test import Test

Expand Down Expand Up @@ -338,6 +344,11 @@ def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901, PLR0915

return TiDBConfig

if self == DB.Doris:
from .doris.config import DorisConfig

return DorisConfig

if self == DB.Test:
from .test.config import TestConfig

Expand Down Expand Up @@ -508,6 +519,10 @@ def case_config_cls( # noqa: C901, PLR0911, PLR0912, PLR0915
from .alisql.alisql import AliSQLIndexConfig

return AliSQLIndexConfig
if self == DB.Doris:
from .doris.config import DorisCaseConfig

return DorisCaseConfig

# DB.Pinecone, DB.Chroma, DB.Redis
return EmptyDBCaseConfig
Expand Down
199 changes: 199 additions & 0 deletions vectordb_bench/backend/clients/doris/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
from typing import Annotated, Unpack

import click
from pydantic import SecretStr

from vectordb_bench.backend.clients import DB

from ....cli.cli import (
CommonTypedDict,
HNSWBaseTypedDict,
cli,
click_parameter_decorators_from_typed_dict,
run,
)


def _parse_kv_list(_ctx, _param, values): # noqa: ANN001
"""Parse repeatable or comma-separated key=value items into a dict.
Accepts any of the following forms (and mixtures thereof):
--index-prop a=1 --index-prop b=2
--index-prop a=1,b=2
--index-prop a=1,b=2 --index-prop c=3
"""
parsed: dict[str, str] = {}
if not values:
return parsed
for item in values:
# allow comma-separated list in a single occurrence
parts = [p.strip() for p in str(item).split(",") if p and p.strip()]
for part in parts:
if "=" not in part:
msg = f"Expect key=value, got: {part}"
raise click.BadParameter(msg)
k, v = part.split("=", 1)
k = k.strip()
v = v.strip()
if not k:
msg = f"Empty key in: {part}"
raise click.BadParameter(msg)
parsed[k] = v
return parsed


class DorisTypedDict(CommonTypedDict, HNSWBaseTypedDict):
    """Command-line options accepted by the Doris benchmark command.

    Each field pairs a value type with the ``click.option`` that supplies it.
    The options are applied to the command function through
    ``click_parameter_decorators_from_typed_dict``.
    """

    # NOTE(review): the option is spelled ``--username`` and the command function
    # reads ``parameters["username"]``, while this field is named ``user_name``.
    # Confirm the field name is irrelevant to the decorator helper.
    user_name: Annotated[
        str,
        click.option(
            "--username",
            type=str,
            help="Username",
            default="root",
            show_default=True,
            required=True,
        ),
    ]
    # Defaults to an empty string, so the option may be omitted entirely.
    password: Annotated[
        str,
        click.option(
            "--password",
            type=str,
            default="",
            show_default=True,
            help="Password",
        ),
    ]
    host: Annotated[
        str,
        click.option(
            "--host",
            type=str,
            default="127.0.0.1",
            show_default=True,
            required=True,
            help="Db host",
        ),
    ]
    # Port used for SQL queries (help text: "Query Port").
    port: Annotated[
        int,
        click.option(
            "--port",
            type=int,
            default=9030,
            show_default=True,
            required=True,
            help="Query Port",
        ),
    ]
    # Separate HTTP port, distinct from the query port above.
    http_port: Annotated[
        int,
        click.option(
            "--http-port",
            type=int,
            default=8030,
            show_default=True,
            required=True,
            help="Http Port",
        ),
    ]
    db_name: Annotated[
        str,
        click.option(
            "--db-name",
            type=str,
            default="test",
            show_default=True,
            required=True,
            help="Db name",
        ),
    ]
    # Boolean on/off switch pair; off unless --ssl is given.
    ssl: Annotated[
        bool,
        click.option(
            "--ssl/--no-ssl",
            default=False,
            show_default=True,
            is_flag=True,
            help="Enable or disable SSL, for Doris Serverless SSL must be enabled",
        ),
    ]
    # Repeatable option; the callback converts the collected strings into a dict.
    index_prop: Annotated[
        dict,
        click.option(
            "--index-prop",
            type=str,
            multiple=True,
            help="Extra index PROPERTY as key=value (repeatable or comma-separated, e.g. a=1,b=2)",
            callback=_parse_kv_list,
        ),
    ]
    # Repeatable option; parsed by the same key=value callback as --index-prop.
    session_var: Annotated[
        dict,
        click.option(
            "--session-var",
            type=str,
            multiple=True,
            help="Session variable key=value applied to each SQL session (repeatable or comma-separated)",
            callback=_parse_kv_list,
        ),
    ]
    # Optional with no explicit default; annotated as possibly None when omitted.
    stream_load_rows_per_batch: Annotated[
        int | None,
        click.option(
            "--stream-load-rows-per-batch",
            type=int,
            required=False,
            help="Rows per single stream load request; default uses NUM_PER_BATCH",
        ),
    ]
    # Plain flag: present means the table is created without an ANN index.
    no_index: Annotated[
        bool,
        click.option(
            "--no-index",
            is_flag=True,
            default=False,
            show_default=True,
            help="Create table without ANN index",
        ),
    ]


@cli.command()
@click_parameter_decorators_from_typed_dict(DorisTypedDict)
def Doris(
    **parameters: Unpack[DorisTypedDict],
):
    # CLI entry point: turns the parsed options into a DorisConfig and a
    # DorisCaseConfig and hands them to the shared `run` helper.
    # (Documented with comments rather than a docstring so the command's
    # --help output stays exactly as it was.)
    from .config import DorisCaseConfig, DorisConfig

    # Start from the user's --index-prop pairs, then fill in the dedicated HNSW
    # options under their Doris property names. setdefault means an explicit
    # --index-prop entry wins over --m / --ef-construction.
    index_properties: dict[str, str] = dict(parameters.get("index_prop") or {})
    hnsw_option_to_property = (
        ("m", "max_degree"),
        ("ef_construction", "ef_construction"),
    )
    for option_name, property_name in hnsw_option_to_property:
        option_value = parameters.get(option_name)
        if option_value is not None:
            index_properties.setdefault(property_name, str(option_value))

    session_vars: dict[str, str] = parameters.get("session_var") or {}

    connection_config = DorisConfig(
        db_label=parameters["db_label"],
        user_name=parameters["username"],
        password=SecretStr(parameters["password"]),
        host=parameters["host"],
        port=parameters["port"],
        http_port=parameters["http_port"],
        db_name=parameters["db_name"],
        ssl=parameters["ssl"],
    )
    # metric_type is deliberately left unset here: it should come from the
    # dataset, and the Assembler will set it on the case config.
    case_config = DorisCaseConfig(
        index_properties=index_properties,
        session_vars=session_vars,
        stream_load_rows_per_batch=parameters.get("stream_load_rows_per_batch"),
        no_index=parameters.get("no_index", False),
    )

    run(
        db=DB.Doris,
        db_config=connection_config,
        db_case_config=case_config,
        **parameters,
    )
82 changes: 82 additions & 0 deletions vectordb_bench/backend/clients/doris/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import logging

from pydantic import BaseModel, SecretStr, validator

from ..api import DBCaseConfig, DBConfig, MetricType

log = logging.getLogger(__name__)


class DorisConfig(DBConfig):
    # Connection settings for a Doris deployment. Only `password` has no default.
    user_name: str = "root"
    password: SecretStr
    host: str = "127.0.0.1"
    port: int = 9030
    # Doris FE HTTP port for stream load. Default 8030 (8040 for HTTPS if enabled).
    http_port: int = 8030
    db_name: str = "test"
    ssl: bool = False

    @validator("*")
    def not_empty_field(cls, v: any, field: any):
        # Accept every value unchanged, so empty strings (e.g. a blank password)
        # pass validation.
        # NOTE(review): presumably this overrides a stricter same-named validator
        # on DBConfig - confirm against the base class.
        return v

    def to_dict(self) -> dict:
        """Return the connection settings as a plain dict with the secret unwrapped.

        The keys are ``host``, ``port``, ``http_port``, ``user``, ``password``
        and ``database``; the ``ssl`` flag is not included.
        """
        connection = {
            "host": self.host,
            "port": self.port,
            "http_port": self.http_port,
        }
        connection["user"] = self.user_name
        connection["password"] = self.password.get_secret_value()
        connection["database"] = self.db_name
        return connection


class DorisCaseConfig(BaseModel, DBCaseConfig):
    # Per-case settings for Doris: metric, index properties, session variables
    # and load batching.
    metric_type: MetricType | None = None
    # Optional explicit HNSW params for convenience
    m: int | None = None
    ef_construction: int | None = None
    # Arbitrary index properties and session variables
    index_properties: dict[str, str] | None = None
    session_vars: dict[str, str] | None = None
    # Control rows per single stream load request
    stream_load_rows_per_batch: int | None = None
    # Create table without ANN index
    no_index: bool = False

    def get_metric_fn(self) -> str:
        """Return the approximate distance function name for ``metric_type``.

        Raises:
            ValueError: ``metric_type`` is not L2, IP or COSINE (including unset).
        """
        metric = self.metric_type
        if metric == MetricType.L2:
            fn_name = "l2_distance_approximate"
        elif metric == MetricType.IP or metric == MetricType.COSINE:
            if metric == MetricType.COSINE:
                # Cosine has no dedicated function here; it shares the inner-product one.
                log.debug("Using inner_product_approximate because doris doesn't support cosine as metric type")
            fn_name = "inner_product_approximate"
        else:
            msg = f"Unsupported metric type: {self.metric_type}"
            raise ValueError(msg)
        return fn_name

    def index_param(self) -> dict:
        """Build the index property dict used when creating the index.

        ``metric_fn`` is the exact (non-approximate) function name. ``m`` and
        ``ef_construction`` are added as ``max_degree`` / ``ef_construction``
        when set, and any ``index_properties`` entries are applied last, so
        they override everything else.
        """
        # Index creation wants the exact function name, i.e. without the suffix.
        props = {"metric_fn": self.get_metric_fn().removesuffix("_approximate")}
        hnsw_settings = (
            ("max_degree", self.m),
            ("ef_construction", self.ef_construction),
        )
        for prop_name, prop_value in hnsw_settings:
            if prop_value is not None:
                props.setdefault(prop_name, str(prop_value))
        # User-supplied properties take precedence over the derived ones.
        props.update(self.index_properties or {})
        return props

    def search_param(self) -> dict:
        """Return the search-time parameters: the approximate metric function name."""
        return {"metric_fn": self.get_metric_fn()}

    def session_param(self) -> dict:
        """Return the configured session variables, or an empty dict when none are set."""
        if self.session_vars:
            return self.session_vars
        return {}
Loading