Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Making it possible to run experiments with HNSW by using --index-type option for run-big-bench.py #26

Merged
merged 2 commits into from
Jan 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions experiments/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ yandex-deep-bench:
--gtruth /media/scratch/yandex-deep/ground_truth.10K.bin \
--metric l2 > logs.txt 2>&1

# Other available options include: --index-type (hnsw, flatnav)
sift-bench:
poetry run python run-big-bench.py \
--dataset /root/data/sift-128-euclidean/sift-128-euclidean.train.npy \
Expand Down
107 changes: 80 additions & 27 deletions experiments/run-big-bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,14 +116,14 @@ def load_ground_truth(path: str) -> Tuple[np.ndarray, np.ndarray, int, int]:


def compute_metrics(
index: Union[flatnav.index.L2Index, flatnav.index.IPIndex],
index: Union[flatnav.index.L2Index, flatnav.index.IPIndex, hnswlib.Index],
queries: np.ndarray,
ground_truth: np.ndarray,
ef_search: int,
k=100,
) -> Tuple[float, float]:
"""
Compute recall and QPS for given queries, ground truth, and a FlaNav index.
Compute recall and QPS for the given queries and ground truth using the given index (FlatNav or HNSW).

Args:
- index: A FlatNav or HNSW index to search.
Expand All @@ -136,9 +136,19 @@ def compute_metrics(
QPS over all queries

"""
start = time.time()
_, top_k_indices = index.search(queries=queries, ef_search=ef_search, K=k)
end = time.time()
is_flatnav_index = type(index) in (flatnav.index.L2Index, flatnav.index.IPIndex)
if is_flatnav_index:
start = time.time()
_, top_k_indices = index.search(
queries=queries, ef_search=ef_search, K=k, num_initializations=300
)
end = time.time()
else:
index.set_ef(ef_search)
start = time.time()
# HNSW search returns (ids, distances) instead of (distances, ids)
top_k_indices, _ = index.knn_query(data=queries, k=k)
end = time.time()

querying_time = end - start
qps = len(queries) / querying_time
Expand All @@ -159,34 +169,72 @@ def compute_metrics(
return recall, qps


def train_flatnav_index(
def create_and_train_hnsw_index(
    data: np.ndarray,
    space: str,
    dim: int,
    dataset_size: int,
    ef_construction: int,
    max_edges_per_node: int,
    num_threads,
) -> hnswlib.Index:
    """
    Build an hnswlib index over `data` and log how long insertion took.

    Args:
        - data: Vectors to index, one row per vector.
        - space: hnswlib distance space (e.g. "l2" or "ip").
        - dim: Dimensionality of each vector.
        - dataset_size: Number of vectors; used as the index capacity and to
          assign sequential ids 0..dataset_size-1.
        - ef_construction: hnswlib `ef_construction` build parameter.
        - max_edges_per_node: hnswlib `M` build parameter.
        - num_threads: Thread count hnswlib uses while inserting.

    Returns:
        The populated hnswlib.Index.
    """
    index = hnswlib.Index(space=space, dim=dim)
    index.init_index(
        max_elements=dataset_size,
        ef_construction=ef_construction,
        M=max_edges_per_node,
    )
    index.set_num_threads(num_threads)

    # Time only the insertion (graph construction) step.
    ids = np.arange(dataset_size)
    start = time.time()
    index.add_items(data=data, ids=ids)
    end = time.time()
    logging.info(f"Indexing time = {end - start} seconds")

    return index


def train_index(
train_dataset: np.ndarray,
distance_type: str,
dim: int,
dataset_size: int,
max_edges_per_node: int,
ef_construction: int,
index_type: str = "flatnav",
use_hnsw_base_layer: bool = False,
hnsw_base_layer_filename: Optional[str] = None,
num_build_threads: int = 1,
) -> Union[flatnav.index.L2Index, flatnav.index.IPIndex]:
) -> Union[flatnav.index.L2Index, flatnav.index.IPIndex, hnswlib.Index]:
if index_type == "hnsw":
# The datasets use "angular" where hnswlib expects "ip", so convert any non-l2 metric to "ip".
_distance_type = distance_type if distance_type == "l2" else "ip"
# HNSWlib will have M * 2 edges in the base layer.
# So if we want to use M=32, we need to set M=16 here.
hnsw_index = create_and_train_hnsw_index(
data=train_dataset,
space=_distance_type,
dim=dim,
dataset_size=dataset_size,
ef_construction=ef_construction,
max_edges_per_node=max_edges_per_node // 2,
num_threads=num_build_threads,
)

return hnsw_index

if use_hnsw_base_layer:
if not hnsw_base_layer_filename:
raise ValueError("Must provide a filename for the HNSW base layer graph.")

# build the HNSW index to use as the base layer
# The datasets use "angular" where hnswlib expects "ip", so convert any non-l2 metric to "ip".
_distance_type = distance_type if distance_type == "l2" else "ip"
hnsw_index = hnswlib.Index(space=_distance_type, dim=dim)

# HNSWlib will have M * 2 edges in the base layer.
# So if we want to use M=32, we need to set M=16 here.
hnsw_index.init_index(
max_elements=dataset_size,
hnsw_index = create_and_train_hnsw_index(
data=train_dataset,
space=_distance_type,
dim=dim,
dataset_size=dataset_size,
ef_construction=ef_construction,
M=max_edges_per_node // 2,
max_edges_per_node=max_edges_per_node // 2,
num_threads=num_build_threads,
)
hnsw_index.add_items(data=train_dataset, ids=np.arange(dataset_size))

# Now extract the base layer's graph and save it to a file.
# This will be a Matrix Market file that we use to construct the Flatnav index.
Expand All @@ -211,13 +259,13 @@ def train_flatnav_index(
max_edges_per_node=max_edges_per_node,
verbose=False,
)
index.num_threads = num_build_threads

logging.info(f"Using {index.num_threads} threads for index construction.")
index.set_num_threads(num_build_threads)

# Train the index.
start = time.time()
index.add(data=train_dataset, ef_construction=ef_construction)
index.add(
data=train_dataset, ef_construction=ef_construction, num_initializations=300
)
end = time.time()

logging.info(f"Indexing time = {end - start} seconds")
Expand All @@ -233,6 +281,7 @@ def main(
ef_search_params: List[int],
num_node_links: List[int],
distance_type: str,
index_type: str = "flatnav",
use_hnsw_base_layer: bool = False,
hnsw_base_layer_filename: Optional[str] = None,
reordering_strategies: List[str] | None = None,
Expand All @@ -245,7 +294,8 @@ def main(
for node_links in num_node_links:
for ef_cons in ef_cons_params:
for ef_search in ef_search_params:
index = train_flatnav_index(
index = train_index(
index_type=index_type,
train_dataset=train_dataset,
max_edges_per_node=node_links,
ef_construction=ef_cons,
Expand All @@ -268,7 +318,7 @@ def main(
index.reorder(strategies=reordering_strategies)

if num_search_threads > 1:
index.num_threads = num_search_threads
index.set_num_threads(num_search_threads)

recall, qps = compute_metrics(
index=index,
Expand All @@ -288,21 +338,25 @@ def parse_arguments() -> argparse.Namespace:
description="Benchmark Flatnav on Big ANN datasets."
)

parser.add_argument(
"--index-type",
default="flatnav",
help="Type of index to benchmark. Options include `flatnav` and `hnsw`.",
)

parser.add_argument(
"--use-hnsw-base-layer",
action="store_true",
help="If set, use HNSW's base layer's connectivity for the Flatnav index.",
)
parser.add_argument(
"--hnsw-base-layer-filename",
required=False,
default=None,
help="Filename to save the HNSW base layer graph to. Please use the .mtx extension for clarity.",
)

parser.add_argument(
"--num-node-links",
required=False,
nargs="+",
type=int,
default=[16, 32],
Expand All @@ -311,7 +365,6 @@ def parse_arguments() -> argparse.Namespace:

parser.add_argument(
"--ef-construction",
required=False,
nargs="+",
type=int,
default=[100, 200, 300, 400, 500],
Expand All @@ -320,7 +373,6 @@ def parse_arguments() -> argparse.Namespace:

parser.add_argument(
"--ef-search",
required=False,
nargs="+",
type=int,
default=[100, 200, 300, 400, 500, 1000, 2000, 3000, 4000],
Expand Down Expand Up @@ -398,6 +450,7 @@ def parse_arguments() -> argparse.Namespace:
ef_search_params=args.ef_search,
num_node_links=args.num_node_links,
distance_type=args.metric.lower(),
index_type=args.index_type.lower(),
use_hnsw_base_layer=args.use_hnsw_base_layer,
hnsw_base_layer_filename=args.hnsw_base_layer_filename,
reordering_strategies=args.reordering_strategies,
Expand Down
26 changes: 14 additions & 12 deletions flatnav_python/python_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,22 +312,24 @@ void bindIndexMethods(
"Perform graph re-ordering based on the given sequence of "
"re-ordering strategies. "
"Supported re-ordering strategies include `gorder` and `rcm`.")
.def_property(
"num_threads",
[](IndexType &index_type) -> uint32_t {
auto *index = index_type.getIndex();
return index->getNumThreads();
},
.def(
"set_num_threads",
[](IndexType &index_type, uint32_t num_threads) {
if (num_threads == 0 ||
num_threads > std::thread::hardware_concurrency()) {
throw std::invalid_argument("Invalid number of threads.");
}
auto *index = index_type.getIndex();
index->setNumThreads(num_threads);
},
"Configure the desired number of threads. This is useful for "
"constructing the graph or performing KNN search in parallel.")
py::arg("num_threads"),
"Set the number of threads to use for constructing the graph and/or "
"performing KNN search.")
.def_property_readonly(
"num_threads",
[](IndexType &index_type) {
auto *index = index_type.getIndex();
return index->getNumThreads();
},
"Returns the number of threads used for "
"constructing the graph and/or performing KNN "
"search.")
.def_property_readonly(
"max_edges_per_node",
[](IndexType &index_type) {
Expand Down
4 changes: 2 additions & 2 deletions flatnav_python/unit_tests/test_parallel_insertions.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_parallel_insertions_yield_similar_recall():

assert index.max_edges_per_node == 16

index.num_threads = os.cpu_count()
index.set_num_threads(os.cpu_count())

print(f"Using {index.num_threads} threads for parallel index construction.")

Expand All @@ -44,7 +44,7 @@ def test_parallel_insertions_yield_similar_recall():
)

# This is not necessary since by default FlatNav uses a single thread.
single_threaded_index.num_threads = 1
single_threaded_index.set_num_threads(1)

start = time.time()
single_threaded_index.add(data=training_set, ef_construction=100)
Expand Down
Loading