Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate data type enums & Add SIMD support for uint8_t and int8_t Indices #52

Merged
merged 32 commits into from
Jul 31, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
37dc40b
making progress
blaise-muhirwa Apr 3, 2024
52e3c70
making simd more readable
blaise-muhirwa Apr 4, 2024
32eaa2a
further simd refactoring
blaise-muhirwa Apr 6, 2024
8022090
attempting to fix tests
blaise-muhirwa Apr 6, 2024
b3a00b8
second attempt
blaise-muhirwa Apr 6, 2024
6429b63
make sure we're compiling with sse3
blaise-muhirwa Apr 6, 2024
f99c037
include an sse3 preprocessor directive
blaise-muhirwa Apr 6, 2024
c4ec9d1
typo
blaise-muhirwa Apr 6, 2024
c71172a
fix preprocessor directives bug
blaise-muhirwa Apr 8, 2024
09109c4
remove avx2 instruction
blaise-muhirwa Apr 8, 2024
9287251
Update cereal_tests.cpp
BlaiseMuhirwa Apr 8, 2024
0e672ec
clean up
blaise-muhirwa Apr 8, 2024
4937726
Merge branch 'refactor-simd-code' of github.com-personal:BlaiseMuhirw…
blaise-muhirwa Apr 8, 2024
7b42ca4
fix small code inclusion error
blaise-muhirwa Apr 8, 2024
995fea8
merge from main
blaise-muhirwa Apr 11, 2024
170c2c7
initial commit
blaise-muhirwa Apr 11, 2024
159b95a
making progress
blaise-muhirwa Apr 14, 2024
563e5af
templating on data types
blaise-muhirwa Apr 23, 2024
954f847
saving current progress
blaise-muhirwa May 3, 2024
de5e6bf
improve simd for int8
blaise-muhirwa May 5, 2024
73cbbf4
merge from main
blaise-muhirwa Jul 26, 2024
b052a0a
work in progress
blaise-muhirwa Jul 27, 2024
adc18f8
things looking good -- still have a memory leak
blaise-muhirwa Jul 28, 2024
0e4f180
wip
blaise-muhirwa Jul 28, 2024
51d67dc
cleaned up the python bindings
blaise-muhirwa Jul 30, 2024
4174bc4
fix compiler errors
blaise-muhirwa Jul 30, 2024
72401ef
fix issues in product quantizer
blaise-muhirwa Jul 30, 2024
9dd84a3
fix gcc complaints regarding SSE4.1 instructions
blaise-muhirwa Jul 30, 2024
509827a
fix more errors
blaise-muhirwa Jul 30, 2024
582b3dd
forgot maximum edges per node
blaise-muhirwa Jul 30, 2024
43eb6a3
fix serialization tests
blaise-muhirwa Jul 31, 2024
66271d7
remove unnecessary comment
blaise-muhirwa Jul 31, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cicd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
- name: Build flatnav
run: |
cd flatnav_python
export NO_MANUAL_VECTORIZATION=1
export NO_SIMD_VECTORIZATION=1
./install_flatnav.sh

- name: Run Unit Tests
Expand Down
16 changes: 10 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,15 @@ endif(OpenMP_FOUND)
option(BUILD_TESTS "Build all tests")
option(BUILD_EXAMPLES "Build examples")
option(BUILD_BENCHMARKS "Build ANNS benchmarks")
option(NO_MANUAL_VECTORIZATION "Disable manual vectorization (SIMD)")
option(NO_SIMD_VECTORIZATION "Disable using SIMD instructions")
message(STATUS "Building tests: ${BUILD_TESTS}")
message(STATUS "Building examples: ${BUILD_EXAMPLES}")
message(STATUS "Building benchmarks: ${BUILD_BENCHMARKS}")

# Enable auto-vectorization if we are not using SIMD.
if(NO_MANUAL_VECTORIZATION)
message(STATUS "Disabling manual vectorization (SIMD)")
add_definitions(-DNO_MANUAL_VECTORIZATION)
if(NO_SIMD_VECTORIZATION)
message(STATUS "Disabling using SIMD instructions")
add_definitions(-DNO_SIMD_VECTORIZATION)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftree-vectorize")
endif()

Expand All @@ -74,14 +74,18 @@ endif()
set(HEADERS
${PROJECT_SOURCE_DIR}/flatnav/distances/InnerProductDistance.h
${PROJECT_SOURCE_DIR}/flatnav/distances/SquaredL2Distance.h
${PROJECT_SOURCE_DIR}/flatnav/util/SquaredL2SimdExtensions.h
${PROJECT_SOURCE_DIR}/flatnav/util/InnerProductSimdExtensions.h
${PROJECT_SOURCE_DIR}/flatnav/util/VisitedSetPool.h
${PROJECT_SOURCE_DIR}/flatnav/util/GorderPriorityQueue.h
${PROJECT_SOURCE_DIR}/flatnav/util/Reordering.h
${PROJECT_SOURCE_DIR}/flatnav/util/SIMDDistanceSpecializations.h
${PROJECT_SOURCE_DIR}/flatnav/util/ParallelConstructs.h
${PROJECT_SOURCE_DIR}/flatnav/util/Macros.h
${PROJECT_SOURCE_DIR}/flatnav/util/Datatype.h
${PROJECT_SOURCE_DIR}/flatnav/util/SimdBaseTypes.h
${PROJECT_SOURCE_DIR}/flatnav/DistanceInterface.h
${PROJECT_SOURCE_DIR}/flatnav/Index.h
${PROJECT_SOURCE_DIR}/quantization/ProductQuantization.h
# ${PROJECT_SOURCE_DIR}/quantization/ProductQuantization.h
${PROJECT_SOURCE_DIR}/quantization/CentroidsGenerator.h
${PROJECT_SOURCE_DIR}/quantization/Utils.h)

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ Available Options:
-v, --verbose: Make verbose
-b, --benchmark: Build benchmarks
-bt, --build_type: Build type (Debug, Release, RelWithDebInfo, MinSizeRel)
-nmv, --no_manual_vectorization:Disable manual vectorization (SIMD)
-nsv, --no_simd_vectorization: Disable SIMD instructions
-h, --help: Print this help message

Example Usage:
Expand Down
8 changes: 4 additions & 4 deletions bin/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ cd "$(dirname "$0")/.."
BUILD_TESTS=OFF
BUILD_EXAMPLES=OFF
BUILD_BENCHMARKS=OFF
NO_MANUAL_VECTORIZATION=OFF
NO_SIMD_VECTORIZATION=OFF
MAKE_VERBOSE=0
CMAKE_BUILD_TYPE=Release

Expand All @@ -19,7 +19,7 @@ function print_usage() {
echo " -v, --verbose: Make verbose"
echo " -b, --benchmark: Build benchmarks"
echo " -bt, --build_type: Build type (Debug, Release, RelWithDebInfo, MinSizeRel)"
echo " -nmv, --no_manual_vectorization:Disable manual vectorization (SIMD)"
echo " -nsv, --no_simd_vectorization:Disable SIMD vectorization"
echo " -h, --help: Print this help message"
echo ""
echo "Example Usage:"
Expand All @@ -41,7 +41,7 @@ while [[ "$#" -gt 0 ]]; do
-e|--examples) BUILD_EXAMPLES=ON; shift ;;
-v|--verbose) MAKE_VERBOSE=1; shift ;;
-b|--benchmark) BUILD_BENCHMARKS=ON; shift ;;
-nmv|--no_manual_vectorization) NO_MANUAL_VECTORIZATION=ON; shift ;;
-nsv|--no_simd_vectorization) NO_SIMD_VECTORIZATION=ON; shift ;;
-bt|--build_type) CMAKE_BUILD_TYPE=$2; shift; shift ;;
*) print_usage ;;
esac
Expand Down Expand Up @@ -73,7 +73,7 @@ mkdir -p build
cd build && cmake \
-DCMAKE_C_COMPILER=${CC} \
-DCMAKE_CXX_COMPILER=${CXX} \
-DNO_MANUAL_VECTORIZATION=${NO_MANUAL_VECTORIZATION} \
-DNO_SIMD_VECTORIZATION=${NO_SIMD_VECTORIZATION} \
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
-DBUILD_TESTS=${BUILD_TESTS} \
-DBUILD_EXAMPLES=${BUILD_EXAMPLES} \
Expand Down
3 changes: 2 additions & 1 deletion cmake/FindAVX.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ function(check_compiler_and_hardware_support FLAG CODE_VAR EXTENSION_NAME)
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} ${FLAG}"
PARENT_SCOPE)
message(STATUS "Building with ${EXTENSION_NAME}")
message(STATUS "Building with ${FLAG}")
else()
message(
STATUS "Compiler supports ${FLAG} flag but the target machine does not "
Expand All @@ -54,6 +54,7 @@ endfunction()
# Build SSE/AVX/AVX512 code only on x86-64 processors.
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
check_compiler_and_hardware_support("-mavx512f" "AVX512_CODE" "AVX512")
check_compiler_and_hardware_support("-mavx512bw" "AVX512_CODE" "AVX512")
check_compiler_and_hardware_support("-mavx" "AVX_CODE" "AVX")

check_cxx_compiler_flag("-msse" CXX_SSE)
Expand Down
5 changes: 2 additions & 3 deletions experiments/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,9 @@ sift-bench-flatnav:
--queries /root/data/sift-128-euclidean/sift-128-euclidean.test.npy \
--gtruth /root/data/sift-128-euclidean/sift-128-euclidean.gtruth.npy \
--index-type flatnav \
--use-hnsw-base-layer \
--hnsw-base-layer-filename sift.mtx \
--data-type float32 \
--num-node-links 32 \
--ef-construction 30 40 50 100 200 300 \
--ef-construction 100 \
--ef-search 100 200 300 500 1000 \
--metric l2 \
--num-build-threads 16 \
Expand Down
12 changes: 5 additions & 7 deletions experiments/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,11 @@ def load_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
start_index, end_index = self.range
train_dataset = np.load(self.train_dataset_path)[
start_index:end_index
].astype(np.float32, copy=False)
]
else:
train_dataset = np.load(self.train_dataset_path).astype(
np.float32, copy=False
)
queries = np.load(self.queries_path).astype(np.float32, copy=False)
ground_truth = np.load(self.ground_truth_path).astype(np.int32, copy=False)
train_dataset = np.load(self.train_dataset_path)
queries = np.load(self.queries_path)
ground_truth = np.load(self.ground_truth_path)
return train_dataset, queries, ground_truth


Expand Down Expand Up @@ -114,7 +112,7 @@ def _read_bvecs_file(self, filename: str) -> np.ndarray:
return v.reshape((end - start + 1, dimension + 4))[:, 4:]

def load_data(self) -> Tuple[np.ndarray]:
ground_truth = self._read_ivecs_file(self.gtruth_path)
ground_truth = self._read_ivecs_file(self.ground_truth_path)
# Ground truth has shape (10000, 1000) but we only need the first 100 queries
ground_truth = ground_truth[:, 0:100]

Expand Down
1 change: 1 addition & 0 deletions experiments/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ numpy = "^1.26.1"
matplotlib = "^3.8.2"
psutil = "^5.9.8"
pydantic = "^2.6.4"
flatnav = {path = "../flatnav_python/dist/flatnav-0.0.1-cp310-cp310-linux_x86_64.whl"}
BlaiseMuhirwa marked this conversation as resolved.
Show resolved Hide resolved

[build-system]
requires = ["poetry-core"]
Expand Down
130 changes: 75 additions & 55 deletions experiments/run-benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def compute_metrics(
:return: Dictionary of metrics.

"""
is_flatnav_index = type(index) in (flatnav.index.L2Index, flatnav.index.IPIndex)
is_flatnav_index = not type(index) == hnswlib.Index
latencies = []
top_k_indices = []
distance_computations = []
Expand Down Expand Up @@ -158,6 +158,7 @@ def train_index(
max_edges_per_node: int,
ef_construction: int,
index_type: str = "flatnav",
data_type: str = "float32",
use_hnsw_base_layer: bool = False,
hnsw_base_layer_filename: Optional[str] = None,
num_build_threads: int = 1,
Expand Down Expand Up @@ -214,6 +215,7 @@ def train_index(

index = flatnav.index.index_factory(
distance_type=distance_type,
index_data_type=data_type,
dim=dim,
dataset_size=dataset_size,
max_edges_per_node=max_edges_per_node,
Expand All @@ -232,6 +234,7 @@ def train_index(
else:
index = flatnav.index.index_factory(
distance_type=distance_type,
index_data_type=data_type,
dim=dim,
dataset_size=dataset_size,
max_edges_per_node=max_edges_per_node,
Expand Down Expand Up @@ -264,13 +267,75 @@ def main(
dataset_name: str,
requested_metrics: List[str],
index_type: str = "flatnav",
data_type: str = "float32",
use_hnsw_base_layer: bool = False,
hnsw_base_layer_filename: Optional[str] = None,
reordering_strategies: List[str] | None = None,
num_initializations: Optional[List[int]] = None,
num_build_threads: int = 1,
num_search_threads: int = 1,
):

def build_and_run_knn_search(ef_cons: int, node_links: int):
"""
Build the index and run the KNN search.
This part is here to ensure that two indices are not in memory at the same time.
With large datasets, we might get an OOM error.
"""

index = train_index(
index_type=index_type,
data_type=data_type,
train_dataset=train_dataset,
max_edges_per_node=node_links,
ef_construction=ef_cons,
dataset_size=dataset_size,
dim=dim,
distance_type=distance_type,
use_hnsw_base_layer=use_hnsw_base_layer,
hnsw_base_layer_filename=hnsw_base_layer_filename,
num_build_threads=num_build_threads,
)

if reordering_strategies is not None:
if index_type != "flatnav":
raise ValueError("Reordering only applies to the FlatNav index.")
index.reorder(strategies=reordering_strategies)

index.set_num_threads(num_search_threads)
for ef_search in ef_search_params:
# Extend metrics with computed metrics
metrics.update(
compute_metrics(
requested_metrics=requested_metrics,
index=index,
queries=queries,
ground_truth=gtruth,
ef_search=ef_search,
)
)
logging.info(f"Metrics: {metrics}")

# Add parameters to the metrics dictionary.
metrics["distance_type"] = distance_type
metrics["ef_search"] = ef_search
all_metrics = {experiment_key: []}

if os.path.exists(metrics_file) and os.path.getsize(metrics_file) > 0:
with open(metrics_file, "r") as file:
try:
all_metrics = json.load(file)
except json.JSONDecodeError:
logging.error(f"Error reading {metrics_file=}")

if experiment_key not in all_metrics:
all_metrics[experiment_key] = []

all_metrics[experiment_key].append(metrics)
with open(metrics_file, "w") as file:
json.dump(all_metrics, file, indent=4)


dataset_size = train_dataset.shape[0]
dim = train_dataset.shape[1]

Expand All @@ -284,59 +349,7 @@ def main(
metrics["ef_construction"] = ef_cons

logging.info(f"Building {index_type=}")
index = train_index(
index_type=index_type,
train_dataset=train_dataset,
max_edges_per_node=node_links,
ef_construction=ef_cons,
dataset_size=dataset_size,
dim=dim,
distance_type=distance_type,
use_hnsw_base_layer=use_hnsw_base_layer,
hnsw_base_layer_filename=hnsw_base_layer_filename,
num_build_threads=num_build_threads,
)

if reordering_strategies is not None:
if type(index) not in (
flatnav.index.L2Index,
flatnav.index.IPIndex,
):
raise ValueError("Reordering only applies to the FlatNav index.")
index.reorder(strategies=reordering_strategies)

index.set_num_threads(num_search_threads)
for ef_search in ef_search_params:
# Extend metrics with computed metrics
metrics.update(
compute_metrics(
requested_metrics=requested_metrics,
index=index,
queries=queries,
ground_truth=gtruth,
ef_search=ef_search,
)
)
logging.info(f"Metrics: {metrics}")

# Add parameters to the metrics dictionary.
metrics["distance_type"] = distance_type
metrics["ef_search"] = ef_search
all_metrics = {experiment_key: []}

if os.path.exists(metrics_file) and os.path.getsize(metrics_file) > 0:
with open(metrics_file, "r") as file:
try:
all_metrics = json.load(file)
except json.JSONDecodeError:
logging.error(f"Error reading {metrics_file=}")

if experiment_key not in all_metrics:
all_metrics[experiment_key] = []

all_metrics[experiment_key].append(metrics)
with open(metrics_file, "w") as file:
json.dump(all_metrics, file, indent=4)
build_and_run_knn_search(ef_cons=ef_cons, node_links=node_links)


def parse_arguments() -> argparse.Namespace:
Expand All @@ -349,6 +362,12 @@ def parse_arguments() -> argparse.Namespace:
default="flatnav",
help="Type of index to benchmark. Options include `flatnav` and `hnsw`.",
)

parser.add_argument(
"--data-type",
default="float32",
help="Data type of the index. Options include `float32`, `uint8` and `int8`.",
)

parser.add_argument(
"--use-hnsw-base-layer",
Expand Down Expand Up @@ -542,7 +561,7 @@ def run_experiment():
raise ValueError("HNSW does not support num_initializations.")

metrics_file_path = os.path.join(ROOT_DIR, "metrics", args.metrics_file)

main(
train_dataset=train_data,
queries=queries,
Expand All @@ -553,6 +572,7 @@ def run_experiment():
distance_type=args.metric.lower(),
dataset_name=args.dataset_name,
index_type=args.index_type.lower(),
data_type=args.data_type,
use_hnsw_base_layer=args.use_hnsw_base_layer,
hnsw_base_layer_filename=args.hnsw_base_layer_filename,
reordering_strategies=args.reordering_strategies,
Expand Down
12 changes: 12 additions & 0 deletions flatnav/DistanceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,19 @@

#include <cereal/access.hpp>
#include <cstddef> // for size_t
#include <flatnav/util/Datatype.h>
#include <fstream> // for ifstream, ofstream
#include <functional>
#include <iostream>

namespace flatnav {

using util::DataType;
typedef std::function<float(const void *, const void *, const size_t &)>
DistanceFunction;

typedef std::unique_ptr<DistanceFunction> DistanceFunctionPtr;

enum class METRIC_TYPE { EUCLIDEAN, INNER_PRODUCT };

// We use the CRTP to implement static polymorphism on the distance. This is
Expand Down Expand Up @@ -35,6 +43,10 @@ template <typename T> class DistanceInterface {
// Prints the parameters of the distance function.
void getSummary() { static_cast<T *>(this)->getSummaryImpl(); }

inline constexpr DataType dataType() {
return static_cast<T *>(this)->dataTypeImpl();
}

// This transforms the data located at src into a form that is writeable
// to disk / storable in RAM. For distance functions that don't
// compress the input, this just passses through a copy from src to
Expand Down
Loading
Loading