Skip to content

Commit

Permalink
Fixing an Integer Overflow Error During Index Construction (#80)
Browse files Browse the repository at this point in the history
* wip

* debug recall

* clean up

* cleaning up

* some refactoring

* some cmake changes

* revert back to CMake v3.14

---------

Co-authored-by: blaise-muhirwa <[email protected]>
  • Loading branch information
BlaiseMuhirwa and blaise-muhirwa authored Jan 5, 2025
1 parent 86ee7d1 commit 231b423
Show file tree
Hide file tree
Showing 9 changed files with 141 additions and 111 deletions.
1 change: 0 additions & 1 deletion .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ Language: Cpp
BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: None
AlignOperands: Align
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
Expand Down
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,13 @@ if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# Add debug compile flags
message(STATUS "Building in Debug mode")
# Address sanitizer: https://clang.llvm.org/docs/AddressSanitizer.html
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -fsanitize=address")
# Combining -fsanitize=address,thread,undefined doesn't work because AddressSanitizer and
# ThreadSanitizer cannot be enabled in the same build. To use ThreadSanitizer, remove
# AddressSanitizer first, and vice versa.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -fsanitize=address,undefined")
endif()


include(FeatureSummary)
# All options summary
feature_summary(WHAT ALL)
Expand Down Expand Up @@ -94,6 +98,7 @@ add_library(FLAT_NAV_LIB INTERFACE)
target_sources(FLAT_NAV_LIB INTERFACE ${HEADERS})
target_include_directories(FLAT_NAV_LIB INTERFACE ${PROJECT_SOURCE_DIR})


target_link_libraries(FLAT_NAV_LIB INTERFACE OpenMP::OpenMP_CXX)

if(BUILD_EXAMPLES)
Expand Down
9 changes: 3 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
# Build arguments
# This is a relatively large image, so we might want to use a smaller base image, such as
# alpine in the future if image size becomes an issue.
ARG BASE_IMAGE=ubuntu:22.04
# debian:buster-slim is much smaller than ubuntu 22
ARG BASE_IMAGE=debian:buster-slim

FROM ${BASE_IMAGE} as base


ARG POETRY_VERSION=1.8.2
ARG PYTHON_VERSION=3.11.6
ARG POETRY_HOME="/opt/poetry"
ARG ROOT_DIR="/root"
ARG FLATNAV_PATH="${ROOT_DIR}/flatnavlib"


ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
# Needed for the Python installation:
Expand Down Expand Up @@ -43,7 +41,6 @@ RUN apt-get update -y \
gcc \
g++ \
apt-utils \
wget \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/*
Expand Down
2 changes: 1 addition & 1 deletion bin/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
cd "$(dirname "$0")/.."

BUILD_TESTS=OFF
BUILD_EXAMPLES=OFF
BUILD_EXAMPLES=OFF
NO_SIMD_VECTORIZATION=OFF
MAKE_VERBOSE=0
CMAKE_BUILD_TYPE=Release
Expand Down
2 changes: 0 additions & 2 deletions bin/docker-run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,6 @@ docker run \
--name $CONTAINER_NAME \
-it \
-e MAKE_TARGET=$1 \
--env-file bin/.env-vars \
--volume ~/.aws:/root/.aws:ro \
--volume ${DATA_DIR}:/root/data \
--volume ${METRICS_DIR}:/root/metrics \
--rm flatnav:$TAG_NAME \
Expand Down
98 changes: 53 additions & 45 deletions experiments/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,49 @@
from typing import Tuple, List, Optional, Union


def read_ivecs_file(filename: str, range: Optional[tuple[int, int]] = None) -> np.ndarray:
    """
    Read records from a file stored in the .ivecs layout.

    Each record is a 4-byte int32 dimension header followed by `dimension`
    int32 components. The dimension is taken from the first record and every
    record in the file is assumed to have that same size.

    Args:
        filename: Path to the .ivecs file.
        range: Optional 1-based, inclusive (start, end) pair of record indices.
            `end` is clamped to the number of records in the file. When omitted
            (or falsy) every record is read.

    Returns:
        An int32 array of shape (end - start + 1, dimension) with the
        per-record dimension header column stripped.

    Raises:
        AssertionError: If the range does not satisfy
            1 <= start <= end <= total number of records.
    """
    with open(filename, "rb") as f:
        # Convert the header to a Python int right away. Leaving it as a NumPy
        # int32 scalar makes every derived quantity (record size, byte offset,
        # element count) subject to fixed-width integer arithmetic, which can
        # overflow or raise for multi-gigabyte files depending on the NumPy
        # promotion rules in effect.
        dimension = int(np.fromfile(f, dtype=np.int32, count=1)[0])
        # Bytes per record: 4-byte header + `dimension` 4-byte components.
        vec_size = 4 + dimension * 4

        # Seek to the end of the file to derive the record count from its size.
        f.seek(0, 2)
        total_vectors = f.tell() // vec_size
        start, end = 1, total_vectors

        if range:
            start, end = range
            end = min(end, total_vectors)

        assert 1 <= start <= end <= total_vectors, "Invalid range specified."

        # Jump to the first requested record (indices are 1-based).
        f.seek((start - 1) * vec_size, 0)
        v = np.fromfile(
            f, dtype=np.int32, count=(dimension + 1) * (end - start + 1)
        )
        # Column 0 of every row is the dimension header; drop it.
        return v.reshape((end - start + 1, dimension + 1))[:, 1:]

def read_bvecs_file(filename: str, range: Optional[tuple[int, int]] = None) -> np.ndarray:
    """
    Read records from a file stored in the .bvecs layout.

    Each record is a 4-byte int32 dimension header followed by `dimension`
    uint8 components. The dimension is taken from the first record and every
    record in the file is assumed to have that same size.

    Args:
        filename: Path to the .bvecs file.
        range: Optional 1-based, inclusive (start, end) pair of record indices.
            `end` is clamped to the number of records in the file. When omitted
            (or falsy) every record is read.

    Returns:
        A uint8 array of shape (end - start + 1, dimension) with the 4 header
        bytes of each record stripped.

    Raises:
        AssertionError: If the range does not satisfy
            1 <= start <= end <= total number of records.
    """
    with open(filename, "rb") as f:
        # Convert the header to a Python int right away. Leaving it as a NumPy
        # int32 scalar makes every derived quantity (record size, byte offset,
        # element count) subject to fixed-width integer arithmetic, which can
        # overflow or raise for multi-gigabyte files depending on the NumPy
        # promotion rules in effect.
        dimension = int(np.fromfile(f, dtype=np.int32, count=1)[0])
        # Bytes per record: 4-byte header + `dimension` 1-byte components.
        vec_size = 4 + dimension

        # Seek to the end of the file to derive the record count from its size.
        f.seek(0, 2)
        total_vectors = f.tell() // vec_size

        start, end = 1, total_vectors
        if range:
            start, end = range
            end = min(end, total_vectors)

        assert 1 <= start <= end <= total_vectors, "Invalid range specified."

        # Jump to the first requested record (indices are 1-based).
        f.seek((start - 1) * vec_size, 0)
        # Records are read as raw bytes, so each row is header (4) + dimension.
        v = np.fromfile(
            f, dtype=np.uint8, count=(dimension + 4) * (end - start + 1)
        )
        # The first 4 bytes of every row are the dimension header; drop them.
        return v.reshape((end - start + 1, dimension + 4))[:, 4:]


class DatasetLoader(ABC):
def __init__(
self,
Expand Down Expand Up @@ -71,55 +114,13 @@ class BvecsDatasetLoader(DatasetLoader):
NOTE: This is mostly for loading the SIFT1B dataset.
"""

def _read_ivecs_file(self, filename: str) -> np.ndarray:
with open(filename, "rb") as f:
dimension = np.fromfile(f, dtype=np.int32, count=1)[0]
vec_size = 4 + dimension * 4

f.seek(0, 2)
total_vectors = f.tell() // vec_size
start, end = 1, total_vectors

if self.range:
start, end = self.range
end = min(end, total_vectors)

assert 1 <= start <= end <= total_vectors, "Invalid range specified."

f.seek((start - 1) * vec_size, 0)
v = np.fromfile(
f, dtype=np.int32, count=(dimension + 1) * (end - start + 1)
)
return v.reshape((end - start + 1, dimension + 1))[:, 1:]

def _read_bvecs_file(self, filename: str) -> np.ndarray:
with open(filename, "rb") as f:
dimension = np.fromfile(f, dtype=np.int32, count=1)[0]
vec_size = 4 + dimension

f.seek(0, 2)
total_vectors = f.tell() // vec_size

start, end = 1, total_vectors
if self.range:
start, end = self.range
end = min(end, total_vectors)

assert 1 <= start <= end <= total_vectors, "Invalid range specified."

f.seek((start - 1) * vec_size, 0)
v = np.fromfile(
f, dtype=np.uint8, count=(dimension + 4) * (end - start + 1)
)
return v.reshape((end - start + 1, dimension + 4))[:, 4:]

def load_data(self) -> Tuple[np.ndarray]:
ground_truth = self._read_ivecs_file(self.ground_truth_path)
ground_truth = read_ivecs_file(self.ground_truth_path, self.range)
# Ground truth has shape (10000, 1000) but we only need the first 100 columns (neighbors) of each row
ground_truth = ground_truth[:, 0:100]

train_data = self._read_bvecs_file(self.train_dataset_path)
queries_data = self._read_bvecs_file(self.queries_path)
train_data = read_bvecs_file(self.train_dataset_path, self.range)
queries_data = read_bvecs_file(self.queries_path, self.range)

return train_data, queries_data, ground_truth

Expand All @@ -145,6 +146,13 @@ def load_ground_truth(self, path: str) -> Tuple[np.ndarray, np.ndarray, int, int
- Number of queries
- K value
"""

if self.ground_truth_path.endswith(".ivecs"):
ground_truth = read_ivecs_file(self.ground_truth_path)
num_queries, K = ground_truth.shape
return ground_truth, None, num_queries, K


with open(path, "rb") as f:
num_queries = np.fromfile(f, dtype=np.uint32, count=1)[0]
K = np.fromfile(f, dtype=np.uint32, count=1)[0]
Expand Down
1 change: 0 additions & 1 deletion experiments/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ numpy = "^1.26.1, <2.0"
matplotlib = "^3.8.2"
psutil = "^5.9.8"
pydantic = "^2.6.4"
boto3 = "^1.34.98"

[build-system]
requires = ["poetry-core"]
Expand Down
Loading

0 comments on commit 231b423

Please sign in to comment.