Commit

making progress
blaise-muhirwa committed Feb 2, 2024
1 parent 8476973 commit f482980
Showing 9 changed files with 642 additions and 200 deletions.
8 changes: 6 additions & 2 deletions bin/docker-test.sh
@@ -64,6 +64,8 @@ function get_tag_name() {
TAG_NAME=$(get_tag_name)

INCLUDE_HNSWLIB=${INCLUDE_HNSWLIB:-true}
DATA_DIR=${DATA_DIR:-$(pwd)/data}


echo "Building docker image with tag name: $TAG_NAME"

@@ -83,11 +85,13 @@ if [ -z "$1" ]
then
# This will build the image and run the container with the default make target
# (i.e., print help message)
-docker run -it --volume $(pwd)/data:/root/data --rm flatnav:$TAG_NAME make help
+docker run -it --volume ${DATA_DIR}:/root/data --rm flatnav:$TAG_NAME make help
exit 0
fi


# Run the container and mount the data/ directory as volume to /root/data
# Pass the make target as argument to the container.
-docker run -it --volume $(pwd)/data:/root/data --rm flatnav:$TAG_NAME make $1
+ARG1=$1
+docker run -it --volume ${DATA_DIR}:/root/data flatnav:$TAG_NAME /bin/bash \
+    -c "make ${ARG1}; tail -f /dev/null"
24 changes: 21 additions & 3 deletions experiments/Makefile
@@ -23,12 +23,30 @@ sift-bench:

hubness-test:
poetry run python hubness.py \
-    --datasets mnist-784-euclidean cauchy-10-euclidean cauchy-256-euclidean cauchy-1024-euclidean sift-128-euclidean \
+    --datasets normal-1-angular normal-2-angular normal-4-angular normal-8-angular normal-16-angular normal-32-angular normal-64-angular normal-128-angular normal-256-angular normal-1024-angular normal-1536-angular normal-1-euclidean normal-2-euclidean normal-4-euclidean normal-8-euclidean normal-16-euclidean normal-32-euclidean normal-64-euclidean normal-128-euclidean normal-256-euclidean normal-1024-euclidean normal-1536-euclidean \
--k 100 \
--metric l2 \
--ef-construction 100 \
--ef-search 200 \
-    --num-node-links 32
+    --num-node-links 32 \
+    --hubness-scores hubness-scores.json


generate-datasets:
poetry run python generate-datasets.py \
--base-path /home/ubuntu/flatnav-experimental/data \
--dataset-size 1010000 \
--num-queries 10000 \
--dimensions 1 2 4 8 16 32 64 128 1536 \
--k 100

# Generate hubness scores for each desired dataset
generate-hubness-scores:
poetry run python generate-hubness-scores.py \
--dataset-names normal-1-angular normal-2-angular normal-4-angular normal-8-angular normal-16-angular normal-32-angular normal-64-angular normal-128-angular normal-256-angular normal-1024-angular normal-1536-angular normal-2048-angular normal-1-euclidean normal-2-euclidean normal-4-euclidean normal-8-euclidean normal-16-euclidean normal-32-euclidean normal-64-euclidean normal-128-euclidean normal-256-euclidean normal-1024-euclidean normal-1536-euclidean normal-2048-euclidean \
--base-path /media/scratch/ \
--k 100 \
--save-file hubness-scores-full.json


setup: install-flatnav install-hnswlib

186 changes: 186 additions & 0 deletions experiments/generate-datasets.py
@@ -0,0 +1,186 @@
import os
import numpy as np
from sklearn.neighbors import NearestNeighbors
import argparse
import faiss


def generate_iid_normal_dataset(
num_samples: int,
num_dimensions: int,
num_queries: int,
k: int,
directory_path: str,
dataset_name: str,
metric: str = "cosine",
):
"""
Generates a dataset with the specified number of samples and dimensions using
the standard normal distribution.
Separates a subset for queries and computes their true k nearest neighbors.
:param num_samples: Number of samples in the dataset.
:param num_dimensions: Number of dimensions for each sample.
:param num_queries: Number of queries to be separated from the dataset.
:param k: The number of nearest neighbors to find.
:param directory_path: Base path to save the dataset, queries, and ground truth labels.
:param dataset_name: Name of the dataset (should be something like normal-10-angular)
:param metric: Metric to use for computing nearest neighbors.
"""

def normalize_rows_inplace(matrix):
for row in matrix:
norm = np.linalg.norm(row)
norm = norm if norm > 0 else 1e-30
row /= norm

def add_data_in_batches(index, data, batch_size=10000):
for i in range(0, data.shape[0], batch_size):
index.add(data[i : i + batch_size])

dataset = np.random.normal(size=(num_samples, num_dimensions))
np.random.shuffle(dataset)
query_set = dataset[:num_queries]
dataset_without_queries = dataset[num_queries:]

if metric in ["cosine", "angular", "ip"]:
normalize_rows_inplace(dataset_without_queries)
normalize_rows_inplace(query_set)

print("Finished normalizing")
index = faiss.IndexFlatIP(dataset.shape[1])
else:
index = faiss.IndexFlatL2(dataset.shape[1])

add_data_in_batches(index, dataset_without_queries)

print("kNN search")
_, ground_truth_labels = index.search(query_set, k=k)

if np.any(ground_truth_labels < 0):
raise ValueError("Indices cannot be negative")

# Create directory if it doesn't exist
if not os.path.exists(directory_path):
os.makedirs(directory_path)

# dataset_without_queries = dataset_without_queries.astype(np.float32, copy=False)
# query_set = query_set.astype(np.float32, copy=False)
ground_truth_labels = ground_truth_labels.astype(np.int32, copy=False)

print("Saving dataset")
# Save the dataset
np.save(
f"{directory_path}/{dataset_name}.train.npy",
dataset_without_queries,
)
np.save(f"{directory_path}/{dataset_name}.test.npy", query_set)
np.save(
f"{directory_path}/{dataset_name}.gtruth.npy",
ground_truth_labels,
)


def generate_cauchy_dataset(
num_samples: int,
num_dimensions: int,
num_queries: int,
k: int,
base_path: str,
metric: str = "minkowski",
p: int = 2,
):
"""
Generates a dataset with the specified number of samples and dimensions using
the Cauchy distribution.
Separates a subset for queries and computes their true k nearest neighbors.
:param num_samples: Number of samples in the dataset.
:param num_dimensions: Number of dimensions for each sample.
:param num_queries: Number of queries to be separated from the dataset.
:param k: The number of nearest neighbors to find.
:param base_path: Base path to save the dataset, queries, and ground truth labels.
:param metric: Metric to use for computing nearest neighbors.
:param p: Parameter for the metric.
NOTE: metric="minkowski" and p=2 is equivalent to Euclidean distance.
See: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
"""
# Generate the dataset
dataset = np.random.standard_cauchy(size=(num_samples, num_dimensions))

# Separate out a subset for queries
np.random.shuffle(dataset)
query_set = dataset[:num_queries]
dataset_without_queries = dataset[num_queries:]

# Compute the true k nearest neighbors for the query set
nbrs = NearestNeighbors(n_neighbors=k, algorithm="brute", p=p, metric=metric).fit(
dataset_without_queries
)
ground_truth_labels = nbrs.kneighbors(query_set, return_distance=False)

# Save the dataset without queries, the queries, and the ground truth labels
np.save(f"{base_path}/train.npy", dataset_without_queries.astype(np.float32))
np.save(f"{base_path}/test.npy", query_set.astype(np.float32))
np.save(f"{base_path}/ground_truth.npy", ground_truth_labels.astype(np.int32))


def check_datasets_exists(base_path: str, dataset_name: str) -> bool:
train_path = os.path.join(base_path, f"{dataset_name}.train.npy")
queries = os.path.join(base_path, f"{dataset_name}.test.npy")
ground_truth = os.path.join(base_path, f"{dataset_name}.gtruth.npy")

all_exists = all(
[
os.path.exists(train_path),
os.path.exists(queries),
os.path.exists(ground_truth),
]
)
return all_exists


def parse_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument("--base-path", type=str, required=True)
parser.add_argument("--dataset-size", type=int, required=True)
parser.add_argument("--num-queries", type=int, required=True)
parser.add_argument("--dimensions", type=int, nargs="+", required=True)
parser.add_argument("--k", type=int, default=100)

return parser.parse_args()


if __name__ == "__main__":
args = parse_arguments()
base_path = args.base_path
dimensions = args.dimensions

DATASET_NAMES = [f"normal-{d}-angular" for d in dimensions]
DATASET_NAMES += [f"normal-{d}-euclidean" for d in dimensions]
# DATASET_NAMES = [f"normal-{d}-euclidean" for d in dimensions]

# Create the datasets. First create the directory if it doesn't exist
for dataset_name in DATASET_NAMES:
directory_path = os.path.join(base_path, dataset_name)

if check_datasets_exists(directory_path, dataset_name):
print(f"Dataset {dataset_name} already exists. Skipping...")
continue
if not os.path.exists(directory_path):
os.makedirs(directory_path)

print(f"Generating dataset: {dataset_name}")

_, dimension, metric = dataset_name.split("-")
metric = metric if metric == "euclidean" else "cosine"
# Generate the datasets
generate_iid_normal_dataset(
num_samples=args.dataset_size,
num_dimensions=int(dimension),
num_queries=args.num_queries,
k=args.k,
directory_path=directory_path,
dataset_name=dataset_name,
metric=metric,
)
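For reference, a minimal sketch (not part of this commit) of how the three files written per dataset could be loaded downstream; the base path and dataset name here are hypothetical, and the shape check assumes the script's default k=100.

import numpy as np

# Hypothetical location following the script's layout:
# <base-path>/<name>/<name>.{train,test,gtruth}.npy
base = "/path/to/data/normal-16-angular"
name = "normal-16-angular"

train = np.load(f"{base}/{name}.train.npy")    # (dataset_size - num_queries, dim)
queries = np.load(f"{base}/{name}.test.npy")   # (num_queries, dim)
gtruth = np.load(f"{base}/{name}.gtruth.npy")  # (num_queries, k) int32 indices into `train`

assert train.shape[1] == queries.shape[1]
assert gtruth.shape == (queries.shape[0], 100)  # k=100 by default

def recall_at_k(candidates: np.ndarray, ground_truth: np.ndarray) -> float:
    # Fraction of true neighbors recovered, averaged over all queries.
    hits = sum(len(set(c) & set(g)) for c, g in zip(candidates, ground_truth))
    return hits / ground_truth.size

Because the naming matches check_datasets_exists above, a benchmark harness can locate all three arrays from the dataset name alone.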
100 changes: 100 additions & 0 deletions experiments/generate-hubness-scores.py
@@ -0,0 +1,100 @@
import numpy as np
from sklearn.neighbors import NearestNeighbors
import json
import os
import argparse
import faiss


def compute_k_occurence_distrubution(top_k_indices: np.ndarray) -> np.ndarray:
"""
Computes the distribution of k-occurrences for each node in the given array.
:param top_k_indices: array of shape (dataset_size, k) containing the indices of
the k nearest neighbors for each node.
:return: array of shape (dataset_size,) containing the k-occurrence distribution for each node (N_k)
"""

# Validate indices. If any value is negative, raise an error.
if np.any(top_k_indices < 0):
raise ValueError("Indices cannot be negative")

dataset_size = top_k_indices.shape[0]
k_occurence_distribution = np.zeros(dataset_size, dtype=int)

flattened_indices = top_k_indices.flatten()
unique_indices, counts = np.unique(flattened_indices, return_counts=True)
k_occurence_distribution[unique_indices] = counts

return k_occurence_distribution


def compute_skewness(dataset: np.ndarray, k: int, metric: str) -> float:
# For cosine distance, we will assume that the data was normalized

if metric in ["cosine", "angular", "ip"]:
index = faiss.IndexFlatIP(dataset.shape[1])
else:
index = faiss.IndexFlatL2(dataset.shape[1])

# Shuffle the dataset and add only the first 10k elements to the index
# np.random.shuffle(dataset)
# dataset = dataset[0:10000]

index.add(dataset)
_, top_k_indices = index.search(dataset, k=k)

k_occurence_distribution = compute_k_occurence_distrubution(
top_k_indices=top_k_indices
)
mean = np.mean(k_occurence_distribution)
std_dev = np.std(k_occurence_distribution)
denominator = len(k_occurence_distribution) * (std_dev**3)
skewness = (np.sum((k_occurence_distribution - mean) ** 3)) / denominator

return skewness


if __name__ == "__main__":
# We will compute the hubness scores for all given datasets and
# save them in a dictionary as JSON

parser = argparse.ArgumentParser()
parser.add_argument("--base-path", type=str, required=True)
parser.add_argument("--dataset-names", type=str, nargs="+", required=True)
parser.add_argument("--k", type=int, default=100)
parser.add_argument("--save-file", type=str, required=True)

args = parser.parse_args()

file_path = os.path.join(args.base_path, args.save_file)

dataset_names = args.dataset_names
for dataset_name in dataset_names:
print(f"Computing hubness score for {dataset_name}")

dataset_path = os.path.join(
args.base_path, dataset_name, f"{dataset_name}.train.npy"
)
_, dimension, metric = dataset_name.split("-")
metric = metric if metric == "euclidean" else "cosine"

dataset = np.load(dataset_path)
dataset = dataset.astype(np.float32, copy=False)

skewness = compute_skewness(dataset=dataset, k=args.k, metric=metric)
print(f"Skewness: {skewness}")

# Read the existing data from the JSON file
if os.path.exists(file_path):
with open(file_path, "r") as file:
hubness_scores = json.load(file)
else:
hubness_scores = {}

# Update the dictionary with the new hubness score
hubness_scores[dataset_name] = skewness

# Write the updated dictionary back to the JSON file
with open(file_path, "w") as file:
json.dump(hubness_scores, file, indent=4)
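As a sanity check on what generate-hubness-scores.py saves: the score per dataset is the (biased) sample skewness of the k-occurrence counts N_k, so a strongly positive value means a few hub points dominate the neighbor lists. A small synthetic sketch, not part of this commit, that reproduces the formula in compute_skewness and cross-checks it against scipy.stats.skew (scipy is assumed available, since scikit-learn already depends on it):

import numpy as np
from scipy.stats import skew

# Toy k-NN result: 6 points, k=2 neighbor indices per point.
top_k_indices = np.array([[1, 2], [0, 2], [0, 1], [2, 1], [2, 0], [2, 1]])

# N_k: how many neighbor lists each point appears in.
counts = np.zeros(6, dtype=int)
idx, occ = np.unique(top_k_indices.flatten(), return_counts=True)
counts[idx] = occ                      # -> [3 4 5 0 0 0]; point 2 is the "hub"

# Same computation as compute_skewness: standardized third moment of N_k.
mean, std = counts.mean(), counts.std()
manual = ((counts - mean) ** 3).sum() / (len(counts) * std**3)

print(manual, skew(counts))            # both are approximately 0.2217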
