Commit f482980 (1 parent: 8476973). Showing 9 changed files with 642 additions and 200 deletions.
New file (186 lines): a script that generates synthetic ANN benchmark datasets (IID normal and Cauchy) together with their ground-truth k nearest neighbors.
import os
import argparse

import numpy as np
import faiss
from sklearn.neighbors import NearestNeighbors

def generate_iid_normal_dataset(
    num_samples: int,
    num_dimensions: int,
    num_queries: int,
    k: int,
    directory_path: str,
    dataset_name: str,
    metric: str = "cosine",
):
    """
    Generates a dataset with the specified number of samples and dimensions using
    the standard normal distribution.
    Separates a subset for queries and computes their true k nearest neighbors.

    :param num_samples: Number of samples in the dataset.
    :param num_dimensions: Number of dimensions for each sample.
    :param num_queries: Number of queries to be separated from the dataset.
    :param k: The number of nearest neighbors to find.
    :param directory_path: Base path to save the dataset, queries, and ground truth labels.
    :param dataset_name: Name of the dataset (should be something like normal-10-angular).
    :param metric: Metric to use for computing nearest neighbors.
    """

    def normalize_rows_inplace(matrix):
        for row in matrix:
            norm = np.linalg.norm(row)
            norm = norm if norm > 0 else 1e-30  # guard against division by zero
            row /= norm

    def add_data_in_batches(index, data, batch_size=10000):
        for i in range(0, data.shape[0], batch_size):
            index.add(data[i : i + batch_size])

    # faiss only accepts float32 input, so cast up front.
    dataset = np.random.normal(size=(num_samples, num_dimensions)).astype(np.float32)
    np.random.shuffle(dataset)
    query_set = dataset[:num_queries]
    dataset_without_queries = dataset[num_queries:]

    if metric in ["cosine", "angular", "ip"]:
        # On unit-norm vectors, maximum inner product equals cosine similarity.
        normalize_rows_inplace(dataset_without_queries)
        normalize_rows_inplace(query_set)
        print("Finished normalizing")
        index = faiss.IndexFlatIP(dataset.shape[1])
    else:
        index = faiss.IndexFlatL2(dataset.shape[1])

    add_data_in_batches(index, dataset_without_queries)

    print("kNN search")
    _, ground_truth_labels = index.search(query_set, k=k)

    if np.any(ground_truth_labels < 0):
        raise ValueError("Indices cannot be negative")

    # Create directory if it doesn't exist
    os.makedirs(directory_path, exist_ok=True)

    ground_truth_labels = ground_truth_labels.astype(np.int32, copy=False)

    print("Saving dataset")
    np.save(f"{directory_path}/{dataset_name}.train.npy", dataset_without_queries)
    np.save(f"{directory_path}/{dataset_name}.test.npy", query_set)
    np.save(f"{directory_path}/{dataset_name}.gtruth.npy", ground_truth_labels)

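The normalization step is what makes IndexFlatIP correct for the angular datasets: on unit-norm vectors, the inner product equals cosine similarity, so a maximum-inner-product index returns cosine nearest neighbors. A minimal standalone sanity-check sketch of that identity:

import numpy as np

a = np.random.normal(size=8)
b = np.random.normal(size=8)
a /= np.linalg.norm(a)
b /= np.linalg.norm(b)
cosine_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
# On unit vectors, inner product == cosine similarity (up to float error).
assert np.isclose(np.dot(a, b), cosine_sim)
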
def generate_cauchy_dataset(
    num_samples: int,
    num_dimensions: int,
    num_queries: int,
    k: int,
    base_path: str,
    metric: str = "minkowski",
    p: int = 2,
):
    """
    Generates a dataset with the specified number of samples and dimensions using
    the Cauchy distribution.
    Separates a subset for queries and computes their true k nearest neighbors.

    :param num_samples: Number of samples in the dataset.
    :param num_dimensions: Number of dimensions for each sample.
    :param num_queries: Number of queries to be separated from the dataset.
    :param k: The number of nearest neighbors to find.
    :param base_path: Base path to save the dataset, queries, and ground truth labels.
    :param metric: Metric to use for computing nearest neighbors.
    :param p: Parameter for the metric.

    NOTE: metric="minkowski" and p=2 is equivalent to Euclidean distance.
    See: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
    """
    # Generate the dataset
    dataset = np.random.standard_cauchy(size=(num_samples, num_dimensions))

    # Separate out a subset for queries
    np.random.shuffle(dataset)
    query_set = dataset[:num_queries]
    dataset_without_queries = dataset[num_queries:]

    # Compute the true k nearest neighbors for the query set
    nbrs = NearestNeighbors(n_neighbors=k, algorithm="brute", p=p, metric=metric).fit(
        dataset_without_queries
    )
    ground_truth_labels = nbrs.kneighbors(query_set, return_distance=False)

    # Save the dataset without queries, the queries, and the ground truth labels.
    # Note: base_path must already exist; this function does not create it.
    np.save(f"{base_path}/train.npy", dataset_without_queries.astype(np.float32))
    np.save(f"{base_path}/test.npy", query_set.astype(np.float32))
    np.save(f"{base_path}/ground_truth.npy", ground_truth_labels.astype(np.int32))

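A hypothetical invocation of this function (the directory name is a placeholder, and base_path must be created beforehand, since the function does not create it):

import os

base_path = "./data/cauchy-32"  # placeholder path
os.makedirs(base_path, exist_ok=True)
generate_cauchy_dataset(
    num_samples=10000,
    num_dimensions=32,
    num_queries=100,
    k=100,
    base_path=base_path,
)
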
def check_datasets_exists(base_path: str, dataset_name: str) -> bool:
    train_path = os.path.join(base_path, f"{dataset_name}.train.npy")
    queries = os.path.join(base_path, f"{dataset_name}.test.npy")
    ground_truth = os.path.join(base_path, f"{dataset_name}.gtruth.npy")

    return all(
        os.path.exists(path) for path in (train_path, queries, ground_truth)
    )

def parse_arguments() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-path", type=str, required=True)
    parser.add_argument("--dataset-size", type=int, required=True)
    parser.add_argument("--num-queries", type=int, required=True)
    parser.add_argument("--dimensions", type=int, nargs="+", required=True)
    parser.add_argument("--k", type=int, default=100)

    return parser.parse_args()

if __name__ == "__main__":
    args = parse_arguments()
    base_path = args.base_path
    dimensions = args.dimensions

    DATASET_NAMES = [f"normal-{d}-angular" for d in dimensions]
    DATASET_NAMES += [f"normal-{d}-euclidean" for d in dimensions]

    # Create the datasets. First create the directory if it doesn't exist
    for dataset_name in DATASET_NAMES:
        directory_path = os.path.join(base_path, dataset_name)

        if check_datasets_exists(directory_path, dataset_name):
            print(f"Dataset {dataset_name} already exists. Skipping...")
            continue
        os.makedirs(directory_path, exist_ok=True)

        print(f"Generating dataset: {dataset_name}")

        # Dataset names look like normal-<dim>-<metric>; "angular" maps to cosine.
        _, dimension, metric = dataset_name.split("-")
        metric = metric if metric == "euclidean" else "cosine"
        # Generate the datasets
        generate_iid_normal_dataset(
            num_samples=args.dataset_size,
            num_dimensions=int(dimension),
            num_queries=args.num_queries,
            k=args.k,
            directory_path=directory_path,
            dataset_name=dataset_name,
            metric=metric,
        )
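
A hypothetical command line for this script (the file name generate_datasets.py is a placeholder; the flags are those defined in parse_arguments above):

python generate_datasets.py --base-path ./data --dataset-size 1000000 --num-queries 10000 --dimensions 16 32 64 --k 100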
New file (100 lines): a script that computes a hubness score for each dataset, defined as the skewness of the k-occurrence distribution, and saves the scores as JSON.
import json
import os
import argparse

import numpy as np
import faiss

def compute_k_occurrence_distribution(top_k_indices: np.ndarray) -> np.ndarray:
    """
    Computes the distribution of k-occurrences for each node in the given array.

    :param top_k_indices: array of shape (dataset_size, k) containing the indices of
        the k nearest neighbors for each node.
    :return: array of shape (dataset_size,) containing the k-occurrence count for
        each node (N_k).
    """
    # Validate indices: a negative index means the search failed.
    if np.any(top_k_indices < 0):
        raise ValueError("Indices cannot be negative")

    dataset_size = top_k_indices.shape[0]
    k_occurrence_distribution = np.zeros(dataset_size, dtype=int)

    flattened_indices = top_k_indices.flatten()
    unique_indices, counts = np.unique(flattened_indices, return_counts=True)
    k_occurrence_distribution[unique_indices] = counts

    return k_occurrence_distribution

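A toy example of the k-occurrence count N_k (how many neighbor lists each point appears in), run against the function above:

import numpy as np

top_k = np.array([[1, 2], [0, 1], [0, 1]])  # 3 points, k = 2
# Point 0 appears in 2 neighbor lists, point 1 in 3, point 2 in 1.
print(compute_k_occurrence_distribution(top_k))  # [2 3 1]
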
def compute_skewness(dataset: np.ndarray, k: int, metric: str) -> float:
    # For cosine distance, we assume the data was already normalized.
    if metric in ["cosine", "angular", "ip"]:
        index = faiss.IndexFlatIP(dataset.shape[1])
    else:
        index = faiss.IndexFlatL2(dataset.shape[1])

    index.add(dataset)
    _, top_k_indices = index.search(dataset, k=k)

    k_occurrence_distribution = compute_k_occurrence_distribution(
        top_k_indices=top_k_indices
    )

    # Biased sample skewness: g1 = mean((N_k - mean)^3) / std^3.
    mean = np.mean(k_occurrence_distribution)
    std_dev = np.std(k_occurrence_distribution)
    denominator = len(k_occurrence_distribution) * (std_dev**3)
    skewness = (np.sum((k_occurrence_distribution - mean) ** 3)) / denominator

    return skewness

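The quantity computed above is the biased sample skewness g1. A minimal cross-check sketch, assuming scipy is installed (the script itself does not depend on it):

import numpy as np
from scipy.stats import skew

x = np.random.poisson(lam=100, size=10000).astype(float)
# Manual g1, written the same way as in compute_skewness above.
manual = np.sum((x - x.mean()) ** 3) / (len(x) * x.std() ** 3)
assert np.isclose(manual, skew(x, bias=True))
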
if __name__ == "__main__":
    # Compute the hubness scores for all given datasets and
    # save them in a dictionary as JSON.
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-path", type=str, required=True)
    parser.add_argument("--dataset-names", type=str, nargs="+", required=True)
    parser.add_argument("--k", type=int, default=100)
    parser.add_argument("--save-file", type=str, required=True)

    args = parser.parse_args()

    file_path = os.path.join(args.base_path, args.save_file)

    dataset_names = args.dataset_names
    for dataset_name in dataset_names:
        print(f"Computing hubness score for {dataset_name}")

        dataset_path = os.path.join(
            args.base_path, dataset_name, f"{dataset_name}.train.npy"
        )
        # Dataset names look like normal-<dim>-<metric>; "angular" maps to cosine.
        _, dimension, metric = dataset_name.split("-")
        metric = metric if metric == "euclidean" else "cosine"

        dataset = np.load(dataset_path)
        dataset = dataset.astype(np.float32, copy=False)  # faiss requires float32

        skewness = compute_skewness(dataset=dataset, k=args.k, metric=metric)
        print(f"Skewness: {skewness}")

        # Read the existing data from the JSON file
        if os.path.exists(file_path):
            with open(file_path, "r") as file:
                hubness_scores = json.load(file)
        else:
            hubness_scores = {}

        # Update the dictionary with the new hubness score
        hubness_scores[dataset_name] = skewness

        # Write the updated dictionary back to the JSON file
        with open(file_path, "w") as file:
            json.dump(hubness_scores, file, indent=4)
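
A hypothetical invocation of this script (the file name compute_hubness.py is a placeholder; the flags match the argparse definition above, and the dataset names must match directories produced by the generation script):

python compute_hubness.py --base-path ./data --dataset-names normal-32-angular normal-32-euclidean --k 100 --save-file hubness_scores.json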