Commit f482980 (1 parent: 8476973). Showing 9 changed files with 642 additions and 200 deletions.
New file (186 lines): a script that generates synthetic ANN benchmark datasets (IID normal and Cauchy) together with their ground-truth k nearest neighbors.
import os
import argparse

import numpy as np
import faiss
from sklearn.neighbors import NearestNeighbors

def generate_iid_normal_dataset(
    num_samples: int,
    num_dimensions: int,
    num_queries: int,
    k: int,
    directory_path: str,
    dataset_name: str,
    metric: str = "cosine",
):
    """
    Generates a dataset with the specified number of samples and dimensions using
    the standard normal distribution.
    Separates a subset for queries and computes their true k nearest neighbors.

    :param num_samples: Number of samples in the dataset.
    :param num_dimensions: Number of dimensions for each sample.
    :param num_queries: Number of queries to be separated from the dataset.
    :param k: The number of nearest neighbors to find.
    :param directory_path: Base path to save the dataset, queries, and ground truth labels.
    :param dataset_name: Name of the dataset (should be something like normal-10-angular).
    :param metric: Metric to use for computing nearest neighbors.
    """

    def normalize_rows_inplace(matrix):
        for row in matrix:
            norm = np.linalg.norm(row)
            norm = norm if norm > 0 else 1e-30  # guard against division by zero
            row /= norm

    def add_data_in_batches(index, data, batch_size=10000):
        for i in range(0, data.shape[0], batch_size):
            index.add(data[i : i + batch_size])

    # faiss only accepts float32 input, so cast up front.
    dataset = np.random.normal(size=(num_samples, num_dimensions)).astype(np.float32)
    np.random.shuffle(dataset)
    query_set = dataset[:num_queries]
    dataset_without_queries = dataset[num_queries:]

    if metric in ["cosine", "angular", "ip"]:
        # On unit-norm vectors, maximum inner product equals cosine similarity.
        normalize_rows_inplace(dataset_without_queries)
        normalize_rows_inplace(query_set)
        print("Finished normalizing")
        index = faiss.IndexFlatIP(dataset.shape[1])
    else:
        index = faiss.IndexFlatL2(dataset.shape[1])

    add_data_in_batches(index, dataset_without_queries)

    print("kNN search")
    _, ground_truth_labels = index.search(query_set, k=k)

    if np.any(ground_truth_labels < 0):
        raise ValueError("Indices cannot be negative")

    # Create directory if it doesn't exist
    os.makedirs(directory_path, exist_ok=True)

    ground_truth_labels = ground_truth_labels.astype(np.int32, copy=False)

    print("Saving dataset")
    np.save(f"{directory_path}/{dataset_name}.train.npy", dataset_without_queries)
    np.save(f"{directory_path}/{dataset_name}.test.npy", query_set)
    np.save(f"{directory_path}/{dataset_name}.gtruth.npy", ground_truth_labels)

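The normalization step is what makes IndexFlatIP correct for the angular datasets: on unit-norm vectors, the inner product equals cosine similarity, so a maximum-inner-product index returns cosine nearest neighbors. A minimal standalone sanity-check sketch of that identity:

import numpy as np

a = np.random.normal(size=8)
b = np.random.normal(size=8)
a /= np.linalg.norm(a)
b /= np.linalg.norm(b)
cosine_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
# On unit vectors, inner product == cosine similarity (up to float error).
assert np.isclose(np.dot(a, b), cosine_sim)
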
def generate_cauchy_dataset(
    num_samples: int,
    num_dimensions: int,
    num_queries: int,
    k: int,
    base_path: str,
    metric: str = "minkowski",
    p: int = 2,
):
    """
    Generates a dataset with the specified number of samples and dimensions using
    the Cauchy distribution.
    Separates a subset for queries and computes their true k nearest neighbors.

    :param num_samples: Number of samples in the dataset.
    :param num_dimensions: Number of dimensions for each sample.
    :param num_queries: Number of queries to be separated from the dataset.
    :param k: The number of nearest neighbors to find.
    :param base_path: Base path to save the dataset, queries, and ground truth labels.
    :param metric: Metric to use for computing nearest neighbors.
    :param p: Parameter for the metric.

    NOTE: metric="minkowski" and p=2 is equivalent to Euclidean distance.
    See: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
    """
    # Generate the dataset
    dataset = np.random.standard_cauchy(size=(num_samples, num_dimensions))

    # Separate out a subset for queries
    np.random.shuffle(dataset)
    query_set = dataset[:num_queries]
    dataset_without_queries = dataset[num_queries:]

    # Compute the true k nearest neighbors for the query set
    nbrs = NearestNeighbors(n_neighbors=k, algorithm="brute", p=p, metric=metric).fit(
        dataset_without_queries
    )
    ground_truth_labels = nbrs.kneighbors(query_set, return_distance=False)

    # Save the dataset without queries, the queries, and the ground truth labels.
    # Note: base_path must already exist; this function does not create it.
    np.save(f"{base_path}/train.npy", dataset_without_queries.astype(np.float32))
    np.save(f"{base_path}/test.npy", query_set.astype(np.float32))
    np.save(f"{base_path}/ground_truth.npy", ground_truth_labels.astype(np.int32))

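A hypothetical invocation of this function (the directory name is a placeholder, and base_path must be created beforehand, since the function does not create it):

import os

base_path = "./data/cauchy-32"  # placeholder path
os.makedirs(base_path, exist_ok=True)
generate_cauchy_dataset(
    num_samples=10000,
    num_dimensions=32,
    num_queries=100,
    k=100,
    base_path=base_path,
)
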
def check_datasets_exists(base_path: str, dataset_name: str) -> bool:
    train_path = os.path.join(base_path, f"{dataset_name}.train.npy")
    queries = os.path.join(base_path, f"{dataset_name}.test.npy")
    ground_truth = os.path.join(base_path, f"{dataset_name}.gtruth.npy")

    return all(
        os.path.exists(path) for path in (train_path, queries, ground_truth)
    )

def parse_arguments() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-path", type=str, required=True)
    parser.add_argument("--dataset-size", type=int, required=True)
    parser.add_argument("--num-queries", type=int, required=True)
    parser.add_argument("--dimensions", type=int, nargs="+", required=True)
    parser.add_argument("--k", type=int, default=100)

    return parser.parse_args()

if __name__ == "__main__":
    args = parse_arguments()
    base_path = args.base_path
    dimensions = args.dimensions

    DATASET_NAMES = [f"normal-{d}-angular" for d in dimensions]
    DATASET_NAMES += [f"normal-{d}-euclidean" for d in dimensions]

    # Create the datasets. First create the directory if it doesn't exist
    for dataset_name in DATASET_NAMES:
        directory_path = os.path.join(base_path, dataset_name)

        if check_datasets_exists(directory_path, dataset_name):
            print(f"Dataset {dataset_name} already exists. Skipping...")
            continue
        os.makedirs(directory_path, exist_ok=True)

        print(f"Generating dataset: {dataset_name}")

        # Dataset names look like normal-<dim>-<metric>; "angular" maps to cosine.
        _, dimension, metric = dataset_name.split("-")
        metric = metric if metric == "euclidean" else "cosine"
        # Generate the datasets
        generate_iid_normal_dataset(
            num_samples=args.dataset_size,
            num_dimensions=int(dimension),
            num_queries=args.num_queries,
            k=args.k,
            directory_path=directory_path,
            dataset_name=dataset_name,
            metric=metric,
        )
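
A hypothetical command line for this script (the file name generate_datasets.py is a placeholder; the flags are those defined in parse_arguments above):

python generate_datasets.py --base-path ./data --dataset-size 1000000 --num-queries 10000 --dimensions 16 32 64 --k 100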
New file (100 lines): a script that computes a hubness score for each dataset, defined as the skewness of the k-occurrence distribution, and saves the scores as JSON.
import json
import os
import argparse

import numpy as np
import faiss

def compute_k_occurrence_distribution(top_k_indices: np.ndarray) -> np.ndarray:
    """
    Computes the distribution of k-occurrences for each node in the given array.

    :param top_k_indices: array of shape (dataset_size, k) containing the indices of
        the k nearest neighbors for each node.
    :return: array of shape (dataset_size,) containing the k-occurrence count for
        each node (N_k).
    """
    # Validate indices: a negative index means the search failed.
    if np.any(top_k_indices < 0):
        raise ValueError("Indices cannot be negative")

    dataset_size = top_k_indices.shape[0]
    k_occurrence_distribution = np.zeros(dataset_size, dtype=int)

    flattened_indices = top_k_indices.flatten()
    unique_indices, counts = np.unique(flattened_indices, return_counts=True)
    k_occurrence_distribution[unique_indices] = counts

    return k_occurrence_distribution

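A toy example of the k-occurrence count N_k (how many neighbor lists each point appears in), run against the function above:

import numpy as np

top_k = np.array([[1, 2], [0, 1], [0, 1]])  # 3 points, k = 2
# Point 0 appears in 2 neighbor lists, point 1 in 3, point 2 in 1.
print(compute_k_occurrence_distribution(top_k))  # [2 3 1]
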
def compute_skewness(dataset: np.ndarray, k: int, metric: str) -> float:
    # For cosine distance, we assume the data was already normalized.
    if metric in ["cosine", "angular", "ip"]:
        index = faiss.IndexFlatIP(dataset.shape[1])
    else:
        index = faiss.IndexFlatL2(dataset.shape[1])

    index.add(dataset)
    _, top_k_indices = index.search(dataset, k=k)

    k_occurrence_distribution = compute_k_occurrence_distribution(
        top_k_indices=top_k_indices
    )

    # Biased sample skewness: g1 = mean((N_k - mean)^3) / std^3.
    mean = np.mean(k_occurrence_distribution)
    std_dev = np.std(k_occurrence_distribution)
    denominator = len(k_occurrence_distribution) * (std_dev**3)
    skewness = (np.sum((k_occurrence_distribution - mean) ** 3)) / denominator

    return skewness

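The quantity computed above is the biased sample skewness g1. A minimal cross-check sketch, assuming scipy is installed (the script itself does not depend on it):

import numpy as np
from scipy.stats import skew

x = np.random.poisson(lam=100, size=10000).astype(float)
# Manual g1, written the same way as in compute_skewness above.
manual = np.sum((x - x.mean()) ** 3) / (len(x) * x.std() ** 3)
assert np.isclose(manual, skew(x, bias=True))
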
if __name__ == "__main__":
    # Compute the hubness scores for all given datasets and
    # save them in a dictionary as JSON.
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-path", type=str, required=True)
    parser.add_argument("--dataset-names", type=str, nargs="+", required=True)
    parser.add_argument("--k", type=int, default=100)
    parser.add_argument("--save-file", type=str, required=True)

    args = parser.parse_args()

    file_path = os.path.join(args.base_path, args.save_file)

    dataset_names = args.dataset_names
    for dataset_name in dataset_names:
        print(f"Computing hubness score for {dataset_name}")

        dataset_path = os.path.join(
            args.base_path, dataset_name, f"{dataset_name}.train.npy"
        )
        # Dataset names look like normal-<dim>-<metric>; "angular" maps to cosine.
        _, dimension, metric = dataset_name.split("-")
        metric = metric if metric == "euclidean" else "cosine"

        dataset = np.load(dataset_path)
        dataset = dataset.astype(np.float32, copy=False)  # faiss requires float32

        skewness = compute_skewness(dataset=dataset, k=args.k, metric=metric)
        print(f"Skewness: {skewness}")

        # Read the existing data from the JSON file
        if os.path.exists(file_path):
            with open(file_path, "r") as file:
                hubness_scores = json.load(file)
        else:
            hubness_scores = {}

        # Update the dictionary with the new hubness score
        hubness_scores[dataset_name] = skewness

        # Write the updated dictionary back to the JSON file
        with open(file_path, "w") as file:
            json.dump(hubness_scores, file, indent=4)
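
A hypothetical invocation of this script (the file name compute_hubness.py is a placeholder; the flags match the argparse definition above, and the dataset names must match directories produced by the generation script):

python compute_hubness.py --base-path ./data --dataset-names normal-32-angular normal-32-euclidean --k 100 --save-file hubness_scores.json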