# demo.py

# Clustering example using LFW data:
import os
import pandas as pd
from matplotlib import pyplot as plt
import argparse
import json
import scipy.io as sio
from clustering import cluster
from evaluation import calculate_pairwise_pr


def plot_histogram(lfw_dir):
    """
    Plot the distribution of cluster sizes (entries per folder) in LFW.

    Every directory found while walking ``lfw_dir`` is treated as one
    person, and the number of entries inside it is taken as that person's
    cluster size. The number of folders, the number of folders holding
    exactly one entry and exactly two entries are printed, then a histogram
    of the sizes is shown.

    Input:
        lfw_dir: str
            Directory that is walked (recursively) with ``os.walk``.
    Output:
        None. If no sub-directory is found only the summary lines are
        printed and no figure is drawn.
    """
    filecount_dict = {}
    for root, dirs, _files in os.walk(lfw_dir):
        for dirname in dirs:
            # Keyed by the bare directory name, so equally named folders at
            # different depths overwrite one another.
            n_photos = len(os.listdir(os.path.join(root, dirname)))
            filecount_dict[dirname] = n_photos
    print("No of unique people: {}".format(len(filecount_dict)))
    # Materialise the items view into a list of (name, count) rows; some
    # older pandas releases reject a raw dict view on Python 3.
    df = pd.DataFrame(list(filecount_dict.items()), columns=['Name', 'Count'])
    print("Singletons : {}\nTwo :{}\n".format((df['Count'] == 1).sum(),
                                              (df['Count'] == 2).sum()))
    if df.empty:
        # Nothing to plot; taking the max of an empty column would raise.
        return
    # One bin per possible size, but never fewer than one bin (all folders
    # may be empty, giving a maximum count of 0).
    n_bins = max(int(df['Count'].max()), 1)
    plt.hist(df['Count'], bins=n_bins)
    plt.title('Cluster Sizes')
    plt.xlabel('No of images in folder')
    plt.ylabel('No of folders')
    plt.show()


def approximate_rank_order_clustering(vectors, n_neighbors=200, thresh=None):
    """
    Cluster the input vectors by delegating to ``clustering.cluster``.

    Input:
        vectors:
            Passed unchanged as the first argument of ``cluster``.
        n_neighbors: int
            Forwarded as the ``n_neighbors`` keyword. Defaults to 200.
        thresh: list or None
            Forwarded as the ``thresh`` keyword. ``None`` (the default)
            selects ``[1.1]``.
    Output:
        Whatever ``cluster`` returns. The script section of this file
        indexes it as a sequence of dicts with "clusters" and "threshold"
        keys.
    """
    if thresh is None:
        # Build the default per call so no list is shared between calls.
        thresh = [1.1]
    return cluster(vectors, n_neighbors=n_neighbors, thresh=thresh)


def evaluate_clusters(clusters, labels_lookup):
    """
    Compute, print and return the pairwise F1 score of a clustering.

    Pairwise precision and recall are obtained from
    ``calculate_pairwise_pr``; both are printed together with the F1 score,
    but only the F1 score is returned.

    Input:
        clusters: list of lists
            Each list contains a set of integers that correspond to a
            particular image in the LFW dataset.
        labels_lookup: dict
            A dictionary where the keys are row numbers and the values are
            labels.
    Output:
        f1_score: float
            Harmonic mean of pairwise precision and pairwise recall, or 0.0
            when both are zero.

    Printed only (not returned):
        pairwise precision: fraction of pairs of samples within a cluster
            that belong to one identity.
        pairwise recall: fraction of same-identity pairs that are placed in
            the same cluster, over the total number of same-identity pairs
            in the dataset.
    """
    precision, recall = calculate_pairwise_pr(clusters, labels_lookup)
    denominator = precision + recall
    # The harmonic mean is undefined when precision and recall are both
    # zero; report 0.0 instead of raising ZeroDivisionError.
    if denominator:
        f1_score = 2 * precision * recall / denominator
    else:
        f1_score = 0.0
    print("Precision : {}\nRecall : {}\nf1_score : {}".format(precision,
                                                              recall,
                                                              f1_score
                                                              ))
    print("---------------------------------------------------------")
    return f1_score


def create_labels_lookup(labels):
    """
    Map every row number to its integer label.

    ``labels`` is iterated in order. The position of an entry becomes the
    key (row number); the first element of the entry, converted with
    ``int``, becomes the value (label).
    """
    return {row: int(entry[0][:]) for row, entry in enumerate(labels)}


def main():
    """
    Command-line entry point of the clustering demo.

    When ``--vector_file`` is given, the .mat file is loaded, its
    'features' are clustered, the clusters found for the first threshold
    are written to data/clusters.json, and precision / recall / F1 are
    printed for every threshold. Without ``--vector_file`` nothing is done
    after argument parsing.
    """
    parser = argparse.ArgumentParser('Approximate Rank Order Clustering Demo')
    parser.add_argument('--lfw_path', required=True,
                        help='Enter tha directory where LFW images are saved.')
    parser.add_argument('-v', '--vector_file', required=False,
                        help="Path to where the vectors to be clustered are saved.")
    args = vars(parser.parse_args())
    # Uncomment to inspect the distribution of folder sizes first:
    # plot_histogram(args['lfw_path'])
    if not args['vector_file']:
        return

    mat_contents = sio.loadmat(args['vector_file'])
    vectors = mat_contents['features']
    labels = mat_contents['labels_original'][0]
    clusters_thresholds = approximate_rank_order_clustering(vectors)

    # Only the clustering obtained for the first threshold is saved.
    clusters_at_th = clusters_thresholds[0]
    clusters_to_be_saved = {}
    # The loop variable must not be called `cluster`: that would hide the
    # imported clustering function of the same name.
    for i, members in enumerate(clusters_at_th["clusters"]):
        # Plain ints so that the members are JSON serialisable.
        clusters_to_be_saved[i] = [int(x) for x in members]

    # Create the output directory on first use instead of failing in open().
    if not os.path.isdir("data"):
        os.makedirs("data")
    with open("data/clusters.json", "w") as out_file:
        json.dump(clusters_to_be_saved, out_file)

    labels_lookup = create_labels_lookup(labels)
    for result in clusters_thresholds:
        print("No of clusters: {}".format(len(result['clusters'])))
        print("Threshold : {}".format(result['threshold']))
        evaluate_clusters(result['clusters'], labels_lookup)


if __name__ == '__main__':
    main()