diff --git a/runMinHashExample.py b/runMinHashExample.py
index dd5089d..c5800b9 100644
--- a/runMinHashExample.py
+++ b/runMinHashExample.py
@@ -1,79 +1,72 @@
-# ======== runMinHashExample =======
-# This example code demonstrates comparing documents using the MinHash
-# approach.
-#
-# First, each document is represented by the set of shingles it contains. The
-# documents can then be compared using the Jaccard similarity of their
-# shingle sets. This is computationally expensive, however, for large numbers
-# of documents.
-#
-# For comparison, we will also use the MinHash algorithm to calculate short
-# signature vectors to represent the documents. These MinHash signatures can
-# then be compared quickly by counting the number of components in which the
-# signatures agree. We'll compare all possible pairs of documents, and find
-# the pairs with high similarity.
-#
-# The program follows these steps:
-# 1. Convert each test file into a set of shingles.
-#    - The shingles are formed by combining three consecutive words together.
-#    - Shingles are mapped to shingle IDs using the CRC32 hash.
-# 2. Calculate all Jaccard similarities directly.
-#    - This is ok for small dataset sizes. For the full 10,000 articles, it
-#      takes 20 minutes!
-# 3. Calculate the MinHash signature for each document.
-#    - The MinHash algorithm is implemented using the random hash function
-#      trick which prevents us from having to explicitly compute random
-#      permutations of all of the shingle IDs. For further explanation, see
-#      section 3.3.5 of http://infolab.stanford.edu/~ullman/mmds/ch3.pdf
-# 4. Compare all MinHash signatures to one another.
-#    - Compare MinHash signatures by counting the number of components in which
-#      the signatures are equal. Divide the number of matching components by
-#      the signature length to get a similarity value.
-#    - Display pairs of documents / signatures with similarity greater than a
-#      threshold.
-
+"""
+This example code demonstrates comparing documents using the MinHash approach.
+
+First, each document is represented by the set of shingles it contains. The
+documents can then be compared using the Jaccard similarity of their
+shingle sets. This is computationally expensive, however, for large numbers
+of documents.
+
+For comparison, we will also use the MinHash algorithm to calculate short
+signature vectors to represent the documents. These MinHash signatures can
+then be compared quickly by counting the number of components in which the
+signatures agree. We'll compare all possible pairs of documents, and find
+the pairs with high similarity.
+
+The program follows these steps:
+1. Convert each test file into a set of shingles.
+   - The shingles are formed by combining three consecutive words together.
+   - Shingles are mapped to shingle IDs using the CRC32 hash.
+2. Calculate all Jaccard similarities directly.
+   - This is ok for small dataset sizes. For the full 10,000 articles, it
+     takes 20 minutes!
+3. Calculate the MinHash signature for each document.
+   - The MinHash algorithm is implemented using the random hash function
+     trick which prevents us from having to explicitly compute random
+     permutations of all of the shingle IDs. For further explanation, see
+     section 3.3.5 of http://infolab.stanford.edu/~ullman/mmds/ch3.pdf
+4. Compare all MinHash signatures to one another.
+   - Compare MinHash signatures by counting the number of components in which
+     the signatures are equal. Divide the number of matching components by
+     the signature length to get a similarity value.
+   - Display pairs of documents / signatures with similarity greater than a
+     threshold.
+"""
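As a concrete illustration of steps 1 and 2 in the docstring, here is a standalone sketch (not part of the patch; the two sentences below are made-up toy inputs) of 3-word shingling and the exact Jaccard computation the script performs:

    # Toy illustration of steps 1-2: 3-word shingles and exact Jaccard.
    def shingle_set(text):
        words = text.split(" ")
        return set(" ".join(words[i:i + 3]) for i in range(len(words) - 2))

    a = shingle_set("the quick brown fox jumps over the lazy dog")
    b = shingle_set("the quick brown fox leaps over the lazy dog")
    # J(a, b) = |intersection| / |union|
    jaccard = len(a & b) / float(len(a | b))   # 4 shared / 10 distinct = 0.4

Changing a single word perturbs the three shingles that contain it, which is why word-level shingles are sensitive to small edits while still giving high similarity for near-duplicates.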
 from __future__ import division
-import os
-import re
+import sys
 import random
 import time
 import binascii
-from bisect import bisect_right
-from heapq import heappop, heappush
 
 # This is the number of components in the resulting MinHash signatures.
 # Correspondingly, it is also the number of random hash functions that
 # we will need in order to calculate the MinHash.
-numHashes = 10;
+num_hashes = 10
 
 # You can run this code for different portions of the dataset.
 # It ships with data set sizes 100, 1000, 2500, and 10000.
-numDocs = 1000
-dataFile = "./data/articles_" + str(numDocs) + ".train"
-truthFile = "./data/articles_" + str(numDocs) + ".truth"
+num_docs = 1000
+data_file = "./data/articles_" + str(num_docs) + ".train"
+truth_file = "./data/articles_" + str(num_docs) + ".truth"
 
 # =============================================================================
 # Parse The Ground Truth Tables
 # =============================================================================
 # Build a dictionary mapping the document IDs to their plagiaries, and vice-
 # versa.
-plagiaries = {}
+plagiaries = dict()
 
-# Open the truth file.
-f = open(truthFile, "rU")
-
-# For each line of the files...
-for line in f:
-
-  # Strip the newline character, if present.
-  if line[-1] == '\n':
-    line = line[0:-1]
-
-  docs = line.split(" ")
-
-  # Map the two documents to each other.
-  plagiaries[docs[0]] = docs[1]
-  plagiaries[docs[1]] = docs[0]
+with open(truth_file, "rU") as f:
+    for line in f:
+        # Strip the newline character, if present.
+        if line[-1] == '\n':
+            line = line[0:-1]
+
+        docs = line.split(" ")
+
+        # Map the two documents to each other.
+        plagiaries[docs[0]] = docs[1]
+        plagiaries[docs[1]] = docs[0]
 
 # =============================================================================
 # Convert Documents To Sets of Shingles
@@ -81,69 +74,66 @@
 
 print "Shingling articles..."
 
-# The current shingle ID value to assign to the next new shingle we
-# encounter. When a shingle gets added to the dictionary, we'll increment this
-# value.
-curShingleID = 0
-
-# Create a dictionary of the articles, mapping the article identifier (e.g.,
+# Create a dictionary of the articles, mapping the article identifier (e.g.,
 # "t8470") to the list of shingle IDs that appear in the document.
-docsAsShingleSets = {};
-
-# Open the data file.
-f = open(dataFile, "rU")
-
-docNames = []
-
-t0 = time.time()
-
-totalShingles = 0
-
-for i in range(0, numDocs):
-
-  # Read all of the words (they are all on one line) and split them by white
-  # space.
-  words = f.readline().split(" ")
-
-  # Retrieve the article ID, which is the first word on the line.
-  docID = words[0]
-
-  # Maintain a list of all document IDs.
-  docNames.append(docID)
-
-  del words[0]
-
-  # 'shinglesInDoc' will hold all of the unique shingle IDs present in the
-  # current document. If a shingle ID occurs multiple times in the document,
-  # it will only appear once in the set (this is a property of Python sets).
-  shinglesInDoc = set()
-
-  # For each word in the document...
-  for index in range(0, len(words) - 2):
-    # Construct the shingle text by combining three words together.
-    shingle = words[index] + " " + words[index + 1] + " " + words[index + 2]
-
-    # Hash the shingle to a 32-bit integer.
-    crc = binascii.crc32(shingle) & 0xffffffff
-
-    # Add the hash value to the list of shingles for the current document.
-    # Note that set objects will only add the value to the set if the set
-    # doesn't already contain it.
-    shinglesInDoc.add(crc)
-
-  # Store the completed list of shingles for this document in the dictionary.
-  docsAsShingleSets[docID] = shinglesInDoc
-
-  # Count the number of shingles across all documents.
-  totalShingles = totalShingles + (len(words) - 2)
-
-# Close the data file.
-f.close()
+doc_as_shingle_sets = dict()
+
+with open(data_file, 'rU') as f:
+    doc_names = list()
+    t0 = time.time()
+    total_shingles = 0
+
+    for i in xrange(num_docs):
+        # Read all of the words (they are all on one line) and split them by
+        # white space.
+        words = f.readline().split(' ')
+
+        # Retrieve the article ID, which is the first word on the line.
+        doc_id = words[0]
+
+        # Maintain a list of all document IDs.
+        doc_names.append(doc_id)
+
+        del words[0]
+
+        # 'shingles_in_doc' will hold all of the unique shingle IDs present in
+        # the current document. If a shingle ID occurs multiple times in the
+        # document, it will only appear once in the set (this is a property
+        # of Python sets).
+        shingles_in_doc = set()
+
+        # For each word in the document...
+        for index in range(0, len(words) - 2):
+            # Construct the shingle text by combining three words together.
+            shingle = words[index] + " " + words[index + 1] + " " + words[index + 2]
+
+            # Hash the shingle to a 32-bit integer.
+            crc = binascii.crc32(shingle) & 0xffffffff
+
+            # Add the hash value to the set of shingles for the current
+            # document. Note that set objects will only add the value to the
+            # set if the set doesn't already contain it.
+            shingles_in_doc.add(crc)
+
+        # Store the completed set of shingles for this document in the
+        # dictionary.
+        doc_as_shingle_sets[doc_id] = shingles_in_doc
+
+        # Count the number of shingles across all documents.
+        total_shingles = total_shingles + (len(words) - 2)
 
 # Report how long shingling took.
-print '\nShingling ' + str(numDocs) + ' docs took %.2f sec.' % (time.time() - t0)
+print '\nShingling ' + str(num_docs) + ' docs took %.2f sec.' % (time.time() - t0)
 
-print '\nAverage shingles per doc: %.2f' % (totalShingles / numDocs)
+print '\nAverage shingles per doc: %.2f' % (total_shingles / num_docs)
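One detail in the hunk above worth a note: under the Python 2 this script targets, binascii.crc32 can return a negative signed value, and the & 0xffffffff mask is what folds the result into the unsigned 32-bit shingle-ID range (Python 3's crc32 already returns an unsigned value and expects bytes). A minimal sketch:

    import binascii
    crc = binascii.crc32("a sample three-word shingle")  # may be negative on Python 2
    shingle_id = crc & 0xffffffff                        # folded into [0, 2**32 - 1]
    assert 0 <= shingle_id <= 2**32 - 1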
 
 # =============================================================================
 # Define Triangle Matrices
 # =============================================================================
@@ -156,79 +146,82 @@
 # the empty/invalid cells of a full matrix.
 
 # Calculate the number of elements needed in our triangle matrix
-numElems = int(numDocs * (numDocs - 1) / 2)
+numElems = int(num_docs * (num_docs - 1) / 2)
 
-# Initialize two empty lists to store the similarity values.
-# 'JSim' will be for the actual Jaccard Similarity values.
+# Initialize two empty lists to store the similarity values.
+# 'JSim' will be for the actual Jaccard Similarity values.
 # 'estJSim' will be for the estimated Jaccard Similarities found by comparing
 # the MinHash signatures.
 JSim = [0 for x in range(numElems)]
 estJSim = [0 for x in range(numElems)]
 
 # Define a function to map a 2D matrix coordinate into a 1D index.
-def getTriangleIndex(i, j):
-  # If i == j that's an error.
-  if i == j:
-    sys.stderr.write("Can't access triangle matrix with i == j")
-    sys.exit(1)
-  # If j < i just swap the values.
-  if j < i:
-    temp = i
-    i = j
-    j = temp
-
-  # Calculate the index within the triangular array.
-  # This fancy indexing scheme is taken from pg. 211 of:
-  # http://infolab.stanford.edu/~ullman/mmds/ch6.pdf
-  # But I adapted it for a 0-based index.
-  # Note: The division by two should not truncate, it
-  #       needs to be a float.
-  k = int(i * (numDocs - (i + 1) / 2.0) + j - i) - 1
-
-  return k
+
+
+def get_triangle_index(i, j):
+    """Map a 2D matrix coordinate (i, j) into an index into the 1D triangle
+    array."""
+    # If i == j that's an error.
+    if i == j:
+        sys.stderr.write("Can't access triangle matrix with i == j")
+        sys.exit(1)
+
+    # If j < i just swap the values.
+    if j < i:
+        i, j = j, i
+
+    # Calculate the index within the triangular array.
+    # This fancy indexing scheme is taken from pg. 211 of:
+    # http://infolab.stanford.edu/~ullman/mmds/ch6.pdf
+    # But I adapted it for a 0-based index.
+    # Note: The division by two should not truncate, it
+    #       needs to be a float.
+    k = int(i * (num_docs - (i + 1) / 2.0) + j - i) - 1
+
+    return k
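To sanity-check the indexing formula, here is a standalone worked example with the document count hard-coded to a hypothetical n = 4 (the function itself reads the num_docs global); each pair (i, j) with i < j should map to a distinct slot:

    # Worked check of the triangle indexing with a hypothetical n = 4 docs.
    n = 4

    def tri(i, j):
        return int(i * (n - (i + 1) / 2.0) + j - i) - 1

    pairs = [(i, j) for i in range(n) for j in range(i + 1, n)]
    # (0,1) (0,2) (0,3) (1,2) (1,3) (2,3)  ->  0 1 2 3 4 5
    assert [tri(i, j) for i, j in pairs] == list(range(n * (n - 1) // 2))

The formula packs the strict upper triangle row by row, so n*(n-1)/2 cells suffice instead of the n*n cells of a full matrix.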
 
 
 # =============================================================================
 # Calculate Jaccard Similarities
 # =============================================================================
 
-# In this section, we will directly calculate the Jaccard similarities by
+# In this section, we will directly calculate the Jaccard similarities by
 # comparing the sets. This is included here to show how much slower it is than
 # the MinHash approach.
 
 # Calculating the Jaccard similarities gets really slow for large numbers
 # of documents.
-if numDocs <= 2500:
-#if True:
+if num_docs <= 2500:
     print "\nCalculating Jaccard Similarities..."
 
     # Time the calculation.
     t0 = time.time()
 
     # For every document pair...
-  for i in range(0, numDocs):
-
-    # Print progress every 100 documents.
-    if (i % 100) == 0:
-      print "  (" + str(i) + " / " + str(numDocs) + ")"
-
-    # Retrieve the set of shingles for document i.
-    s1 = docsAsShingleSets[docNames[i]]
-
-    for j in range(i + 1, numDocs):
-      # Retrieve the set of shingles for document j.
-      s2 = docsAsShingleSets[docNames[j]]
-
+    for i in range(0, num_docs):
+        # Print progress every 100 documents.
+        if (i % 100) == 0:
+            print "  (" + str(i) + " / " + str(num_docs) + ")"
+
+        # Retrieve the set of shingles for document i.
+        s1 = doc_as_shingle_sets[doc_names[i]]
+
+        for j in range(i + 1, num_docs):
+            # Retrieve the set of shingles for document j.
+            s2 = doc_as_shingle_sets[doc_names[j]]
+
             # Calculate and store the actual Jaccard similarity.
-      JSim[getTriangleIndex(i, j)] = (len(s1.intersection(s2)) / len(s1.union(s2)))
+            jaccard = len(s1.intersection(s2)) / float(len(s1.union(s2)))
+            JSim[get_triangle_index(i, j)] = jaccard
 
     # Calculate the elapsed time (in seconds)
     elapsed = (time.time() - t0)
 
     print "\nCalculating all Jaccard Similarities took %.2fsec" % elapsed
 
-# Delete the Jaccard Similarities, since it's a pretty big matrix.
+# Delete the Jaccard Similarities, since it's a pretty big matrix.
 del JSim
 
 # =============================================================================
 # Generate MinHash Signatures
 # =============================================================================
@@ -239,10 +232,10 @@ def getTriangleIndex(i, j):
 print '\nGenerating random hash functions...'
 
 # Record the maximum shingle ID that we assigned.
-maxShingleID = 2**32-1
+maxShingleID = 2**32 - 1
 
 # We need the next largest prime number above 'maxShingleID'.
-# I looked this value up here:
+# I looked this value up here:
 # http://compoasso.free.fr/primelistweb/page/prime/liste_online_en.php
 nextPrime = 4294967311
 
@@ -253,155 +246,146 @@ def getTriangleIndex(i, j):
 # a prime number just greater than maxShingleID.
 
 # Generate a list of 'k' random coefficients for the random hash functions,
-# while ensuring that the same value does not appear multiple times in the
+# while ensuring that the same value does not appear multiple times in the
 # list.
-def pickRandomCoeffs(k):
-  # Create a list of 'k' random values.
-  randList = []
-
-  while k > 0:
-    # Get a random shingle ID.
-    randIndex = random.randint(0, maxShingleID)
-
-    # Ensure that each random number is unique.
-    while randIndex in randList:
-      randIndex = random.randint(0, maxShingleID)
-
-    # Add the random number to the list.
-    randList.append(randIndex)
-    k = k - 1
-
-  return randList
+def pick_random_coeffs(k):
+    """Return a list of 'k' unique random coefficients for the hash
+    functions."""
+    # Create a list of 'k' random values.
+    rand_list = list()
+
+    while k > 0:
+        # Get a random shingle ID.
+        rand_index = random.randint(0, maxShingleID)
+
+        # Ensure that each random number is unique.
+        while rand_index in rand_list:
+            rand_index = random.randint(0, maxShingleID)
+
+        # Add the random number to the list.
+        rand_list.append(rand_index)
+        k = k - 1
+
+    return rand_list
 
-# For each of the 'numHashes' hash functions, generate a different coefficient 'a' and 'b'.
-coeffA = pickRandomCoeffs(numHashes)
-coeffB = pickRandomCoeffs(numHashes)
+
+# For each of the 'num_hashes' hash functions, generate a different coefficient
+# 'a' and 'b'.
+coeffA = pick_random_coeffs(num_hashes)
+coeffB = pick_random_coeffs(num_hashes)
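These coefficients feed hash functions of the form h(x) = (a*x + b) % nextPrime. Because the modulus is a prime just above the largest shingle ID, each such function permutes the shingle-ID space rather than collapsing it. A sketch with made-up values of a and b (stand-ins for one coeffA/coeffB pair):

    p = 4294967311                   # nextPrime, from the hunk above
    a, b = 1500450271, 2860486313    # made-up stand-ins for one a/b pair

    def h(x):
        return (a * x + b) % p

    # With 0 < a < p, x -> (a*x + b) % p is a bijection on [0, p), so each h
    # behaves like one random reordering (permutation) of the shingle IDs.
    assert h(12345) != h(54321)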
 
 print '\nGenerating MinHash signatures for all documents...'
 
 # List of documents represented as signature vectors
 signatures = []
 
-# Rather than generating a random permutation of all possible shingles,
+# Rather than generating a random permutation of all possible shingles,
 # we'll just hash the IDs of the shingles that are *actually in the document*,
-# then take the lowest resulting hash code value. This corresponds to the index
+# then take the lowest resulting hash code value. This corresponds to the index
 # of the first shingle that you would have encountered in the random order.
 
 # For each document...
-for docID in docNames:
-
-  # Get the shingle set for this document.
-  shingleIDSet = docsAsShingleSets[docID]
-
-  # The resulting minhash signature for this document.
-  signature = []
-
-  # For each of the random hash functions...
-  for i in range(0, numHashes):
-
-    # For each of the shingles actually in the document, calculate its hash code
-    # using hash function 'i'.
-
-    # Track the lowest hash ID seen. Initialize 'minHashCode' to be greater than
-    # the maximum possible value output by the hash.
-    minHashCode = nextPrime + 1
-
+for doc_id in doc_names:
+
+    # Get the shingle set for this document.
+    shingleIDSet = doc_as_shingle_sets[doc_id]
+
+    # The resulting minhash signature for this document.
+    signature = []
+
+    # For each of the random hash functions...
+    for i in range(0, num_hashes):
+
+        # For each of the shingles actually in the document, calculate its
+        # hash code using hash function 'i'.
+
+        # Track the lowest hash ID seen. Initialize 'minHashCode' to be
+        # greater than the maximum possible value output by the hash.
+        minHashCode = nextPrime + 1
+
         # For each shingle in the document...
         for shingleID in shingleIDSet:
-      # Evaluate the hash function.
-      hashCode = (coeffA[i] * shingleID + coeffB[i]) % nextPrime
-
-      # Track the lowest hash code seen.
-      if hashCode < minHashCode:
-        minHashCode = hashCode
-
-    # Add the smallest hash code value as component number 'i' of the signature.
+            # Evaluate the hash function.
+            hashCode = (coeffA[i] * shingleID + coeffB[i]) % nextPrime
+
+            # Track the lowest hash code seen.
+            if hashCode < minHashCode:
+                minHashCode = hashCode
+
+        # Add the smallest hash code value as component number 'i' of the
+        # signature.
         signature.append(minHashCode)
-
-  # Store the MinHash signature for this document.
-  signatures.append(signature)
+
+    # Store the MinHash signature for this document.
+    signatures.append(signature)
 
 # Calculate the elapsed time (in seconds)
 elapsed = (time.time() - t0)
-
-print "\nGenerating MinHash signatures took %.2fsec" % elapsed
+
+print "\nGenerating MinHash signatures took %.2fsec" % elapsed
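The "random order" comment above can be made concrete: taking the minimum over the hashed shingle IDs is exactly taking the first shingle under the ordering that h imposes. A standalone sketch with made-up coefficients and shingle IDs:

    # 'Min over hash codes' is 'first element of the random ordering'.
    p = 4294967311
    a, b = 2971215073, 433494437     # made-up coefficients for the demo
    shingle_ids = {10, 52, 999, 3000000000}

    min_hash = min((a * x + b) % p for x in shingle_ids)
    first_in_order = min(shingle_ids, key=lambda x: (a * x + b) % p)
    assert min_hash == (a * first_in_order + b) % p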
 
 # =============================================================================
 # Compare All Signatures
-# =============================================================================
+# =============================================================================
+
+print '\nComparing all signatures...'
 
-print '\nComparing all signatures...'
-
-# Creates a N x N matrix initialized to 0.
 
 # Time this step.
 t0 = time.time()
 
 # For each of the test documents...
-for i in range(0, numDocs):
-  # Get the MinHash signature for document i.
-  signature1 = signatures[i]
-
-  # For each of the other test documents...
-  for j in range(i + 1, numDocs):
-
-    # Get the MinHash signature for document j.
-    signature2 = signatures[j]
-
+for i in range(0, num_docs):
+    # Get the MinHash signature for document i.
+    signature1 = signatures[i]
+
+    # For each of the other test documents...
+    for j in range(i + 1, num_docs):
+        # Get the MinHash signature for document j.
+        signature2 = signatures[j]
+
         count = 0
 
         # Count the number of positions in the minhash signature which are
         # equal.
-    for k in range(0, numHashes):
-      count = count + (signature1[k] == signature2[k])
-
-    # Record the percentage of positions which matched.
-    estJSim[getTriangleIndex(i, j)] = (count / numHashes)
+        for k in range(0, num_hashes):
+            count = count + (signature1[k] == signature2[k])
+
+        # Record the fraction of positions which matched.
+        estJSim[get_triangle_index(i, j)] = (count / num_hashes)
 
 # Calculate the elapsed time (in seconds)
 elapsed = (time.time() - t0)
-
-print "\nComparing MinHash signatures took %.2fsec" % elapsed
-
+
+print "\nComparing MinHash signatures took %.2fsec" % elapsed
+
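The estimate recorded above is just the fraction of matching signature components. Since any one component of two signatures matches with probability equal to the true Jaccard similarity (up to the approximation of using hash functions in place of true permutations), that fraction is a direct estimate of J. A sketch with two made-up 10-component signatures:

    # Estimated J is the fraction of components on which the signatures agree.
    sig1 = [12, 7, 33, 90, 4, 61, 18, 25, 77, 50]
    sig2 = [12, 9, 33, 90, 8, 61, 40, 25, 13, 50]
    matches = sum(c1 == c2 for c1, c2 in zip(sig1, sig2))
    est_j = matches / float(len(sig1))   # 6 of 10 components -> 0.6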
- print " %5s --> %5s %.2f %.2f" % (docNames[i], docNames[j], estJ, J) - - # Check whether this is a true positive or false positive. - # We don't need to worry about counting the same true positive twice - # because we implemented the for-loops to only compare each pair once. - if plagiaries[docNames[i]] == docNames[j]: - tp = tp + 1 - else: - fp = fp + 1 - -# Display true positive and false positive counts. -print -print "True positives: " + str(tp) + " / " + str(int(len(plagiaries.keys()) / 2)) -print "False positives: " + str(fp) + # Calculate the actual Jaccard similarity for validation. + s1 = doc_as_shingle_sets[doc_names[i]] + s2 = doc_as_shingle_sets[doc_names[j]] + J = (len(s1.intersection(s2)) / len(s1.union(s2))) + + # Check whether this is a true positive or false positive. + # We don't need to worry about counting the same true positive twice + # because we implemented the for-loops to only compare each pair once. + if plagiaries[doc_names[i]] == doc_names[j]: + tp = tp + 1 + else: + fp = fp + 1