build_index.py
import json
import re

import numpy as np
# Part A: word frequencies
def tokenize(doc):
    """Return the list of tokens in a document.

    A compiled regular expression scans the text in a single pass, so
    tokenization runs in O(N) for a document of length N.
    """
    # Words only, no pure numbers: a word character followed by letters.
    token_pattern = re.compile(r"(?u)\b\w[a-zA-Z]+\b")
    words = token_pattern.findall(doc)
    # Lowercase title-case words (e.g. sentence-initial capitals); leave
    # all-caps tokens such as acronyms unchanged.
    for i in range(len(words)):
        if words[i].istitle():
            words[i] = words[i].lower()
    return words
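
# Illustrative example (not part of the original file): with the pattern above,
# tokenize("The Crawler crawled 42 pages") yields
# ['the', 'crawler', 'crawled', 'pages'] -- title-case words are lowercased and
# the standalone number is dropped.
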
class Index:
    def __init__(self, path, bookkeeper):
        self.path = path                   # root directory of the corpus
        self.bookkeeper_json = bookkeeper  # path to the bookkeeping JSON
        self.bookkeeper_dict = dict()
        self.file_list = []
        with open(self.bookkeeper_json) as json_data:
            self.bookkeeper_dict = json.load(json_data)
        self.length = len(self.bookkeeper_dict)  # total number of documents
        self.inverted_index = dict()
        self.inverted_index_dict = dict()
        self.output_path = 'index.json'
    def build_index(self):
        print('Docs:', len(self.bookkeeper_dict))
        for file_key in self.bookkeeper_dict:          # file key looks like "X/XXX"
            file_path = self.path + file_key           # path of the raw file
            with open(file_path) as file_data:
                index_file = self.parse_count(file_data.read(), file_key)
            self.file_list.append(index_file)
        self.merge_sort(self.file_list)
        return
    def parse_count(self, content, file_key):
        """Tokenize one file and return its per-file index as a sorted list of items."""
        token_list = tokenize(content)
        token_dict = dict()
        for pos, token in enumerate(token_list):
            if token in token_dict:
                token_dict[token][1].append(pos)   # token -> (file_key, [positions])
            else:
                token_dict[token] = file_key, [pos]
        # Every entry shares the same file key, so this orders tokens by first occurrence.
        token_dict = sorted(token_dict.items(), key=lambda x: x[1], reverse=False)
        return token_dict
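
    # Illustrative example (made-up file key): for file_key "0/42" and content
    # "apple banana apple", parse_count returns
    #   [('apple', ('0/42', [0, 2])), ('banana', ('0/42', [1]))]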
    def merge_sort(self, index_list):
        """Merge the per-file indexes into one inverted index and write it to disk."""
        for index_file in index_list:
            for token, posting in index_file:
                if token in self.inverted_index:
                    self.inverted_index[token].append(posting)
                else:
                    self.inverted_index[token] = [posting]  # token -> list of (file_key, positions)
        print('Unique tokens:', len(self.inverted_index))
        num_docs = self.length
        for token, postings in self.inverted_index.items():
            index_dict = dict()
            # Inverse document frequency: log10(total docs / docs containing the token).
            index_dict['idf'] = np.log10(num_docs / len(postings))
            index_dict['docs'] = dict()
            for doc in postings:
                doc_info = dict()
                doc_info['pos'] = doc[1]        # positions of the token within this doc
                doc_info['tf'] = len(doc[1])    # term frequency = number of occurrences
                index_dict['docs'][doc[0]] = doc_info  # doc[0] is the bookkeeping file key
            self.inverted_index_dict[token] = index_dict
        with open('index_dict_part.json', 'w') as outfile:
            json.dump(self.inverted_index_dict, outfile)
        return
if __name__ == '__main__':
    print("start building index")
    path = '../WEBPAGES_CLEAN/'
    # bookkeeper_path = "/Users/yifantian/Desktop/Course/CS221/project3/SearchEngine/bookkeeping_single.json"
    bookkeeper_path = "/Users/yifantian/Desktop/Course/CS221/project3/SearchEngine/bookkeeping_part.json"
    # bookkeeper_path = "/Users/yifantian/Desktop/Course/CS221/project3/SearchEngine/bookkeeping.json"
    new_index = Index(path, bookkeeper_path)
    new_index.build_index()
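
# Sketch of the emitted structure (illustrative values and file key only):
# each token in index_dict_part.json maps to an entry such as
#   "crawler": {
#       "idf": 1.25,
#       "docs": {"0/42": {"pos": [3, 17], "tf": 2}}
#   }
# where "0/42" is a file key taken from the bookkeeping JSON and "tf" is the
# number of positions recorded for the token in that document.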