indexer.py
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Feb 25 09:27:41 2017
@author: evita
"""
import pickle
import os
from string import punctuation as punct
from string import maketrans
from nltk.corpus import stopwords
from urlparse import urlparse
try:
from BeautifulSoup import BeautifulSoup
except ImportError:
from bs4 import BeautifulSoup
import re
from nltk.stem.porter import PorterStemmer
from collections import defaultdict
import glob
import math
skipped_files_counter = 0  # counts files skipped because their URL is not valid
trans_table = maketrans(punct, ' '*len(punct))  # maps every punctuation character to a space
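# Overview of the indexing pipeline implemented below (summarised from the code):
#   1. create_postings_per_file: tokenize each crawled page and pickle its
#      {word: [positions]} dictionary into block folders under "postings_per_file/".
#   2. create_block_index: build one intermediate index per block folder and
#      pickle it under "new_block_indexes/".
#   3. merge_blocks_to_main_index: merge the block indexes into "final_merged_main_index.pkl".
#   4. TF_IDF: compute tf-idf scores per document and pickle them to "final_tfidf_index.pkl".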
def tokenize(text):
    '''
    Lower-cases the text, strips punctuation and non-printable/non-ASCII
    characters and removes English stopwords.
    '''
    tokens = []
    text = text.translate(trans_table)  # strip punctuation, source: http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
    for w in text.split():
        w = w.lower()
        w = ''.join(s for s in w if ord(s) > 31 and ord(s) < 126)  # remove non-ascii chars, source: http://stackoverflow.com/questions/20183669/remove-formatting-from-strings
        tokens.append(w)
    tokens = filter(None, tokens)  # drop empty strings
    stop_words = set(stopwords.words('english'))  # build the stopword set once instead of once per token
    filtered_words = [word for word in tokens if word not in stop_words]
    return filtered_words
def listdir_nohidden(path):
#source: http://stackoverflow.com/questions/7099290/how-to-ignore-hidden-files-using-os-listdir/14063074
#skip hidden files
return glob.glob(os.path.join(path, '*'))
def list_files(dir):
    '''
    Recursively collects all non-hidden, non-directory file paths under dir.
    '''
    r = []
    subdirs = [x[0] for x in os.walk(dir)]
    for subdir in subdirs:
        files = listdir_nohidden(subdir)
        for file in files:
            if not os.path.isdir(file):
                r.append(file)
    return r
def word_positions(filtered_words): #source: http://aakashjapi.com/fuckin-search-engines-how-do-they-work/
'''
returns: positions of each word in the document: {word: [pos1, pos2, etc.]}
'''
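    # e.g. ['web', 'search', 'web'] -> {'web': [0, 2], 'search': [1]}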
    position_dict = dict()
    for pos, word in enumerate(filtered_words):
        # append this position to the word's list, creating the list on first sight
        position_dict.setdefault(word, []).append(pos)
    return position_dict
def parse_html_and_tokenize(filename, urls, urlname):
    '''
    Parses an HTML file, records the anchor text of valid outgoing links in
    "all_urls_anchor.txt" and returns the stemmed, stopword-filtered tokens of
    the page text. Returns an empty list if the page's URL is not valid.
    '''
    stemmed_words = []
    urls_anchor = dict()
global skipped_files_counter
if is_valid(urlname):
soup = BeautifulSoup(open(filename), 'html.parser')
content_text = (soup.text).encode('utf-8').strip()
for link in soup.findAll('a', href=True):
url = link['href']
if is_valid(url) and url in urls:
anchor_text = link.get_text()
if anchor_text and anchor_text!= "(?)":
anchor_text = " ".join(anchor_text.split())
urls_anchor[url]= anchor_text
filtered_words = tokenize(content_text)
porter = PorterStemmer()
        try:
            stemmed_words = [porter.stem(word) for word in filtered_words]
        except Exception:  # if stemming fails, fall back to the unstemmed tokens
            stemmed_words = filtered_words
if urls_anchor:
with open("all_urls_anchor.txt","a+") as writer:
for key,value in urls_anchor.items():
writer.write('{},{}\n'.format(key, value))
return stemmed_words
else:
print filename, ": skipped"
skipped_files_counter +=1
return stemmed_words
def is_valid(url):
    '''
    Returns True if the url is within ics.uci.edu and does not have any of the
    below extensions (i.e. it is a valid page to index).
    '''
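    # illustrative examples (paths chosen only for demonstration):
    #   is_valid("http://www.ics.uci.edu/about/")   -> True
    #   is_valid("http://www.ics.uci.edu/doc.pdf")  -> False (excluded extension)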
# some regex: source: https://support.archive-it.org/hc/en-us/articles/208332963-Modify-your-crawl-scope-with-a-Regular-Expression
# remove php, jsp, asp, txt, c , py, java, h , cpp, cc
# and other kind of files
parsed = urlparse(url)
if parsed.scheme not in set(["http", "https"]):
return False
try:
return ".ics.uci.edu" in parsed.hostname \
and not re.match(".*\.(css|js|bmp|gif|jpe?g|ico" + "|png|tiff?|mid|mp2|mp3|mp4"\
+ "|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf" \
+ "|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso|epub|dll|cnf|tgz|sha1" \
+ "|thmx|mso|arff|rtf|jar|csv"\
+ "|php|jsp|asp|txt|c|py|java|h|cpp|cc"\
+ "|rm|smil|wmv|swf|wma|zip|rar|gz)$", parsed.path.lower()) \
and not re.match("^.*calendar.*$", url.lower())\
and not re.match("^.*php.*$", url.lower())\
and not re.match("^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$", url.lower())\
and not re.match("^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", url.lower())\
and (bool(urlparse(url).netloc) == True)
    except TypeError:
        print ("TypeError for ", parsed)
        return False
def create_postings_per_file(filenames, urls, filename_to_url_dict):
    '''
    Tokenizes every document and pickles its {word: [positions]} dictionary to
    "postings_per_file/<block>/<doc_id>.pkl", starting a new block folder
    every 500 documents.
    '''
    doc_set = set()
    counter = 0
    block = 0
    for f in sorted(filenames):
        file_name = f.split("/")
        length = len(file_name)
        doc_id = file_name[length - 2] + "_" + file_name[length - 1]  # doc id = "<folder>_<file>"
        url_name = filename_to_url_dict[doc_id]
        file_tokens = parse_html_and_tokenize(f, urls, url_name)
        if len(file_tokens) > 0:
            counter += 1  # counts the documents written into the current block folder
            non_sorted_tokens_positions = word_positions(file_tokens)
            doc_set.add(doc_id)
            if counter > 500:
                counter = 0
                block += 1  # change folder
            dir_to_write = "postings_per_file/" + str(block) + "/"
            if not os.path.exists(dir_to_write):
                os.makedirs(dir_to_write)
            with open(dir_to_write + doc_id + ".pkl", 'wb') as output:
                pickle.dump(non_sorted_tokens_positions, output)
    print "skipped files: ", skipped_files_counter
    print "total documents indexed: ", len(doc_set)
def create_block_index(directory):
    '''
    Builds one intermediate (block) index {term: [{doc_id: [positions]}, ...]}
    from all pickled per-document postings inside the given block folder and
    pickles it to "new_block_indexes/block_index_<block>.pkl".
    '''
    block_index = dict()
    filenames2 = list_files(directory)
    directory_name = directory.split("/")[-1]  # e.g. "postings_per_file/3" -> "3"
    print directory_name
    for f in sorted(filenames2):
        postings_dict_file = open(f, 'rb')
        try:
            postings_dict = pickle.load(postings_dict_file)
            file_name = f.split("/")
            length = len(file_name)
            doc_id = file_name[length - 1].split(".")[0]  # strip the ".pkl" extension
            postings_dict_file.close()
            for t in postings_dict.keys():  # merge into the block index
                if t in block_index:
                    # append the doc id and the positions of that term to its postings list
                    block_index[t].append({doc_id: postings_dict[t]})
                else:
                    # create a new postings list for the term
                    block_index[t] = [{doc_id: postings_dict[t]}]
        except Exception:
            print f, ": error"
            continue
    if not os.path.exists("new_block_indexes"):  # make sure the output folder exists
        os.makedirs("new_block_indexes")
    output = open("new_block_indexes/block_index_" + directory_name + ".pkl", 'wb')
    pickle.dump(block_index, output)
    output.close()
def merge_blocks_to_main_index():
    '''
    Merges the intermediate (block) indexes into a main index and pickles it
    to "final_merged_main_index.pkl".
    '''
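    # resulting shape (doc ids are illustrative): {'comput': [{'0_10': [3, 17]}, {'1_42': [5]}], ...}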
    blocks = list_files('new_block_indexes')
    main_index = defaultdict(list)
    counter = 0
    block_dict = list()
    for b in sorted(blocks):  # load every block index into memory
        block_dict_file = open(b, 'rb')
        block_index = pickle.load(block_dict_file)
        block_dict_file.close()
        block_dict.append(block_index)
    for dictionar in block_dict:  # merge each block index into the main index
        print "block: ", counter
        for term in dictionar:
            # defaultdict creates the term's list on first access, so a plain extend suffices
            main_index[term].extend(dictionar[term])
        counter += 1
    output = open("final_merged_main_index.pkl", 'wb')
    pickle.dump(main_index, output)
    output.close()
def TF_IDF(main_ind, N):
    '''
    Returns the final dictionary with the tf-idf value of each term in each
    document, {doc_id: [{term: tf-idf}, ...]}, and pickles it to
    "final_tfidf_index.pkl". N is the total number of indexed documents.
    '''
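    # tf-idf(t, d) = tf(t, d) * log10(N / df(t)), where tf(t, d) is the number of
    # occurrences of term t in document d and df(t) is the number of documents containing t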
    tf = dict()   # term frequency in each document
    df = dict()   # document frequency of each term
    idf = dict()  # inverse document frequency of each term
    doc_terms_tfidf = dict()
    for t in main_ind.keys():  # for each term t in the inverted index
        postings = main_ind[t]
        df[t] = len(postings)  # number of documents that contain t
        idf[t] = math.log10(float(N) / df[t])  # float() avoids Python 2 integer division
        for p in postings:  # each posting is {doc_id: [positions]}
            for d in p.keys():
                doc = d
                freq = len(p[d])  # tf = number of positions of t in this document
                tf[doc] = freq
                tf_idf_dict = {t: tf[doc] * idf[t]}
                if doc in doc_terms_tfidf:
                    doc_terms_tfidf[doc].append(tf_idf_dict)
                else:
                    doc_terms_tfidf[doc] = [tf_idf_dict]
    output = open("final_tfidf_index.pkl", 'wb')
    pickle.dump(doc_terms_tfidf, output)
    output.close()
    return doc_terms_tfidf
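# ---------------------------------------------------------------------------
# Hypothetical driver: a minimal sketch of how the functions above could be
# chained end to end. It is not part of the original module; WEBPAGES_DIR and
# BOOKKEEPING_PKL are assumed names for the crawled corpus folder and for a
# pickled {"<folder>_<file>": url} mapping produced elsewhere.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    WEBPAGES_DIR = "WEBPAGES_RAW"        # assumption: root folder of the crawled pages
    BOOKKEEPING_PKL = "bookkeeping.pkl"  # assumption: pickled {doc_id: url} dict
    with open(BOOKKEEPING_PKL, "rb") as f:
        filename_to_url = pickle.load(f)
    urls = set(filename_to_url.values())  # all crawled URLs, used for anchor-text lookups
    filenames = list_files(WEBPAGES_DIR)
    # 1. one pickled postings dict per document, chunked into block folders
    create_postings_per_file(filenames, urls, filename_to_url)
    # 2. one intermediate index per block folder
    for block_dir in sorted(listdir_nohidden("postings_per_file")):
        create_block_index(block_dir)
    # 3. merge the block indexes, then compute tf-idf scores
    merge_blocks_to_main_index()
    with open("final_merged_main_index.pkl", "rb") as f:
        main_index = pickle.load(f)
    # N: total number of documents, here approximated by the size of the bookkeeping dict
    TF_IDF(main_index, len(filename_to_url))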