-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathqueryIndex.py
126 lines (103 loc) · 3.39 KB
/
queryIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pickle
def intersect(l):
    '''Return the elements that are present in every list of l.

    I: list of lists of elements
    O: list of the elements which are in every list; each common element is
       reported once, in the order it first appears in the shortest list.
       An empty l gives [].
    '''
    if not l:
        return []
    # Walking the shortest list keeps the number of candidates minimal.
    ordered = sorted(l, key=len)
    shortest, others = ordered[0], ordered[1:]
    try:
        # Sets give constant-time membership tests when elements are hashable.
        lookups = [set(group) for group in ordered][1:]
    except TypeError:
        # Unhashable elements (e.g. lists): fall back to equality-based scans.
        lookups = others
    intersection = []
    for elem in shortest:
        # Membership is tested per list, so an element repeated inside a
        # single list can neither fake nor hide a match.
        if elem not in intersection and all(elem in lookup for lookup in lookups):
            intersection.append(elem)
    return intersection
def oneWordQuery(invertedIndex, query):
    '''Return the IDs of the articles in which the queried word occurs.

    I: query from terminal
    O: list of articles' IDs where term occurs; [] when the query yields no
       terms or when a term is missing from invertedIndex

    Each posting in invertedIndex[term] is unpacked as an (ID, positions)
    pair; only the ID is used here.
    '''
    # stem searched term
    # The parameter shares its name with the module; the import statement
    # resolves the module by name and does not touch the local variable.
    from invertedIndex import getTerms
    terms = getTerms(query)
    # Bound before the loop so a query that produces no terms returns []
    # instead of failing on an unbound name.
    IDs = []
    for term in terms:
        if term not in invertedIndex:
            return []
        # As before, a later term replaces the IDs gathered for an earlier one.
        IDs = [ID for ID, _pos in invertedIndex[term]]
    print(f'IDs {IDs}')
    return IDs
def freeTextQuery(invertedIndex, query):
    '''Collect the IDs of every article that contains at least one query term.

    I: Free Text Query from terminal
    O: list of articles' IDs in which some searched terms occur
       (duplicates removed; terms absent from the index are skipped)
    '''
    from invertedIndex import getTerms
    matches = []
    for word in getTerms(query):
        if word not in invertedIndex.keys():
            continue
        # Each posting is an (ID, positions) pair; only the ID matters here.
        matches.extend(docID for docID, _ in invertedIndex[word])
    IDs = list(set(matches))
    print(f'IDs {IDs}')
    return IDs
def phraseQueries(invertedIndex, query):
    '''Return the IDs of the articles that contain the queried phrase.

    I: phraseQuery from terminal with quotes
    O: list of articles' IDs in which the searched terms occur next to each
       other and in the queried order; [] when the query yields no terms;
       None when any term is missing from invertedIndex

    invertedIndex maps a term to postings of the form (ID, positions), where
    positions is an iterable of integer word positions inside that article.
    '''
    # The parameter shares its name with the module; the import statement
    # resolves the module by name and does not touch the local variable.
    from invertedIndex import getTerms
    # NOTE(review): the meaning of the second argument is defined by getTerms,
    # which is not visible here -- confirm it yields terms in query order.
    terms = getTerms(query, False)
    print(f'terms {terms}')

    # One {ID: set of positions} mapping per query term, in query order.
    perTerm = []
    for term in terms:
        if term not in invertedIndex:
            # A missing term means the phrase cannot occur anywhere.
            return None
        docPositions = {}
        for ID, pos in invertedIndex[term]:
            docPositions.setdefault(ID, set()).update(pos)
        perTerm.append(docPositions)

    legit = []
    if perTerm:
        first, rest = perTerm[0], perTerm[1:]
        # Only articles that contain every term can contain the phrase.
        candidates = set(first)
        for docPositions in rest:
            candidates.intersection_update(docPositions)
        for ID in candidates:
            # The phrase starts at `start` when the k-th following term sits
            # exactly k positions later. Checking each term against its own
            # positions keeps the word order significant and is unaffected by
            # other occurrences of the same words elsewhere in the article.
            for start in first[ID]:
                if all(start + offset in positions[ID]
                       for offset, positions in enumerate(rest, 1)):
                    legit.append(ID)
                    break
    print(f'legit {legit}')
    return legit