joogle/index.py at main · DaneMarc/joogle · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/bin/python3
import nltk
import sys
import getopt
import json
import math
import pickle

from pathlib import Path
from string import punctuation

def usage():
    print("usage: " + sys.argv[0] + " -i directory-of-documents -d dictionary-file -p postings-file")

def build_index(in_dir, out_dict, out_postings):
    """
    build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('indexing...')

    MEM_LIMIT = 100000 # Number of pairings
    HALF_LIMIT = MEM_LIMIT / 2
    STRING_MAX = '}' # Max string value as compared to dictionary terms
    mem = 0
    stemmer = nltk.PorterStemmer()
    dictionary = {}
    block_count = 0
    population = [] # Universal set

    for file in Path(in_dir).iterdir():
        population.append(int(file.name))
        with open(file) as doc:
            token_set = set() # Set is used to prevent duplicates
            tokens = nltk.word_tokenize(doc.read().lower()) # Tokenization and case folding

            # Stemming and punctuation stripping
            for tok in tokens:
                token_set.add(stemmer.stem(tok).strip(punctuation))

            # Removes the empty string from set - side effect of punctuation stripping
            if '' in token_set:
                token_set.remove('')

            mem += len(token_set) # Keeps track of 'memory'

            for tok in token_set:
                if tok in dictionary:
                    posting = dictionary[tok]
                    posting[0] += 1
                    posting[1].append(int(file.name))
                else:
                    dictionary[tok] = [1, [int(file.name)]]

            if mem > MEM_LIMIT:
                write_out(dictionary, block_count)
                block_count += 1
                mem = 0

    # Writes out any remaining dictionary items
    if len(dictionary) > 0:
        write_out(dictionary, block_count)
        block_count += 1

    files  = list(Path('.').glob('block*.txt'))
    terms = [] # Terms to be processed
    currs = [0] * block_count # File pointers
    final_dict = {}

    # Continuously reads chunks from blocks, merges and writes them out until all blocks have been fully read
    while len(files) > 0:
        lowest = read_chunks(files, currs, STRING_MAX, HALF_LIMIT, terms)
        merge_chunks(terms, dictionary)
        write_postings(out_postings, dictionary, lowest, final_dict)

    # Writes out the universal set and any remaining items
    dictionary['ALL'] = [len(population), population]
    write_postings(out_postings, dictionary, STRING_MAX, final_dict)

    # Writes out the final dictionary
    with open(out_dict, 'wb') as out_d:
        pickle.dump(final_dict, out_d)


# Writes out the filled dictionary onto the disk
def write_out(dictionary, block_count):
    sorted_dict = dict(sorted(dictionary.items()))

    with open('block' + str(block_count) + '.txt', 'w') as block_file:
        for item in sorted_dict.items():
            block_file.write(json.dumps(item) + '\n')

    dictionary.clear()


# Reads chunks from each block and stores them for later processing
def read_chunks(files, currs, string_max, half_limit, terms):
    lowest = string_max
    block_limit = half_limit / len(files) # Size limit for each block

    for i in reversed(range(len(files))):
        end = False
        with open(files[i]) as block:
            chunk_size = 0 # Keeps track of acquired chunk size
            block.seek(currs[i])
            line = block.readline()

            while line:
                term = json.loads(line)
                terms.append(term)
                chunk_size += term[1][0]

                # When the limit is reached, the current file position is recorded to allow later readings
                if chunk_size > block_limit:
                    currs[i] = block.tell()
                    if term[0] < lowest: # Records the smallest string among the chunks drawn this round
                        lowest = term[0]
                    break

                line = block.readline()

            # Checks if a block has reached the end
            if block.readline() == '':
                end = True

        if end:
            Path(files[i]).unlink()
            del files[i]
            del currs[i]

    return lowest


# Merges chunks into the main dictionary
def merge_chunks(terms, dictionary):
    for term in terms:
        if term[0] in dictionary:
            item = dictionary[term[0]]
            item[0] += term[1][0]
            item[1] += term[1][1]
        else:
            dictionary[term[0]] = term[1]

    terms.clear()


# Writes out the postings list of terms that have been merged from all blocks
def write_postings(out_postings, dictionary, lowest, final_dict):
    with open(out_postings, 'ab') as out_p:
        for k in list(dictionary.keys()):
            # If the term is smaller than the smallest last term read from all the blocks it has completed processing
            if k <= lowest:
                v = dictionary.pop(k)
                v[1].sort() # Sorts final posting list
                add_skips(v[1])
                final_dict[k] = (v[0], out_p.tell())
                pickle.dump(v[1], out_p)


# Adds skip pointers to the final postings lists
def add_skips(posting):
    posting_len = len(posting)

    if posting_len > 3:
        skip_len = math.floor(math.sqrt(posting_len))
        skip = 0
        for i in range(posting_len):
            if i == skip and (skip + skip_len) < posting_len:
                skip += skip_len
                posting[i] = (posting[i], skip)
            else:
                posting[i] = (posting[i], )
    else:
        for i in range(posting_len):
            posting[i] = (posting[i], )


input_directory = output_file_dictionary = output_file_postings = None

try:
    opts, args = getopt.getopt(sys.argv[1:], 'i:d:p:')
except getopt.GetoptError:
    usage()
    sys.exit(2)

for o, a in opts:
    if o == '-i': # input directory
        input_directory = a
    elif o == '-d': # dictionary file
        output_file_dictionary = a
    elif o == '-p': # postings file
        output_file_postings = a
    else:
        assert False, "unhandled option"

if input_directory == None or output_file_postings == None or output_file_dictionary == None:
    usage()
    sys.exit(2)

build_index(input_directory, output_file_dictionary, output_file_postings)