-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnaiveOrderer.py
More file actions
50 lines (45 loc) · 2.08 KB
/
naiveOrderer.py
File metadata and controls
50 lines (45 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
import re
import nltk
#order query with sectiondepths, get progress in bar
def goOrderNaive(sectiondepths, query, bar):
    """Order the files stored in the ``<cwd>/<query>/`` directory.

    The directory path is derived from the current working directory and
    ``query``; the actual work is delegated to ``orderNaive``.

    Args:
        sectiondepths: Depths forwarded unchanged to ``orderNaive``.
        query: Name of the sub-directory (below the current working
            directory) that holds the files to order.
        bar: Progress object forwarded unchanged to ``orderNaive``.

    Returns:
        Whatever ``orderNaive`` returns (the lookup dicts).
    """
    # The trailing empty element yields the trailing "/" that orderNaive
    # relies on when it appends file names to the path.
    corpusDir = "/".join((os.getcwd(), query, ""))
    return orderNaive(sectiondepths, query, corpusDir, bar)
def orderNaive(depths, query, path, bar):
    """Build next-word lookup tables from every file found in ``path``.

    Each file is read once and split into sections at line breaks.  Section
    ``i`` is tokenized with ``nltk.tokenize.word_tokenize``, padded with
    ``depths[i]`` ``'$START$'`` markers in front and one ``'$END$'`` marker at
    the end, and every word is recorded in table ``i`` under the tuple of the
    ``depths[i]`` words preceding it.

    Args:
        depths: Sequence of context lengths, one per section; ``depths[i]``
            is the number of preceding words used as key for section ``i``.
        query: Not used by this function; kept so existing callers keep
            working.
        path: Directory containing the files.  File names are appended
            directly, so it must end with a path separator.
        bar: Optional progress object.  When not ``None`` its ``setValue`` is
            called with a percentage while ordering and with ``100`` at the
            end.

    Returns:
        A tuple of dicts, one per entry in ``depths`` (never fewer than two).
        Each dict maps a tuple of preceding words (nearest word first) to the
        list of words observed after that context, duplicates included.

    Raises:
        IndexError: If a file contains fewer sections (lines) than there are
            entries in ``depths``.
    """
    # Historically exactly two tables were returned, which crashed for more
    # than two depths.  Keep two as the minimum so existing callers still get
    # the same shape, but grow when more depths are supplied.
    dicts = tuple(dict() for _ in range(max(2, len(depths))))

    # List the directory once so the loop and the estimate cannot disagree.
    fileNames = os.listdir(path)
    allFiles = len(fileNames)  # only used to give a time estimate

    for counter, fileName in enumerate(fileNames):
        # The context manager closes the handle even if reading fails.
        with open(path + fileName, 'r', encoding="utf-8") as f:
            sections = f.read().split("\n")  # sections are split by linebreaks

        for sectionIndex, depth in enumerate(depths):
            currentDict = dicts[sectionIndex]
            words = nltk.tokenize.word_tokenize(sections[sectionIndex])
            words = ['$START$'] * depth + words + ['$END$']
            # Visit every real word (and the end marker), skipping the
            # start markers that only exist to provide context.
            for position in range(depth, len(words)):
                # Key: the previous depth-many words, nearest first.
                key = tuple(words[position - r] for r in range(1, depth + 1))
                currentDict.setdefault(key, []).append(words[position])

        if counter % 10 == 0:  # don't report for every file
            print('@#ordered ' + str(counter + 1) + ' of ' + str(allFiles + 1))
            if bar is not None:
                bar.setValue((counter + 1) / (allFiles + 1) * 100)

    print('@@ordering done')
    if bar is not None:
        bar.setValue(100)
    return dicts