-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1.changeToVector.py
44 lines (33 loc) · 1.44 KB
/
1.changeToVector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from sklearn.feature_extraction.text import TfidfVectorizer
from mosestokenizer import MosesDetokenizer
from nltk.corpus import stopwords
#open to file in this directory
f1= open('/home/shuchita-rahman/Documents/thesis/f1', 'r')
f2= open('/home/shuchita-rahman/Documents/thesis/f2', 'r')
#put file data in data variable
mainData = f1.read()
checkSimilarityData = f2.read()
#remove stop word from this data(am, an etc.)
stop = set(stopwords.words('english'))
mainData = [word for word in mainData.split() if word not in stop]
checkSimilarityData= [word for word in checkSimilarityData.split() if word not in stop]
#data are tokenized to convert it to vector we must detokenize data or make it a string
detokenize = MosesDetokenizer('en')
mainData = detokenize(mainData)
checkSimilarityData = detokenize(checkSimilarityData)
#printing data whinch are now string
print(mainData)
print(checkSimilarityData)
#changing sentence to vector
tfidf = TfidfVectorizer()
fitDataOne = tfidf.fit_transform([mainData])
feature_names = tfidf.get_feature_names()
#for every word how much similar they are
for col in fitDataOne.nonzero()[1]:
print(feature_names[col], ' - ', fitDataOne[0, col])
fitDataTwo = tfidf.fit_transform([checkSimilarityData])
feature_names = tfidf.get_feature_names()
#feature_names = tfidf.get_feature_names(fitDataTwo)
print("\n-------------\n")
for col in fitDataTwo.nonzero()[1]:
print(feature_names[col], ' - ', fitDataTwo[0, col])