# count2.py
#
# Scores how closely a resume matches a job description by building
# bag-of-words count vectors and comparing them with cosine similarity.
# Also contains helpers for reading page counts and page text from a PDF.
import docx2txt
import os
import sys
import re
import time
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def getPageCount(pdf_file):
    """Return the number of pages in a PDF file.

    Args:
        pdf_file: Path of the PDF to inspect; it is opened in binary mode.

    Returns:
        The value of the reader's ``numPages`` attribute.

    Raises:
        OSError: If ``pdf_file`` cannot be opened.
    """
    # NOTE(review): ``PdfFileReader``/``numPages`` are the legacy PyPDF2
    # names -- confirm they are still supported by the installed version.
    # The context manager guarantees the handle is closed; previously the
    # file object was opened and never released.
    with open(pdf_file, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # Read the count while the underlying stream is still open.
        return pdfReader.numPages
def extractData(pdf_file, page):
    """Return the text extracted from one page of a PDF file.

    Args:
        pdf_file: Path of the PDF to read; it is opened in binary mode.
        page: Index of the page, passed straight to the reader's
            ``getPage`` (presumably zero-based -- confirm against PyPDF2).

    Returns:
        Whatever ``extractText()`` yields for the requested page.

    Raises:
        OSError: If ``pdf_file`` cannot be opened.
    """
    # NOTE(review): ``PdfFileReader``/``getPage``/``extractText`` are the
    # legacy PyPDF2 names -- confirm the installed version still has them.
    # The context manager guarantees the handle is closed; previously the
    # file object was opened and never released.
    with open(pdf_file, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        pageObj = pdfReader.getPage(page)
        # Extract inside the ``with`` so the stream is still readable.
        return pageObj.extractText()
def getWordCount(data):
    """Return how many whitespace-separated tokens ``data`` contains."""
    # ``split()`` with no argument splits on any run of whitespace and
    # drops empty strings, so blank input counts as zero words.
    return len(data.split())
def main():
    """Validate the command-line argument and print resume similarity scores.

    Expects exactly one command-line argument: the path of an existing
    file. Prints a usage message and exits with status 1 when the argument
    count is wrong, and prints an error and exits with status 1 when the
    path does not exist.

    After validation it compares hard-coded sample resume and job
    description strings: the first resume has English stop words removed,
    all three texts are turned into count vectors, and the pairwise cosine
    similarity matrix plus the job-description/resume match percentage are
    printed.
    """
    if len(sys.argv) != 2:
        print('command usage: python word_count.py FileName')
        sys.exit(1)

    pdfFile = sys.argv[1]
    # ``os.path.exists`` reports a missing path by returning False rather
    # than raising, so the result has to be checked explicitly.
    if not os.path.exists(pdfFile):
        print("file not found: " + pdfFile)
        sys.exit(1)
    print("file found!")

    # TODO: feed the PDF into the comparison. The disabled prototype looped
    # over range(getPageCount(pdfFile)), called extractData() per page,
    # summed getWordCount() and collected the split page texts. Until that
    # is restored, the sample strings below are what gets compared.

    # Imported here because the file does not import scikit-learn at
    # module level; it is only needed once the arguments are valid.
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    stop_words = set(stopwords.words('english'))
    resume = 'i am good in python'
    resume2 = 'this job requires proficiency in python and c++ and java'
    job_description = 'this job requires proficiency in python and c++ and java'

    # Drop stop words from the first resume before vectorising it.
    resume = ' '.join(w for w in word_tokenize(resume) if w not in stop_words)
    print(resume)

    text = [job_description, resume, resume2]
    count_matrix = CountVectorizer().fit_transform(text)

    # Compute the matrix once and reuse it for both the dump and the score.
    similarity = cosine_similarity(count_matrix)
    print("\nSimilarity Scores:")
    print(similarity)

    # Row 0 is the job description, column 1 the filtered resume.
    matchPercentage = round(similarity[0][1] * 100, 2)
    print("Your resume matches about "+ str(matchPercentage)+ "% of the project description.")
# Run the command-line flow only when this file is executed as a script,
# so importing the module for its helpers has no side effects.
if __name__ == '__main__':
    main()