# NLP/count2.py (brighter-bee/NLP repository, master branch)

import docx2txt
import os
import sys
import re
import time
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def getPageCount(pdf_file):
	"""Return the number of pages in a PDF file.

	Args:
		pdf_file: Path of the PDF; it is opened in binary mode.

	Returns:
		The value of ``numPages`` on a ``PyPDF2.PdfFileReader`` built
		from the file.
	"""
	# ``with`` closes the handle on every path (the original never closed
	# it). The page count is read before leaving the block, while the
	# handle is still open.
	with open(pdf_file, 'rb') as pdfFileObj:
		return PyPDF2.PdfFileReader(pdfFileObj).numPages

def extractData(pdf_file, page):
	"""Return the text extracted from one page of a PDF file.

	Args:
		pdf_file: Path of the PDF; it is opened in binary mode.
		page: Page index passed straight to ``PdfFileReader.getPage``.

	Returns:
		Whatever ``extractText()`` returns for that page.
	"""
	# ``with`` closes the handle on every path (the original never closed
	# it). Extraction happens inside the block, while the handle is open.
	with open(pdf_file, 'rb') as pdfFileObj:
		pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
		return pdfReader.getPage(page).extractText()

def getWordCount(data):
	"""Return how many whitespace-separated tokens ``data`` contains."""
	return len(data.split())

def main():
	"""Check the command line, then print a resume/job-description match score.

	Exactly one command-line argument is expected: a file path that must
	exist. The file is only checked for existence; its contents are not
	read. The texts being compared are the hard-coded sample strings below.

	Exits with status 1 when the argument count is wrong or the file is
	missing.
	"""
	if len(sys.argv) != 2:
		print('command usage: python word_count.py FileName')
		sys.exit(1)

	pdfFile = sys.argv[1]

	# os.path.exists() reports a missing path by returning False, not by
	# raising, so the result has to be tested directly. The original
	# ignored it and carried on even when the file was absent.
	if not os.path.exists(pdfFile):
		print("file not found: " + pdfFile)
		sys.exit(1)
	print("file found!")

	# TODO: the PDF text (see getPageCount / extractData) is not wired into
	# the comparison yet; only the sample strings below are scored.

	# Kept function-local, as in the original, and placed after argument
	# validation so usage errors are reported without loading scikit-learn.
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	stop_words = set(stopwords.words('english'))

	resume = 'i am good in python'
	resume2 = 'this job requires proficiency in python and c++ and java'
	job_description = 'this job requires proficiency in python and c++ and java'

	# Only the first resume has its English stop words removed.
	resume = ' '.join(w for w in word_tokenize(resume) if w not in stop_words)
	print(resume)

	text = [job_description, resume, resume2]
	count_matrix = CountVectorizer().fit_transform(text)

	# Compute the pairwise similarities once and reuse them for both the
	# printed matrix and the headline percentage.
	similarity = cosine_similarity(count_matrix)

	print("\nSimilarity Scores:")
	print(similarity)

	# Row 0 is job_description and column 1 is the filtered resume,
	# following the order of ``text``.
	matchPercentage = round(similarity[0][1] * 100, 2)
	print("Your resume matches about "+ str(matchPercentage)+ "% of the project description.")

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
	main()