-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdirect Abstract and Keywords Extraction.py
82 lines (70 loc) · 2.6 KB
/
direct Abstract and Keywords Extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
## This script assumes that the "abstract" and "keywords" headings
# exist in the paper
from tika import parser
import time
import os.path
# Wall-clock start time, captured when the module is loaded. The __main__
# block subtracts it from the end time, so the reported total also includes
# the time spent waiting at the input prompt.
tStart = time.time()
# read file and return all content
def rFile(fPath):
    """Parse a document with Tika and return its extracted text content.

    Args:
        fPath: Path of the file handed to ``parser.from_file``.

    Returns:
        The value stored under the ``"content"`` key of the parse result.
        NOTE(review): Tika may report ``None`` here when no text could be
        extracted -- callers should be prepared for that; confirm against
        the tika-python documentation.
    """
    # parser is the Tika package module imported at the top of the file
    parsed = parser.from_file(fPath)
    # Only the text is needed; the metadata entry of the result is not used.
    return parsed["content"]
def getString(contS):
    """Collect keyword lines and abstract lines from a document's lines.

    Args:
        contS: Sequence of strings, one per line of the extracted text.

    Returns:
        A ``(keyWd, abstr)`` tuple of lists:

        * ``keyWd`` -- every line that contains ``"Keywords:"``.
        * ``abstr`` -- for every line that contains ``"ABSTRACT"`` or
          ``"Abstract—"``, the lines starting at the first non-blank line
          after that heading and running until a line that is exactly a
          single space (or the end of the input). Text on the heading line
          itself is not included. A heading with no non-blank line after it
          contributes nothing.
    """
    keyWd, abstr = [], []

    for pos, line in enumerate(contS):
        if "Keywords:" in line:
            keyWd.append(line)

        if "ABSTRACT" in line or "Abstract—" in line:
            # Track positions with enumerate instead of list.index(): index()
            # returns the FIRST equal line in the whole list, which points at
            # the wrong place whenever a line is repeated in the document.
            start = None
            for nxt, candidate in enumerate(contS[pos + 1:], start=pos + 1):
                if candidate != '' and candidate != ' ':
                    start = nxt
                    break

            # Heading was the last meaningful line: nothing to collect.
            if start is None:
                continue

            # Gather the abstract until the single-space separator line.
            # Empty strings do not end the abstract, only ' ' does.
            for text in contS[start:]:
                if text == ' ':
                    break
                abstr.append(text)

    return keyWd, abstr
if __name__ == "__main__":
    # Script entry point: ask for a file path, extract its text with Tika,
    # then print the keyword line(s) and the abstract found in that text,
    # followed by the elapsed time since tStart.
    fname = input("Input your file path: ")
    # e.g. data/USING INFORMATION FROM RENDEZVOUS MISSIONS FOR BEST-CASE APPRAISALS OF IMPACT DAMAGE TO PLANET EARTH .pdf
    # isfile() rather than exists(): a directory path exists but cannot be
    # parsed as a document.
    if os.path.isfile(fname):
        # read through the file and extract its content using the Tika package
        content = rFile(fname)
        # The extracted content may be None when no text was found (assumed
        # Tika behaviour); fall back to an empty string so split() does not
        # raise and the "not found" messages below are printed instead.
        contSplit = (content or "").split("\n")
        texts = getString(contSplit)
        # merge the collected lines into single strings
        keyW = ' '.join(texts[0])
        strAbstract = ' '.join(texts[1])
        if keyW == '':
            print("no keywords found")
        else:
            print(keyW)
        if strAbstract == '':
            print("no abstract found")
        else:
            print("Abstract:", strAbstract)
    else:
        print("No such file")
    tEnd = time.time()
    print("\nTotal time: ", tEnd - tStart, "seconds")