-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathga.py
48 lines (34 loc) · 1.09 KB
/
ga.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import PyPDF2
import re
import openpyxl
import pandas as pd
# Input PDF scanned for parenthesised "(T ...)" references.
PDF_PATH = '/Users/ramamohanraoveeramachaneni/Downloads/ga/533Cottrell.pdf'
# Workbook holding rows from earlier runs (read only).
PREVIOUS_OUTPUT_PATH = "/Users/ramamohanraoveeramachaneni/Downloads/Output.xlsx"
# NOTE(review): the result is written relative to the current working
# directory while the previous rows are read from the absolute path above.
# The two only coincide when the script is run from that Downloads folder --
# confirm this is intended.
OUTPUT_PATH = "Output.xlsx"
# Value written to the "Name_of _Paper" column of every new row.
PAPER_TITLE = "Minds, Composition, and Hume’s Skepticism in the Appendix."

# "(T...; SBN...)" on one line, allowing newlines/spaces between ';' and 'SBN'.
LONG_REF_PATTERN = re.compile(r'\(T.+;\n*\ *SBN.+\)')
# Shortest "(T ...)" span; re.S lets it run across line breaks.
SHORT_REF_PATTERN = re.compile(r'\(T .*?\)', re.S)


def find_references(page_text):
    """Return ``(long_refs, short_refs)`` found in one page of text.

    ``long_refs`` are the matches of ``LONG_REF_PATTERN`` and ``short_refs``
    the matches of ``SHORT_REF_PATTERN``, each in order of appearance.
    """
    return (
        LONG_REF_PATTERN.findall(page_text),
        SHORT_REF_PATTERN.findall(page_text),
    )


def extract_references(pdf_path):
    """Collect long and short reference matches from every page of a PDF.

    Prints the page count, then returns ``(long_refs, short_refs)`` with the
    matches of all pages concatenated in page order.
    """
    long_refs = []
    short_refs = []
    # The context manager closes the file even if PDF parsing raises.
    with open(pdf_path, 'rb') as pdf_file:
        # NOTE: PdfFileReader / numPages / getPage / extractText are the
        # legacy PyPDF2 names; keep them in step with the installed version.
        reader = PyPDF2.PdfFileReader(pdf_file)
        print(reader.numPages)
        for page_number in range(reader.numPages):
            # Extract the text once and run both patterns over it.
            page_text = reader.getPage(page_number).extractText()
            page_long, page_short = find_references(page_text)
            long_refs.extend(page_long)
            short_refs.extend(page_short)
    return long_refs, short_refs


def merge_references(long_refs, short_refs):
    """Return ``long_refs`` followed by the short refs not already present.

    A short ref that is already in the merged list is skipped.  Otherwise a
    ref containing a newline is split on ``"\\n"`` and all of its pieces are
    added (pieces are not de-duplicated); a ref without one is added whole.
    The input lists are not modified.
    """
    merged = list(long_refs)
    # Mirrors ``merged`` so the membership test is O(1) instead of a list scan.
    seen = set(merged)
    for ref in short_refs:
        if ref in seen:
            continue
        pieces = ref.split("\n") if "\n" in ref else [ref]
        merged.extend(pieces)
        seen.update(pieces)
    return merged


def build_frame(suffixes, paper_title=PAPER_TITLE):
    """Build the output table: one row per suffix plus a constant title column."""
    frame = pd.DataFrame(suffixes, columns=["Suffix"])
    # A scalar assignment broadcasts the title to every row.
    frame["Name_of _Paper"] = paper_title
    return frame


def load_previous(path):
    """Read the earlier workbook, or return ``None`` when it does not exist.

    Only a missing file is treated as "no previous data"; any other failure
    (unreadable workbook, missing Excel engine, ...) propagates to the caller.
    """
    try:
        return pd.read_excel(path, index_col=0)
    except FileNotFoundError:
        print("No such file exists")
        return None


def combine_with_previous(previous, new_rows):
    """Append ``new_rows`` below ``previous``; with no previous data return ``new_rows``."""
    if previous is None:
        return new_rows
    return pd.concat([previous, new_rows])


def main():
    """Extract references from the PDF and append them to the Excel output."""
    long_refs, short_refs = extract_references(PDF_PATH)
    df = build_frame(merge_references(long_refs, short_refs))
    print(df)
    result = combine_with_previous(load_previous(PREVIOUS_OUTPUT_PATH), df)
    result.to_excel(OUTPUT_PATH)


if __name__ == "__main__":
    main()