-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhal_processSampleFullText.py
107 lines (78 loc) · 3.28 KB
/
hal_processSampleFullText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import pymongo
from libraries import nlprocessingFullText
from libraries import nlprocessing
from grobid.client import GrobidClient
import urllib.request
from bs4 import BeautifulSoup
import html
# Connect to MongoDB server
server = pymongo.MongoClient("mongodb://localhost:27017/")
db = server['hal']

# Optionally drop the previously computed per-domain statistics so the run
# starts from a clean 'primaryDomain' collection.
bd_del = True
if bd_del:
    col = db['primaryDomain']
    col.drop()

col = db['articles_w_files_cleaned_fr']

# Sample size derived from the standard formula n = z^2 * p(1-p) / e^2:
# n = (2,575)² x (0,5)(1-0,5) / (0,01)² = 16576.5625
# n = (2,575)² x (0,5)(1-0,5) / (0,02)² = 4144.140625
sample_size = 4500
progress = 0
sample = col.aggregate([{"$sample": {"size": sample_size}}])

domains = []        # per-primary-domain accumulators: count + summed match scores
errors_count = 0    # documents skipped (download failure or missing keywords/abstract/text)

# The GROBID client is loop-invariant: create it once instead of once per document.
client = GrobidClient("localhost", "8070")

for document in sample:
    progress += 1
    print('Processing ' + str(progress) + "/" + str(sample_size))
    url = document['files_s'][0]
    print(url)
    try:
        # Download the article's PDF to a temporary file for GROBID.
        response = urllib.request.urlopen(url)
        # Context manager guarantees the file is closed even if write() raises.
        with open("tmp/print.pdf", 'wb') as file:
            file.write(response.read())

        # Extract the full text (TEI/XML) from the PDF via GROBID.
        rsp, status = client.serve("processFulltextDocument", "tmp/print.pdf")
        soup = BeautifulSoup(rsp.content, 'lxml')
        fullText = soup.findAll(text=True)
        # Removing xml version tag
        fullText.pop(0)

        # Keep only non-empty text nodes, dropping GROBID's banner line.
        fullText_clean = []
        for piece in fullText:
            if html.unescape(piece).strip() != "" and piece != "GROBID - A machine learning software for extracting information from scholarly documents":
                fullText_clean.append(str(piece))

        keywords = document['fr_keyword_s']
        abstract = document['fr_abstract_s'][0]

        if len(keywords) > 0 and len(fullText_clean) > 0 and abstract != "":
            # Compute similarity between abstract and keywords
            document['abstract_match'] = nlprocessing.computeSimilarity(abstract, keywords, 'fr', True)
            # Compute similarity between fullText and keywords
            document['fullText_match'] = nlprocessingFullText.computeSimilarity(fullText_clean, keywords, 'fr', True)

            # Update domain's statistics
            domain_already_registered = False
            for domain in domains:
                if domain['primaryDomain'] == document['primaryDomain_s']:
                    domain_already_registered = True
                    domain['count'] += 1
                    domain['fullText_match'] += document['fullText_match']
                    domain['abstract_match'] += document['abstract_match']
                    break  # each primaryDomain appears at most once in `domains`
            if not domain_already_registered:
                domains.append({'primaryDomain': document['primaryDomain_s'],
                                'count': 1,
                                'fullText_match': document['fullText_match'],
                                'abstract_match': document['abstract_match']
                                })
        else:
            errors_count += 1
    # If can not retrieve the PDF file (or any per-document processing failure).
    # Narrowed from a bare `except:` so Ctrl-C / SystemExit still stop the run.
    except Exception:
        errors_count += 1
        print('Can not retrieve the PDF file')

# Turn the accumulated sums into per-domain averages and persist them.
for domain in domains:
    domain['lang'] = 'fr'
    domain['fullText_match'] = domain['fullText_match'] / domain['count']
    domain['abstract_match'] = domain['abstract_match'] / domain['count']
    col = db['primaryDomain']
    col.insert_one(domain)

print(errors_count)