# Text_Scrapper.py
import csv
import json
import pathlib
import os
import pandas as pd
from pandas.io.json import json_normalize
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1 import Features, CategoriesOptions, EntitiesOptions
# Input CSV: a header row, then rows of (URL, already-downloaded flag).
inFile = "ReadURL.csv"
# Output CSV that one record per analysed URL is appended to.
OUT_FILE = 'names.csv'
FIELDNAMES = ['Person', 'Email', 'Company', 'Location']
# Output column -> Watson NLU entity type whose text fills that column.
ENTITY_TYPE_BY_FIELD = {
    'Person': 'Person',
    'Email': 'EmailAddress',
    'Company': 'Company',
    'Location': 'Location',
}


def top_entity_text_by_type(entities):
    """Return ``{entity type: text}`` for the most relevant entity of each type.

    ``entities`` is the list of entity dicts found under the ``'entities'``
    key of the analyze() result; each is read for its ``'type'``,
    ``'relevance'`` and ``'text'`` keys. On a relevance tie the first entity
    seen wins. An empty or ``None`` input yields an empty dict.
    """
    best = {}
    for entity in entities or []:
        kind = entity.get('type')
        if kind is None:
            continue
        relevance = entity.get('relevance', 0.0)
        # Strict '>' keeps the earliest entity when relevances are equal.
        if kind not in best or relevance > best[kind][0]:
            best[kind] = (relevance, entity.get('text', ''))
    return {kind: text for kind, (_, text) in best.items()}


def build_record(entities):
    """Build one output row (keyed by FIELDNAMES) from a list of entities.

    A column whose entity type was not detected is left as an empty string
    rather than a placeholder, so the CSV stays clean.
    """
    best = top_entity_text_by_type(entities)
    return {field: best.get(kind, '') for field, kind in ENTITY_TYPE_BY_FIELD.items()}


def append_record(record, path=OUT_FILE):
    """Append ``record`` to the CSV at ``path``, writing the header if the file is new."""
    file_exists = os.path.isfile(path)
    # newline='' is what the csv module expects; without it Windows output
    # gains a blank line after every row.
    with open(path, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
        if not file_exists:
            writer.writeheader()
        writer.writerow(record)


def make_nlu_client():
    """Create the Watson Natural Language Understanding client.

    The API key and service URL can be supplied through the
    ``WATSON_NLU_APIKEY`` and ``WATSON_NLU_URL`` environment variables.
    """
    # NOTE(security): the literal fallbacks below are kept only so existing
    # runs keep working. A credential committed to source should be rotated
    # and then removed from this file.
    api_key = os.environ.get(
        'WATSON_NLU_APIKEY', 'G2CkuNwMzqPHtByDgOMTmMIVjJeF1GzT-RCMHvGslddR')
    service_url = os.environ.get(
        'WATSON_NLU_URL',
        'https://api.au-syd.natural-language-understanding.watson.cloud.ibm.com/instances/79d13f1f-8a18-4625-8d46-79bc17cdfd20')
    client = NaturalLanguageUnderstandingV1(
        version='2020-08-01',
        authenticator=IAMAuthenticator(api_key),
    )
    client.set_service_url(service_url)
    return client


def main():
    """Analyse every not-yet-downloaded URL in ``inFile`` and append the results.

    The first row of the input is treated as a header and skipped. A row is
    processed unless its second column is exactly ``"Yes"``. For each
    processed URL the top Person, EmailAddress, Company and Location entities
    are appended to ``names.csv``.
    """
    with open(inFile, 'r', newline='') as read_obj:
        csv_reader = csv.reader(read_obj)
        # Default of None avoids StopIteration on a completely empty file.
        header = next(csv_reader, None)
        if header is None:
            return
        client = None
        for col in csv_reader:
            if not col:
                # csv.reader yields [] for blank lines; nothing to analyse.
                continue
            inURL = col[0]
            # A missing flag column is treated as "not downloaded yet".
            already_downloaded = col[1] if len(col) > 1 else ''
            if already_downloaded == "Yes":
                continue
            if client is None:
                # Built once, on first use, instead of once per row.
                client = make_nlu_client()
            response = client.analyze(
                url=inURL,
                features=Features(entities=EntitiesOptions(sentiment=False)),
                clean=False,
            ).get_result()
            append_record(build_record(response.get('entities', [])))


if __name__ == "__main__":
    main()