# news_image_scraping.py
# Forked from alessandro-piscopo/Nazca_DMI2015.
import urllib
import pandas as pd
import csv as csv
# Download every image referenced in the Nazca news dataset and write an
# index of the saved files (file name, date, article URL, engagement) to
# nazca_pictures.csv.

# urlretrieve moved from urllib (Python 2) to urllib.request (Python 3);
# resolve it once here so the script runs on either interpreter.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# load the dataset
nazca_news = pd.read_csv('nazca_news_1401.csv', header=0)

# create arrays
images_links = nazca_news['images.url']
dates = nazca_news['published']
images_place = nazca_news['url']
images_engagement = nazca_news['engagement']
images_dated = zip(images_links, dates, images_place, images_engagement)

# download images
image_dated_partial = images_dated
picture_name_date = []
errors = []  # (link, error message) for every download that failed
counter = 0  # number of images saved so far; also names the files
for link, published, place, engagement in image_dated_partial:
    if not link:
        # Falsy link (e.g. an empty string): keep the row but flag it as
        # having no image. The row has to be ONE tuple, because
        # list.append accepts exactly one argument.
        picture_name_date.append(('none', published, place, engagement))
        continue
    if not isinstance(link, str):
        # Non-string links are skipped entirely, as before -- presumably
        # these are the NaN values pandas produces for empty cells
        # (TODO confirm against the dataset).
        continue
    filename = str(counter + 1) + ".jpg"
    try:
        urlretrieve(link, filename)
    except (IOError, ValueError) as exc:
        # A dead or malformed link must not abort the whole run and lose
        # the index; remember it and move on. The counter is not advanced,
        # so the next successful download reuses (and overwrites) this
        # file name and the numbering stays gap-free.
        errors.append((link, str(exc)))
        continue
    counter += 1
    picture_name_date.append((filename, published, place, engagement))

# report failed downloads so they can be retried by hand
if errors:
    print('%d image(s) could not be downloaded:' % len(errors))
    for link, message in errors:
        print('  %s -> %s' % (link, message))

# save picture_name_date list
# The csv module wants binary mode on Python 2 and newline='' on Python 3;
# otherwise blank lines appear between rows on Windows.
if str is bytes:
    open_kwargs = {'mode': 'wb'}
else:
    open_kwargs = {'mode': 'w', 'newline': ''}
with open('nazca_pictures.csv', **open_kwargs) as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['img_name', 'date', 'url', 'engagement'])
    csv_out.writerows(picture_name_date)