Commit

Dataset update

zzbn12345 committed May 17, 2022
1 parent 5ab03c4 commit 1f51fd6
Showing 45 changed files with 431,290 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -1,2 +1,7 @@

HeriGraph_Dataset.zip
dataset/Amsterdam/Edge_List.csv
dataset/Suzhou/Edge_List.csv
dataset/Venice-XL/Textual_Features.csv
dataset/Venice-XL/Visual_Features.csv
dataset/Venice/Edge_List.csv
15,898 changes: 15,898 additions & 0 deletions Amsterdam/data_storage/GEO_metadata.csv

946 changes: 946 additions & 0 deletions Amsterdam/data_storage/GEO_node_dist.csv

3,728 changes: 3,728 additions & 0 deletions Amsterdam/data_storage/metadata_compare.csv

9,600 changes: 9,600 additions & 0 deletions Amsterdam/data_storage/sentences.csv

9,600 changes: 9,600 additions & 0 deletions Amsterdam/data_storage/sentences_compare.csv

38,026 changes: 38,026 additions & 0 deletions Amsterdam/data_storage/social_links.csv

1,717 changes: 1,717 additions & 0 deletions Amsterdam/data_storage/tags_embedding.csv

162 changes: 162 additions & 0 deletions Amsterdam/save_image.py
@@ -0,0 +1,162 @@
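"""Scrape geotagged Flickr photos around Amsterdam for the HeriGraph dataset.

Pipeline: collect photo ids per grid cell (collect_ids), fetch per-photo size
and metadata records and download the 150px/320px variants (get_photos), and
checkpoint progress so an interrupted run can resume (main).
"""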
import json
import os
import pickle
import urllib.request  # plain `import urllib` does not expose urllib.request

import flickrapi
import pandas as pd
from argparse import Namespace

args = Namespace(
    # Data and Path information
    api_key = u'[api_key]',        # placeholder: fill in a Flickr API key
    api_secret = u'[api_secret]',  # placeholder: fill in the matching secret
    radius = 2,                    # search radius (km) around each grid point
    save_dir = 'data_storage/',
    tags = None,
    len_grid = 1,                  # grid is len_grid x len_grid cells
    image_dir = 'data_storage/images/'
)

def get_latlon(id_x, id_y, num = args.len_grid):
    # With len_grid == 1 there is a single grid cell, so fixed central
    # Amsterdam coordinates are returned regardless of id_x/id_y.
    lat = 52.365
    lon = 4.887777778
    return lat, lon

def collect_ids(flickr, lat, lon, radius, x, y, tags = None):
    # Resume from an existing id list for this grid cell, if one was saved.
    fname = 'photo_ids_{}_{}.csv'.format(x, y)
    if fname in os.listdir(args.save_dir):
        Ids = pd.read_csv(args.save_dir + fname, sep='\t')['ids'].astype(str).unique().tolist()
    else:
        Ids = []

    # flickr.walk pages through all geotagged photos within `radius` km of (lat, lon).
    walk = flickr.walk(has_geo = 1, lat = lat, lon = lon, radius = radius, tags = tags)
    for photo in walk:
        id_now = photo.get('id')
        if id_now in Ids:
            continue
        Ids.append(id_now)
        # Checkpoint the id list every 200 new photos; cap each cell at 5000 ids.
        if len(Ids) % 200 == 0:
            print('{} photo ids collected'.format(len(Ids)))
            pd.Series(Ids, name = 'ids').to_csv(args.save_dir + fname, index=False)
        if len(Ids) >= 5000:
            return Ids
    pd.Series(Ids, name = 'ids').to_csv(args.save_dir + fname, index=False)
    return Ids

def update_df(Photos):
    # No-op placeholder: returns the photo dictionary unchanged.
    return Photos


def get_photos(flickr, Photos, Ids):
    Photos = {str(k): v for k, v in Photos.items()}
    processed = Photos.keys()   # live view: grows as new ids are added to Photos
    print(len(processed))
    for id_now in Ids:
        if id_now in processed:
            continue
        Photos[id_now] = {}
        try:
            # API calls inside the try so deleted or private photos are skipped.
            sizes = json.loads(flickr.photos.getSizes(photo_id = id_now, format='json'))
            info = json.loads(flickr.photos.getInfo(photo_id = id_now, format='json'))

            # Fixed indices into the size list: 1 is the 150px square ('q'),
            # 4 the 320px variant ('n'), 8 a larger variant, -1 the largest.
            url_c = sizes['sizes']['size'][8]['source']
            url_q = sizes['sizes']['size'][1]['source']
            url_n = sizes['sizes']['size'][4]['source']
            url_largest = sizes['sizes']['size'][-1]['source']
            can = sizes['sizes']['candownload']
            Photos[id_now]['candownload'] = can
            Photos[id_now]['url_c'] = url_c
            Photos[id_now]['url_q'] = url_q
            Photos[id_now]['url_n'] = url_n
            Photos[id_now]['url_largest'] = url_largest
            Photos[id_now]['others'] = sizes
            Photos[id_now]['info'] = info
            Photos[id_now]['owner'] = info['photo']['owner']['nsid']
            Photos[id_now]['owner_loc'] = info['photo']['owner']['location']
            Photos[id_now]['title'] = info['photo']['title']['_content']
            Photos[id_now]['description'] = info['photo']['description']['_content']
            Photos[id_now]['comments'] = info['photo']['comments']['_content']
            Photos[id_now]['taken'] = info['photo']['dates']['taken']
            Photos[id_now]['views'] = info['photo']['views']
            Photos[id_now]['people'] = info['photo']['people']['haspeople']
            Photos[id_now]['tags'] = info['photo']['tags']['tag']
            Photos[id_now]['lat'] = info['photo']['location']['latitude']
            Photos[id_now]['lon'] = info['photo']['location']['longitude']
            Photos[id_now]['neighbourhood'] = info['photo']['location']['neighbourhood']['_content']
            Photos[id_now]['url'] = info['photo']['urls']['url'][0]['_content']

            # Download the two thumbnail sizes when the licence allows it.
            if can:
                urllib.request.urlretrieve(url_q, args.image_dir + '150/{}.jpg'.format(id_now))
                urllib.request.urlretrieve(url_n, args.image_dir + '320/{}.jpg'.format(id_now))

            # Periodic checkpoint, roughly every 20 photos.
            if len(processed) % 20 == 1:
                print('{}/{} photos collected'.format(len(processed), len(Ids)))
                with open(args.image_dir + 'Photo_sizes_pre_sep.p', 'wb') as fp:
                    pickle.dump(Photos, fp, protocol=pickle.HIGHEST_PROTOCOL)
                with open(args.image_dir + 'Photo_sizes.p', 'wb') as fp:
                    pickle.dump(Photos, fp, protocol=pickle.HIGHEST_PROTOCOL)
                photo_df = pd.DataFrame(Photos).T.drop(['others', 'info'], axis=1)
                photo_df.to_csv(args.image_dir + 'photos_sizes.csv', sep='\t', encoding='utf-8-sig')
        except Exception as e:
            print(e)
            print(id_now)
            continue

    # Final save once the whole id list has been processed.
    with open(args.image_dir + 'Photo_sizes_pre.p', 'wb') as fp:
        pickle.dump(Photos, fp, protocol=pickle.HIGHEST_PROTOCOL)
    with open(args.image_dir + 'Photo_sizes.p', 'wb') as fp:
        pickle.dump(Photos, fp, protocol=pickle.HIGHEST_PROTOCOL)
    photo_df = pd.DataFrame(Photos).T.drop(['others', 'info'], axis=1)
    photo_df.to_csv(args.image_dir + 'photos_sizes.csv', sep='\t', encoding='utf-8-sig')
    return Photos

def main():

    flickr = flickrapi.FlickrAPI(args.api_key, args.api_secret)

    # Make sure the output directories exist before any download.
    os.makedirs(args.image_dir + '150/', exist_ok=True)
    os.makedirs(args.image_dir + '320/', exist_ok=True)

    # Resume bookkeeping: completed.p records which grid cells are done.
    if 'completed.p' in os.listdir(args.save_dir):
        with open(args.save_dir + 'completed.p', 'rb') as fp:
            completed = pickle.load(fp)
    else:
        completed = {}

    if 'Photo_sizes.p' in os.listdir(args.image_dir):
        with open(args.image_dir + 'Photo_sizes.p', 'rb') as fp:
            Photos = pickle.load(fp)
    else:
        Photos = {}

    for x in range(args.len_grid):
        for y in range(args.len_grid):
            if (x, y) in completed:
                continue

            lat, lon = get_latlon(x, y)
            Ids = collect_ids(flickr, lat, lon, args.radius, tags=args.tags, x=x, y=y)
            Photos = get_photos(flickr, Photos, Ids)

            completed[(x, y)] = {
                'lat': lat,
                'lon': lon,
                'collected': len(Ids),
                'total': len(Photos),
            }
            with open(args.save_dir + 'completed.p', 'wb') as fp:
                pickle.dump(completed, fp, protocol=pickle.HIGHEST_PROTOCOL)

    completed_df = pd.DataFrame(completed).T
    completed_df.to_csv(args.save_dir + 'completed.csv')


if __name__ == "__main__":
    main()
"""## END"""
Loading

0 comments on commit 1f51fd6

Please sign in to comment.