forked from pablobarbera/pytwools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexport-network-data.py
129 lines (107 loc) · 4.26 KB
/
export-network-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
'''
export-network-data.py
Exports nodes and edges (built from either retweets or mentions) from
tweets stored in json format, and saves them in a file format
compatible with Gephi
Note: it only extracts automatic retweets
@p_barbera
Usage:
### extract retweet nodes and edges
python export-network-data.py -f tweets.json -et retweets -oe edges.csv -on nodes.csv
### extract mention edges
python export-network-data.py -f tweets.json -et mentions -oe edges.csv -on nodes.csv
Variable names
- sender_id = user id of user who sends the retweet/mention (retweeter)
- sender_name = screen name for that user
- receiver_id = user id of user who receives the retweet/mention (retweeted/mentioned)
- receiver_name = screen name for that user
'''
import sys
import json
import argparse
import re
from datetime import datetime
# command-line interface: one input file, two output files and the
# kind of edge to extract
parser = argparse.ArgumentParser()
parser.add_argument(
    '-f', '--file',
    required=True,
    help='name of file with tweets in json format',
)
parser.add_argument(
    '-oe', '--outputedges',
    default='edges.csv',
    help='name of file where list of edges will be saved',
)
parser.add_argument(
    '-on', '--outputnodes',
    default='nodes.csv',
    help='name of file where list of nodes will be saved',
)
parser.add_argument(
    '-et', '--edgetype',
    choices=['retweets', 'mentions'],
    help='type of edge to extract',
)
args = parser.parse_args()

# shortcuts for the parsed values; --edgetype is optional, so `edges`
# is None when the flag is not given
tweetfile = args.file
edges = args.edgetype
def export_retweets(tweetfile, outputedges, outputnodes):
    '''
    Export the retweet network found in a file of tweets.

    Reads `tweetfile` line by line, expecting one JSON document per
    line. For every document that has a 'retweeted_status' key, one edge
    "retweeter id,retweeted id,time" is written to `outputedges`. After
    the whole input has been read, one row per user seen in an edge
    ("id,screen name,followers,lang") is written to `outputnodes`, in
    order of first appearance. If a user appears more than once, the
    values from the last appearance are kept.

    Lines that are not valid JSON, or that decode to something other
    than a JSON object, are skipped.

    Arguments:
    - tweetfile = path of the input file with tweets in json format
    - outputedges = path of the edges file (header: Source,Target,Time)
    - outputnodes = path of the nodes file (header: Id,Label,Followers,Lang)

    Raises KeyError / ValueError if a retweet lacks one of the fields
    read below or its 'created_at' does not match the expected format.
    '''
    time_format = '%a %b %d %H:%M:%S +0000 %Y'
    node_format = "{0},{1},{2},{3}"
    user_data = {}

    # `with` guarantees that all three files are closed, including the
    # input file and including when an exception is raised midway
    with open(tweetfile, 'r') as fh, \
            open(outputedges, 'w') as oute, \
            open(outputnodes, 'w') as outn:
        oute.write('Source,Target,Time\n')
        outn.write('Id,Label,Followers,Lang\n')

        for line in fh:
            try:
                tweet = json.loads(line)
            except ValueError:
                # blank or malformed line (JSON decode errors are ValueErrors)
                continue
            # valid JSON that is not an object (e.g. a bare number) cannot
            # be a tweet and would break the key lookups below
            if not isinstance(tweet, dict):
                continue
            if 'retweeted_status' not in tweet:
                continue

            sender = tweet['user']
            receiver = tweet['retweeted_status']['user']
            created = datetime.strptime(tweet['created_at'], time_format)
            oute.write(','.join(
                [sender['id_str'], receiver['id_str'], str(created)]) + "\n")

            # sender first, then receiver: keeps the node order stable
            for user in (sender, receiver):
                user_data[user['id_str']] = node_format.format(
                    user['id_str'],
                    user['screen_name'],
                    user['followers_count'],
                    user['lang'])

        for user_string in user_data.values():
            outn.write('{0}\n'.format(user_string))
def export_mentions(tweetfile, outputedges, outputnodes):
    '''
    Export the mention network found in a file of tweets.

    Reads `tweetfile` line by line, expecting one JSON document per
    line. For every entry in a document's entities -> user_mentions
    list, one edge "sender id,mentioned id,time" is written to
    `outputedges`. After the whole input has been read, one row per user
    seen in an edge is written to `outputnodes`, in order of first
    appearance. Senders get "id,screen name,followers,lang"; users that
    only ever appear as a mention get 'NA' for followers and lang,
    because the mention entry is only read for id and screen name.

    Lines that are not valid JSON, that decode to something other than a
    JSON object, or that have no 'entities' / 'user_mentions' (for
    example delete or limit notices mixed into a stream dump) are
    skipped instead of aborting the export.

    Arguments:
    - tweetfile = path of the input file with tweets in json format
    - outputedges = path of the edges file (header: Source,Target,Time)
    - outputnodes = path of the nodes file (header: Id,Label,Followers,Lang)

    Raises KeyError / ValueError if a tweet with mentions lacks one of
    the fields read below or its 'created_at' does not match the
    expected format.
    '''
    time_format = '%a %b %d %H:%M:%S +0000 %Y'
    node_format = "{0},{1},{2},{3}"
    user_data = {}

    # `with` guarantees that all three files are closed, including the
    # input file and including when an exception is raised midway
    with open(tweetfile, 'r') as fh, \
            open(outputedges, 'w') as oute, \
            open(outputnodes, 'w') as outn:
        oute.write('Source,Target,Time\n')
        outn.write('Id,Label,Followers,Lang\n')

        for line in fh:
            try:
                tweet = json.loads(line)
            except ValueError:
                # blank or malformed line (JSON decode errors are ValueErrors)
                continue
            if not isinstance(tweet, dict):
                continue
            # documents without entities/user_mentions are treated like
            # tweets with no mentions
            entities = tweet.get('entities') or {}
            mentions = entities.get('user_mentions') or []
            if not mentions:
                continue

            # sender data and timestamp are the same for every mention in
            # the tweet, so compute them once
            sender = tweet['user']
            sender_id = sender['id_str']
            created = str(datetime.strptime(tweet['created_at'], time_format))
            # full sender data always replaces an earlier 'NA' placeholder
            user_data[sender_id] = node_format.format(
                sender_id,
                sender['screen_name'],
                sender['followers_count'],
                sender['lang'])

            for mention in mentions:
                oute.write(','.join(
                    [sender_id, mention['id_str'], created]) + "\n")
                # never overwrite data already collected for this user
                if mention['id_str'] not in user_data:
                    user_data[mention['id_str']] = node_format.format(
                        mention['id_str'],
                        mention['screen_name'],
                        'NA',
                        'NA')

        for user_string in user_data.values():
            outn.write('{0}\n'.format(user_string))
# run the exporter that matches --edgetype; when the flag was not given
# (args.edgetype is None) nothing is exported
_exporters = {
    'retweets': export_retweets,
    'mentions': export_mentions,
}
_export = _exporters.get(args.edgetype)
if _export is not None:
    _export(tweetfile, args.outputedges, args.outputnodes)