twitter_stream.py
'''The overall flow of the Twitter extraction follows a DataQuest blog post, but the code itself
was written independently. Reference: https://www.dataquest.io/blog/streaming-data-python/'''
import pymongo as py
import tweepy as tw
import re
# variables that contain my user credentials to access the Twitter API
consumer_key = 'LGJPU01uuK4NIqYgr4aoVvahZ'
consumer_secret = 'Y0RWWR11J9IB1qcbYNkUxFdi5x2VHW1OuS2Ls9Shd7X2qGvofJ'
access_token = '302138681-q41ccKrd6fEaHRjsgnSTHzJKvtQQ3f9vPd20Vyyb'
access_token_secret = 'AC0vbT5aVvvJlPNyZC0cUzaZw3Bwfl58atCT658TVaKBF'
# setting up tweepy to authenticate with Twitter
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# create API object to pull data from Twitter
api = tw.API(auth)
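# Optional sanity check (a sketch, not part of the original script): in tweepy 3.x,
# API.verify_credentials() returns the authenticated user on success and a falsy value
# when the credentials are rejected, so it can be used to fail fast before streaming.
# Uncomment to enable.
# if not api.verify_credentials():
#     raise SystemExit('Twitter authentication failed; check the credentials above')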
# setup db and collection for MongoDB
client = py.MongoClient('mongodb://127.0.0.1:27017/')
db = client['Asgmt3']
collection = db['twitter_stream']
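# Optional (a sketch, an assumption about later querying needs): an index on the handle
# field speeds up lookups once the tweets are stored. Uncomment if useful.
# collection.create_index('twitter_handle')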
x = []  # collected tweet documents, inserted into MongoDB at the end of the script
# search keywords
search_terms = ['Canada', 'University', 'Dalhousie University', 'Halifax', 'Canada Education']
max_tweets = 3000
# clean emoji logic, regex from https://programmersought.com/article/15261349366/
# (the brackets of the character class must not be escaped, otherwise the pattern
# matches nothing and no emojis are removed)
def clean_emoji(string):
    if string is None:
        return string
    pattern = re.compile("["
                         u"\U0001F600-\U0001F64F"  # emoticons
                         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                         u"\U0001F680-\U0001F6FF"  # transport & map symbols
                         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                         u"\U00002702-\U000027B0"
                         u"\U000024C2-\U0001F251"
                         "]+", flags=re.UNICODE)
    return pattern.sub(r'', string)
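# Quick illustration of clean_emoji (a sketch; the input below is only an example):
# clean_emoji(u'Nice day \U0001F600')  ->  'Nice day '
# clean_emoji(None)                    ->  None (None fields are passed through unchanged)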
# Creating a StreamListener
class StreamListener(tw.StreamListener):
    """
    Logic to stop collecting tweets after a fixed count.
    Reference: https://stackoverflow.com/questions/38297150/twitter-streaming-stop-collecting-data
    """
    def __init__(self, api=None):
        print("Streaming Tweets...")
        super(StreamListener, self).__init__()
        # initializes the counter
        self.count = 0
    def on_status(self, status):
        tweetDetail = {'description': status.user.description, 'twitter_handle': status.user.screen_name,
                       'tweet_date': status.created_at.strftime("%d-%b-%Y"),
                       'location': status.user.location, 'name': status.user.name,
                       'following': status.user.friends_count,
                       'tweet': status.text, 'retweets': status.retweet_count,
                       'followers': status.user.followers_count}
        # count the tweet, then clean the text fields before storing them
        self.count += 1
        if self.count <= max_tweets:
            # clean_emoji returns the cleaned string, so the result must be assigned back
            tweetDetail['tweet'] = clean_emoji(tweetDetail['tweet'])
            tweetDetail['description'] = clean_emoji(tweetDetail['description'])
            tweetDetail['location'] = clean_emoji(tweetDetail['location'])
            tweetDetail['name'] = clean_emoji(tweetDetail['name'])
            # strip URLs first, then any character that is not alphanumeric, whitespace or a period
            for field in ('tweet', 'description', 'location', 'name'):
                if tweetDetail[field] is not None:
                    tweetDetail[field] = re.sub(r'http\S+', '', tweetDetail[field], flags=re.MULTILINE)
                    tweetDetail[field] = re.sub(r'[^a-zA-Z0-9\s\.]+', '', tweetDetail[field])
            x.append(tweetDetail)
            return True
        else:
            print("Fetched", self.count - 1, "tweets")
            return False
    def on_error(self, status_code):
        # returning False in on_error disconnects the stream;
        # returning non-False reconnects the stream, with backoff.
        if status_code == 420:
            return False
# creating a stream
stream_listener = StreamListener()
stream = tw.Stream(auth=api.auth, listener=stream_listener)
# starting a stream, filtered on the search keywords defined above
stream.filter(track=search_terms)
# sending data to MongoDB (insert_many raises on an empty list, so guard it)
if x:
    collection.insert_many(x)
    print('data stored successfully in MongoDB')
else:
    print('no tweets were collected, nothing to store')
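# Example follow-up (a sketch, not part of the original pipeline): spot-check a few of the
# stored documents from a separate session, assuming the same database/collection names as above.
# for doc in collection.find().limit(5):
#     print(doc['twitter_handle'], doc['tweet_date'], doc['tweet'])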