TweetParser_historical.py
import tweepy
import pandas as pd
import data_parse_helper as helper
import re
import spacy

# Twitter API credentials (placeholders; supply your own keys and never commit real credentials)
consumer_key = "YOUR_CONSUMER_KEY"
consumer_secret = "YOUR_CONSUMER_SECRET"
access_token = "YOUR_ACCESS_TOKEN"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET"

# spaCy nlp object to pass into the data helper
# (requires the model: python -m spacy download en_core_web_lg)
nlp = spacy.load('en_core_web_lg')
def dump_tweets(screen_name):
    # authenticate with the Twitter API
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    # initialize a list to hold all the tweepy Tweets
    alltweets = []
    # make initial request for the most recent tweets (the API allows at most 200 per request)
    new_tweets = api.user_timeline(screen_name=screen_name, count=20)
    # save most recent tweets
    alltweets.extend(new_tweets)
    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before %s" % oldest)
        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=20, max_id=oldest)
        # save most recent tweets
        alltweets.extend(new_tweets)
        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        print("...%s tweets downloaded so far" % len(alltweets))
    # transform the tweepy tweets into a 2D array that will populate the spreadsheet
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8")] for tweet in alltweets]
    columns = ["Tweet_id", "data_source", "Building_name", "Room_name", "Pizza_Status",
               "Event_Date", "Event_Time", "Sent_Date", "Text"]
    rows = []
    for each_tweet in outtweets:
        is_pizza = classify_is_pizza(each_tweet[2].decode())
        event_date, building_name, room_name, event_time = find_datelocation(each_tweet)
        rows.append([each_tweet[0], "tweet", building_name, room_name, is_pizza,
                     event_date, event_time, each_tweet[1], each_tweet[2].decode()])
    # build the DataFrame once rather than appending row by row
    output = pd.DataFrame(rows, columns=columns)
    output.to_excel('output.xlsx', sheet_name="Twitter_data")
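# An equivalent way to page through a timeline (a sketch, not used above):
# tweepy's Cursor wraps the max_id bookkeeping that dump_tweets does by hand.
def fetch_all_tweets(api, screen_name):
    # items() transparently walks every page of the v1.1 user_timeline endpoint
    return [status for status in
            tweepy.Cursor(api.user_timeline, screen_name=screen_name, count=200).items()]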
def classify_is_pizza(tweet):
    # crude keyword check, made case-insensitive so "Pizza" also matches
    return "pizza" in tweet.lower()
def find_event_name():
    pass
def find_datelocation(tweet):
    parsed_date, building_name, room_name, time = helper.get_datelocation(tweet[2].decode(), nlp)
    try:
        return parsed_date.date(), building_name, room_name, time
    except AttributeError:
        # parsed_date is None when no date could be extracted from the tweet
        return "None", "None", "None", "None"
if __name__ == '__main__':
    # pass in the username of the account you want to download
    dump_tweets("RITFreeFood")