-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetcher_template.py
84 lines (67 loc) · 3.28 KB
/
fetcher_template.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import tweepy
from backend.dbcconnection import DBCConnection
from backend.run_streamer import run_streamer
from backend.stream_listener import StreamListener
from backend.twconnection import TWConnection
"""
Assumes you have MongoDB installed locally
and that it contains a database called "test".
"""
# MongoDB connection URI; handed to StreamListener as `mongo_host` in the
# __main__ section of this script.
MONGO_HOST = "mongodb://localhost/test"
"""
Database (mongodb) information.
Has not been tested with a database requiring username and password.
pretty_name is used in error emails.
If max_count is an integer, the collection is dumped to disk and cleared for new tweets once that number of
tweets has been handled by the script. The script attempts to read the current number of objects in the
collection and to start counting from that number.
"""
# Database settings object; passed to StreamListener as `db_connection` in the
# __main__ section of this script.
db_connection = DBCConnection(
    host="127.0.0.1",
    # NOTE(review): the port is given as a string, not an int -- confirm that
    # DBCConnection expects this.
    port="27017",
    username=None,
    password=None,
    db="test",
    collection="test_collection",
    out_dir="out/",
    pretty_name="Test Tweets",
    called_by="test, computer 1",
    max_count=None,
)
"""
Create the Twitter connection.
To get consumer keys and access tokens, you will need to create a Twitter application.
See: http://apps.twitter.com/
It is recommended to create a Twitter account specifically for streaming.
"""
# Twitter credentials object; all four values are blank placeholders in this
# template. Passed to run_streamer as `tw_connection` in the __main__ section.
tw_connection = TWConnection(
    consumer_key="",
    consumer_secret="",
    access_token="",
    access_token_secret="",
)
"""
Mail account information for emailing when errors occur.
NOTE: Currently only tested with gmail.
It is recommended to create a new gmail account specifically for this purpose,
as the password is hardcoded.
Important: You must make sure that gmail recognizes your device as safe.
You must also "allow less secure apps" at:
https://myaccount.google.com/security?pli=1&nlr=1#connectedapps
The latter is another good reason to have a separate gmail account.
Set to None to disable error emails.
"""
# Error-mail settings; passed to both StreamListener and run_streamer in the
# __main__ section of this script.
mail_connection = {
    "smtp": "smtp.gmail.com:587",
    "from": "[email protected]",
    "to": "[email protected]",
    "password": "",
}
# Values forwarded to run_streamer through its WORDS and LANGUAGES arguments.
# NOTE(review): presumably the terms to track and the tweet languages to keep;
# confirm against run_streamer.
WORDS = [
    u"snack",
    u"humus",
    u"bitcoin",
]
LANGUAGES = [
    "en",
]
if __name__ == "__main__":
    # Set up the listener.
    #   wait_on_rate_limit=True is needed to help with Twitter API rate limiting.
    #   collect_retweets specifies whether retweets are collected or not.
    listener = StreamListener(
        api=tweepy.API(wait_on_rate_limit=True),
        db_connection=db_connection,
        mongo_host=MONGO_HOST,
        mail_connection=mail_connection,
        collect_retweets=False,
    )

    # Start streaming.
    #   error_sleep_time: time to sleep when an unknown error occurs (minutes).
    #   sleep_every:      number of hours to stream before taking a break.
    #   sleep_for:        number of hours to sleep when taking a break.
    # Together, sleep_every and sleep_for allow sampling tweets over long time
    # spans: e.g. sleep_every=24 and sleep_for=48 streams only every third day
    # without having to start and stop the stream manually.
    run_streamer(
        tw_connection=tw_connection,
        listener=listener,
        WORDS=WORDS,
        LANGUAGES=LANGUAGES,
        error_sleep_time=15,
        mail_connection=mail_connection,
        sleep_every=None,
        sleep_for=None,
    )