Skip to content

Commit b42961c

Browse files
committed Oct 30, 2017
change to python 3 and add hashtag parsing
1 parent 3ec8219 commit b42961c

File tree

1 file changed

+173
-17
lines changed

1 file changed

+173
-17
lines changed
 

‎DiscursiveEDA.ipynb

+173-17
Original file line number | Diff line number | Diff line change
@@ -2,36 +2,37 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 29,
5+
"execution_count": 17,
66
"metadata": {
77
"collapsed": false
88
},
99
"outputs": [],
1010
"source": [
1111
"# imports\n",
12-
"\n",
13-
"import urllib2\n",
14-
"import json"
12+
"from urllib.request import urlopen\n",
13+
"import json\n",
14+
"import re\n",
15+
"from collections import defaultdict"
1516
]
1617
},
1718
{
1819
"cell_type": "code",
19-
"execution_count": 30,
20+
"execution_count": 9,
2021
"metadata": {
21-
"collapsed": true
22+
"collapsed": false
2223
},
2324
"outputs": [],
2425
"source": [
2526
"# get an s3 file from Discursive. This file is a sample of ~500 tweets. \n",
2627
"\n",
27-
"response = urllib2.urlopen('https://s3-us-west-2.amazonaws.com/discursive/2017/1/10/18/tweets-25.json')\n",
28+
"response = urlopen('https://s3-us-west-2.amazonaws.com/discursive/2017/1/10/18/tweets-25.json')\n",
2829
"html = response.read()\n",
2930
"resp = json.loads(html)"
3031
]
3132
},
3233
{
3334
"cell_type": "code",
34-
"execution_count": 34,
35+
"execution_count": 11,
3536
"metadata": {
3637
"collapsed": false
3738
},
@@ -40,19 +41,19 @@
4041
"name": "stdout",
4142
"output_type": "stream",
4243
"text": [
43-
"[{u'loc': None, u'description': u\"I Fuck Up... Just don't forget you Fuck Up Too.\", u'friends_count': 979, u'created': u'2017-01-10 18:14:08', u'text': u\"RT @LindaSuhler: Can we hear from #MSM here?\\n@MTV's @Ira Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us\\u2026 \", u'hashtags': u'[\"MSM\"]', u'original_name': u'Linda Suhler, Ph.D.', u'original_id': 347627434, u'id_str': u'818883641640177665', u'user_created': u'2012-12-29 17:54:08', u'followers': 3098, u'retweet_count': 0, u'retweet': u'Y', u'name': u'VFL2013'}, {u'loc': u'Lagos, Nigeria', u'description': u'Publisher | Editor | Online Promoter | Web Designer | Blogger | Social Media Manager.. Contact: peretzomasi51@gmail.com', u'friends_count': 128, u'created': u'2017-01-10 18:14:09', u'text': u'Breitbart to launch populist business news site https://t.co/DeLYRbFZDs', u'hashtags': u'[]', u'original_name': None, u'original_id': None, u'id_str': u'818883644039294976', u'user_created': u'2016-09-25 18:53:21', u'followers': 154, u'retweet_count': 0, u'retweet': u'N', u'name': u'PeretzOmasi'}, {u'loc': u'Schloss von Kastenfarbe', u'description': u'Extra Bl\\xf6d\\n\\nIrgendwann wird auch Dich niemand verstehen wollen Ach ja: \\u201eNicht alles was Ich teile ist gleich meine Meinung\\u201c http://www.faz.net/-gqz-7h2fh', u'friends_count': 496, u'created': u'2017-01-10 18:14:09', u'text': u'RT @dushanwegner: F\\xfcr eine #SPD-Abgeordnete? Hahaha. Witzig. \\U0001f602\\U0001f602\\U0001f602 https://t.co/YBoSx9YR5f', u'hashtags': u'[\"SPD\"]', u'original_name': u'Dushan Wegner', u'original_id': 14784064, u'id_str': u'818883645096202241', u'user_created': u'2012-03-11 21:22:57', u'followers': 145, u'retweet_count': 0, u'retweet': u'Y', u'name': u'kricketkrackel'}, {u'loc': None, u'description': None, u'friends_count': 249, u'created': u'2017-01-10 18:14:09', u'text': u'RT @pg80808: @qatarairways \\u2708\\ufe0f Did you know your ads support stories like this on Breitbart? 
10 THINGS MILO HATES ABOUT ISLAM.\\u2026 ', u'hashtags': u'[]', u'original_name': u'ColoradoVsBreitbart', u'original_id': 814872255406620672, u'id_str': u'818883645188358146', u'user_created': u'2014-12-12 22:55:33', u'followers': 76, u'retweet_count': 0, u'retweet': u'Y', u'name': u'awordonplays'}, {u'loc': None, u'description': u'Conservative; luv my doxie...KEEP THE CHANGE!... ONE NATION UNDER GOD!', u'friends_count': 3361, u'created': u'2017-01-10 18:14:10', u'text': u\"#racist #fool #MTV's Ira #Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us - Breitbart https://t.co/z8GR7iN59Y\", u'hashtags': u'[\"racist\", \"fool\", \"MTV\", \"Madison\"]', u'original_name': None, u'original_id': None, u'id_str': u'818883647382192128', u'user_created': u'2011-08-02 13:45:16', u'followers': 3166, u'retweet_count': 0, u'retweet': u'N', u'name': u'Bonacker69'}]\n"
44+
"[{'description': \"I Fuck Up... Just don't forget you Fuck Up Too.\", 'friends_count': 979, 'text': \"RT @LindaSuhler: Can we hear from #MSM here?\\n@MTV's @Ira Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us… \", 'hashtags': '[\"MSM\"]', 'original_id': 347627434, 'user_created': '2012-12-29 17:54:08', 'retweet': 'Y', 'loc': None, 'name': 'VFL2013', 'created': '2017-01-10 18:14:08', 'original_name': 'Linda Suhler, Ph.D.', 'followers': 3098, 'id_str': '818883641640177665', 'retweet_count': 0}, {'description': 'Publisher | Editor | Online Promoter | Web Designer | Blogger | Social Media Manager.. Contact: peretzomasi51@gmail.com', 'friends_count': 128, 'text': 'Breitbart to launch populist business news site https://t.co/DeLYRbFZDs', 'hashtags': '[]', 'original_id': None, 'user_created': '2016-09-25 18:53:21', 'retweet': 'N', 'loc': 'Lagos, Nigeria', 'name': 'PeretzOmasi', 'created': '2017-01-10 18:14:09', 'original_name': None, 'followers': 154, 'id_str': '818883644039294976', 'retweet_count': 0}, {'description': 'Extra Blöd\\n\\nIrgendwann wird auch Dich niemand verstehen wollen Ach ja: „Nicht alles was Ich teile ist gleich meine Meinung“ http://www.faz.net/-gqz-7h2fh', 'friends_count': 496, 'text': 'RT @dushanwegner: Für eine #SPD-Abgeordnete? Hahaha. Witzig. 😂😂😂 https://t.co/YBoSx9YR5f', 'hashtags': '[\"SPD\"]', 'original_id': 14784064, 'user_created': '2012-03-11 21:22:57', 'retweet': 'Y', 'loc': 'Schloss von Kastenfarbe', 'name': 'kricketkrackel', 'created': '2017-01-10 18:14:09', 'original_name': 'Dushan Wegner', 'followers': 145, 'id_str': '818883645096202241', 'retweet_count': 0}, {'description': None, 'friends_count': 249, 'text': 'RT @pg80808: @qatarairways ✈️ Did you know your ads support stories like this on Breitbart? 
10 THINGS MILO HATES ABOUT ISLAM.… ', 'hashtags': '[]', 'original_id': 814872255406620672, 'user_created': '2014-12-12 22:55:33', 'retweet': 'Y', 'loc': None, 'name': 'awordonplays', 'created': '2017-01-10 18:14:09', 'original_name': 'ColoradoVsBreitbart', 'followers': 76, 'id_str': '818883645188358146', 'retweet_count': 0}, {'description': 'Conservative; luv my doxie...KEEP THE CHANGE!... ONE NATION UNDER GOD!', 'friends_count': 3361, 'text': \"#racist #fool #MTV's Ira #Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us - Breitbart https://t.co/z8GR7iN59Y\", 'hashtags': '[\"racist\", \"fool\", \"MTV\", \"Madison\"]', 'original_id': None, 'user_created': '2011-08-02 13:45:16', 'retweet': 'N', 'loc': None, 'name': 'Bonacker69', 'created': '2017-01-10 18:14:10', 'original_name': None, 'followers': 3166, 'id_str': '818883647382192128', 'retweet_count': 0}]\n"
4445
]
4546
}
4647
],
4748
"source": [
4849
"# have a look at the data\n",
4950
"\n",
50-
"print resp[:5]"
51+
"print(resp[:5])"
5152
]
5253
},
5354
{
5455
"cell_type": "code",
55-
"execution_count": 33,
56+
"execution_count": 12,
5657
"metadata": {
5758
"collapsed": false
5859
},
@@ -74,27 +75,182 @@
7475
"# start exploring the available fields\n",
7576
"\n",
7677
"for x in resp[:5]:\n",
77-
" print x['text']"
78+
" print(x['text'])"
79+
]
80+
},
81+
{
82+
"cell_type": "code",
83+
"execution_count": 25,
84+
"metadata": {
85+
"collapsed": false
86+
},
87+
"outputs": [],
88+
"source": [
89+
"# get all of the hashtags and counts\n",
90+
"hashtags = defaultdict(int)\n",
91+
"for tweet in resp:\n",
92+
" # take out hashtags\n",
93+
" hts = [ht for ht in tweet['text'].split() if ht.startswith('#')]\n",
94+
" # add them to the dictionary\n",
95+
" for ht in hts:\n",
96+
" hashtags[ht] += 1\n"
7897
]
98+
},
99+
{
100+
"cell_type": "code",
101+
"execution_count": 44,
102+
"metadata": {
103+
"collapsed": false
104+
},
105+
"outputs": [
106+
{
107+
"name": "stdout",
108+
"output_type": "stream",
109+
"text": [
110+
"Hashtag\t\t\t\tCount\n",
111+
"#MSM \t\t33\n",
112+
"#MAGA \t\t11\n",
113+
"#StopSessions \t\t8\n",
114+
"#FakeNews \t\t8\n",
115+
"#ConfirmSessions!\t\t7\n",
116+
"#SupportSessions\t\t7\n",
117+
"#AmericaFirst \t\t7\n",
118+
"#Breitbart \t\t6\n",
119+
"#MakeAmericaGreatAgain’\t\t6\n",
120+
"#SPD-Abgeordnete?\t\t5\n",
121+
"#tcot \t\t5\n",
122+
"#tichy \t\t4\n",
123+
"#xing \t\t4\n",
124+
"#Br… \t\t4\n",
125+
"#Sessions \t\t4\n",
126+
"#NewYork \t\t4\n",
127+
"#NYC \t\t4\n",
128+
"#Real… \t\t3\n",
129+
"#TCOT \t\t3\n",
130+
"#PJNET \t\t3\n",
131+
"#pjnet \t\t3\n",
132+
"#p2 \t\t3\n",
133+
"#MakeAmericaGreatAgain'\t\t3\n",
134+
"#CloserNation \t\t2\n",
135+
"#worldnews \t\t2\n",
136+
"#Obamacare \t\t2\n",
137+
"#PlannedParenthood\t\t2\n",
138+
"#PJNET🇺🇸… \t\t2\n",
139+
"#Obama \t\t2\n",
140+
"#Jerusalem \t\t2\n",
141+
"#PresidentElectTrump\t\t2\n",
142+
"#jerusalem \t\t2\n",
143+
"#TimScott: \t\t2\n",
144+
"#JeffSessions \t\t2\n",
145+
"#AttorneyGeneral\t\t2\n",
146+
"#Racist \t\t2\n",
147+
"#breitbart \t\t2\n",
148+
"#LiberalHate \t\t2\n",
149+
"#EverythingIsRacial\t\t2\n",
150+
"#racist \t\t1\n",
151+
"#fool \t\t1\n",
152+
"#MTV's \t\t1\n",
153+
"#Madison \t\t1\n",
154+
"#Libtards \t\t1\n",
155+
"#Liberals \t\t1\n",
156+
"#Racism \t\t1\n",
157+
"#Racist… \t\t1\n",
158+
"#AI \t\t1\n",
159+
"#IA \t\t1\n",
160+
"#machinelearning\t\t1\n",
161+
"#AfD: \t\t1\n",
162+
"#TRUMP \t\t1\n",
163+
"#VETS \t\t1\n",
164+
"#OATHKEEPERS \t\t1\n",
165+
"#CONSTITUTION \t\t1\n",
166+
"#2A \t\t1\n",
167+
"#1A \t\t1\n",
168+
"#ORPUW \t\t1\n",
169+
"#NRA \t\t1\n",
170+
"#TLOT \t\t1\n",
171+
"#INFOWARS \t\t1\n",
172+
"#DrudgeReport… \t\t1\n",
173+
"#Resist \t\t1\n",
174+
"#preach \t\t1\n",
175+
"#Bannon \t\t1\n",
176+
"#Germany \t\t1\n",
177+
"#France \t\t1\n",
178+
"#ConfirmationHearings\t\t1\n",
179+
"#JeffSessions' \t\t1\n",
180+
"#TheResistance \t\t1\n",
181+
"#ResistTrump \t\t1\n",
182+
"#Fascism \t\t1\n",
183+
"#NYT \t\t1\n",
184+
"#NYT=FakeNews \t\t1\n",
185+
"#NowPlaying \t\t1\n",
186+
"#woodpile \t\t1\n",
187+
"#p… \t\t1\n",
188+
"#populism\" \t\t1\n",
189+
"#media \t\t1\n",
190+
"#SPD-Abgeordnete\t\t1\n",
191+
"#squirrel \t\t1\n",
192+
"#beta \t\t1\n",
193+
"#bitches \t\t1\n",
194+
"#cuck… \t\t1\n",
195+
"#liberal \t\t1\n",
196+
"#leftists \t\t1\n",
197+
"#bannon \t\t1\n",
198+
"#manafort \t\t1\n",
199+
"#kushner \t\t1\n",
200+
"#putin \t\t1\n",
201+
"#PublicDiplomacy\t\t1\n",
202+
"#dortmund \t\t1\n",
203+
"#TuesdayMotivation\t\t1\n",
204+
"#Fakenews \t\t1\n",
205+
"#verfassungsschutz\t\t1\n",
206+
"#APT28 \t\t1\n",
207+
"#ACA… \t\t1\n",
208+
"#DumpKelloggs \t\t1\n",
209+
"#RIPDNC \t\t1\n",
210+
"#MerylStreep \t\t1\n",
211+
"#Awesome...coming\t\t1\n",
212+
"#TGDN \t\t1\n",
213+
"#CCOT \t\t1\n",
214+
"#diversity \t\t1\n",
215+
"#StopFundingHate\t\t1\n"
216+
]
217+
}
218+
],
219+
"source": [
220+
"# print out the results of hashtags\n",
221+
"print(\"Hashtag\\t\\t\\t\\tCount\")\n",
222+
"for ht in sorted(hashtags, key=hashtags.get, reverse=True):\n",
223+
" print(\"{:16s}\\t\\t{}\".format(ht, hashtags[ht]))\n",
224+
" \n"
225+
]
226+
},
227+
{
228+
"cell_type": "code",
229+
"execution_count": null,
230+
"metadata": {
231+
"collapsed": true
232+
},
233+
"outputs": [],
234+
"source": []
79235
}
80236
],
81237
"metadata": {
82238
"kernelspec": {
83-
"display_name": "Python [Root]",
239+
"display_name": "Python 3",
84240
"language": "python",
85-
"name": "Python [Root]"
241+
"name": "python3"
86242
},
87243
"language_info": {
88244
"codemirror_mode": {
89245
"name": "ipython",
90-
"version": 2
246+
"version": 3
91247
},
92248
"file_extension": ".py",
93249
"mimetype": "text/x-python",
94250
"name": "python",
95251
"nbconvert_exporter": "python",
96-
"pygments_lexer": "ipython2",
97-
"version": "2.7.12"
252+
"pygments_lexer": "ipython3",
253+
"version": "3.6.3"
98254
}
99255
},
100256
"nbformat": 4,

0 commit comments

Comments
 (0)