-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
359 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
## load rtweet | ||
library(rtweet) | ||
|
||
## load ggplot2 | ||
library(ggplot2) | ||
|
||
## Read in the streamed data | ||
d <- parse_stream("data/stream-1.json") | ||
|
||
## function to create freq table | ||
tab_sort <- function (x, n = 10, mentions = FALSE) { | ||
sumrow <- data.frame( | ||
"screen_name" = paste(length(unique(x)), "users"), | ||
"n_tweets" = length(x), | ||
"prop_tweets" = 1.000, | ||
stringsAsFactors = FALSE | ||
) | ||
x <- sort(table(x), decreasing = TRUE) | ||
x <- data.frame( | ||
"screen_name" = names(x), | ||
"n_tweets" = as.integer(x), | ||
stringsAsFactors = FALSE | ||
) | ||
x$prop_tweets <- x$n_tweets / sum(x$n_tweets, na.rm = TRUE) | ||
x$prop_tweets <- round(x$prop_tweets, 3) | ||
x <- head(x, n) | ||
x <- rbind(x, sumrow) | ||
row.names(x) <- c(seq_len(nrow(x) - 1L), "total") | ||
if (mentions) { | ||
names(x)[2:3] <- c("n_mentions", "prop_mentions") | ||
} | ||
x | ||
} | ||
|
||
## most frequent tweeters table | ||
usrs <- tab_sort(nca$screen_name) | ||
|
||
## save most freq tweeters table | ||
png("../nca17-usrs.png", height = 3.1, width = 4.25, "in", res = 300) | ||
par(bg = "white") | ||
gridExtra::grid.table(usrs, theme = gridExtra::ttheme_default(base_size = 9)) | ||
dev.off() | ||
|
||
## most frequent mentions table | ||
naomit <- function(x) x[!is.na(x)] | ||
usrs <- tab_sort(naomit(unlist(nca$mentions_screen_name)), mentions = TRUE) | ||
|
||
## save most freq mentions table | ||
png("../nca17-ats.png", height = 3.1, width = 4.25, "in", res = 300) | ||
par(bg = "white") | ||
gridExtra::grid.table(usrs, theme = gridExtra::ttheme_default(base_size = 9)) | ||
dev.off() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
## load rtweet | ||
library(rtweet) | ||
|
||
## load igraph | ||
library(igraph) | ||
|
||
## Read in the streamed data | ||
d <- parse_stream("data/stream-1.json") | ||
|
||
## function to filter out missing and non-unique IDs | ||
uq_naomit <- function(x) unique(x[!is.na(x)]) | ||
|
||
## function to create connections data frames | ||
connections_df <- function(user, var, interaction = NULL) { | ||
connections_df_ <- function(user, var) { | ||
data.frame( | ||
screen_name = user, | ||
connection = unlist(var, use.names = FALSE), | ||
row.names = NULL, | ||
check.rows = FALSE, | ||
check.names = FALSE, | ||
stringsAsFactors = FALSE | ||
) | ||
} | ||
d <- Map("connections_df_", user, var) | ||
d <- do.call("rbind", d) | ||
d <- d[!is.na(d$connection), ] | ||
if (!is.null(interaction)) { | ||
d$interaction <- interaction | ||
} | ||
tibble::as_tibble(d, validate = FALSE) | ||
} | ||
|
||
##----------------------------------------------------------------------------## | ||
## retweet users ## | ||
##----------------------------------------------------------------------------## | ||
|
||
## lookup retweets | ||
rts <- lookup_tweets(uq_naomit(d$retweet_status_id)) | ||
|
||
## select and rename columns | ||
rts <- dplyr::select( | ||
rts, retweet_status_id = status_id, retweet_screen_name = screen_name | ||
) | ||
|
||
## left join with data | ||
d <- dplyr::left_join(d, rts, by = "retweet_status_id") | ||
|
||
|
||
##----------------------------------------------------------------------------## | ||
## quote users ## | ||
##----------------------------------------------------------------------------## | ||
|
||
## lookup quotes | ||
qts <- lookup_tweets(uq_naomit(d$quoted_status_id)) | ||
|
||
## select and rename columns | ||
qts <- dplyr::select( | ||
qts, quoted_status_id = status_id, quoted_screen_name = screen_name | ||
) | ||
|
||
## left join with data | ||
d <- dplyr::left_join(d, qts, by = "quoted_status_id") | ||
|
||
|
||
##----------------------------------------------------------------------------## | ||
## semantic connections data ## | ||
##----------------------------------------------------------------------------## | ||
|
||
## mentions data | ||
md <- connections_df(d$screen_name, d$mentions_screen_name, "mention") | ||
|
||
## replies data | ||
td <- connections_df(d$screen_name, d$reply_to_screen_name, "reply") | ||
|
||
## retweets data | ||
rd <- connections_df(d$screen_name, d$retweet_screen_name, "retweet") | ||
|
||
## quotes data | ||
qd <- connections_df(d$screen_name, d$quoted_screen_name, "quote") | ||
|
||
## combine connections data | ||
snd <- do.call("rbind", list(md, td, rd, qd)) | ||
|
||
## rename | ||
names(snd) <- c("from", "to", "interaction") | ||
|
||
## list of all users | ||
all_users <- c(snd$from, snd$to) | ||
|
||
## list of user screen names with at least 5 connections | ||
kp_users <- table(all_users) | ||
kp_users <- names(kp_users[kp_users > 4L]) | ||
|
||
## lookup users data | ||
nodes <- lookup_users(kp_users) | ||
|
||
## filter kp_usres and count interactions | ||
links <- snd %>% | ||
dplyr::filter(from %in% nodes$screen_name & to %in% nodes$screen_name & from != to) %>% | ||
dplyr::group_by(from, to) %>% | ||
dplyr::summarise(n = n()) %>% | ||
dplyr::ungroup() | ||
|
||
## filter only nodes found in links | ||
nodes <- dplyr::filter(nodes, screen_name %in% c(links$to, links$from)) | ||
|
||
## size vector | ||
size1 <- links %>% | ||
dplyr::group_by(from) %>% | ||
dplyr::summarise(n = sum(n)) %>% | ||
dplyr::select(screen_name = from, n) | ||
|
||
size2 <- links %>% | ||
dplyr::group_by(to) %>% | ||
dplyr::summarise(n = sum(n)) %>% | ||
dplyr::select(screen_name = to, n) | ||
|
||
sizes <- rbind(size1, size2) %>% | ||
dplyr::group_by(screen_name) %>% | ||
dplyr::summarise(n = sum(n)) | ||
|
||
## network graph | ||
net <- graph_from_data_frame( | ||
d = links, | ||
vertices = nodes[, c("screen_name", "statuses_count")], | ||
directed = TRUE | ||
) | ||
|
||
## save plot | ||
png("../nca17-network.png", 20, 18, "in", res = 300) | ||
par(mar = c(0, 0, 0, 0), bg = "#000044") | ||
plot(net, edge.size = .25, | ||
margin = c(-.05, -.05, -.05, -.05), | ||
edge.arrow.size = 0, | ||
edge.arrow.width = 0, | ||
vertex.color = "#ff00ff55", | ||
vertex.frame.color = "transparent", | ||
vertex.label.color = "greenyellow", | ||
vertex.label.cex = .35, | ||
vertex.label.family = "sans", | ||
vertex.size = sqrt(sizes$n) / 1.8, | ||
edge.color = "#ff00ff55", | ||
edge.width = .25) | ||
dev.off() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
## NCA17 tweets | ||
|
||
## install and load rtweet | ||
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools") | ||
devtools::install_github("mkearney/rtweet") | ||
library(rtweet) | ||
|
||
## create data folder is it doesn't already exist | ||
if (!dir.exists(file.path("..", "data"))) dir.create(file.path("..", "data")) | ||
|
||
## download stream data, save it to data folder | ||
download.file( | ||
"https://www.dropbox.com/s/t0sefc0lzqbwd32/stream-1.json?dl=1", | ||
file.path("..", "data", "nca17.json") | ||
) | ||
|
||
## read in stream data, converting it to data frame | ||
nca <- parse_stream(file.path("..", "data", "nca17.json")) | ||
|
||
## preview data (should be N = 3332) | ||
nca |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
## load rtweet | ||
library(rtweet) | ||
|
||
## load tidyverse | ||
lib(tidyverse) | ||
|
||
## Read in the streamed data | ||
d <- parse_stream("data/stream-1.json") | ||
|
||
## Estimate pos/neg sentiment for each tweet | ||
d$sentiment <- syuzhet::get_sentiment(d$text, "syuzhet") | ||
|
||
## Function to parse time intervals | ||
time_intervals <- function(x) { | ||
stopifnot(is.atomic(x) && length(x) == 1L) | ||
if (is.numeric(x)) { | ||
return(x) | ||
} | ||
x <- tolower(x) | ||
if (grepl("year", x)) { | ||
n <- 60 * 60 * 24 * 365 | ||
} else if (grepl("quarter", x)) { | ||
n <- 365 / 4 | ||
} else if (grepl("month", x)) { | ||
n <- 60 * 60 * 24 * 30 | ||
} else if (grepl("week", x)) { | ||
n <- 60 * 60 * 24 * 7 | ||
} else if (grepl("day", x)) { | ||
n <- 60 * 60 * 24 | ||
} else if (grepl("hour", x)) { | ||
n <- 60 * 60 | ||
} else if (grepl("min", x)) { | ||
n <- 60 | ||
} else if (grepl("sec", x)) { | ||
n <- 1 | ||
} else { | ||
stop("interval must be secs, mins, hours, days, weeks, months, or years", | ||
call. = FALSE) | ||
} | ||
x <- as.double(gsub("[^[:digit:]|\\.]", "", x)) | ||
if (any(is.na(x), identical(x, ""))) { | ||
x <- 1 | ||
} | ||
n * x | ||
} | ||
|
||
## Create function to round time into rounded var | ||
round_time <- function(x, sec) { | ||
sec <- time_units(sec) | ||
as.POSIXct(hms::hms(as.numeric(x) %/% sec * sec)) | ||
} | ||
|
||
## create and save plot | ||
d %>% | ||
mutate(time = round_time(created_at, "hours")) %>% | ||
group_by(time) %>% | ||
summarise(sentiment = mean(sentiment, na.rmm = TRUE)) %>% | ||
mutate(valence = ifelse(sentiment > 0L, "Positive", "Negative")) %>% | ||
ggplot(aes(x = time, y = sentiment)) + | ||
geom_smooth(method = "loess", span = .6, colour = "#aa11aadd", fill = "#bbbbbb11") + | ||
geom_bar(aes(fill = valence), alpha = .7, stat = "identity", width = 1250) + | ||
geom_point(aes(colour = valence), alpha = .9, size = 2.5) + | ||
theme_mwk() + | ||
theme(legend.position = "none") + | ||
scale_fill_manual(values = c(Positive = "#2244ee", Negative = "#dd2222")) + | ||
scale_colour_manual(values = c(Positive = "#0022cc", Negative = "#bb0000")) + | ||
labs(x = NULL, y = NULL, | ||
title = "Sentiment of #NCA17 tweets by hour", | ||
subtitle = "Mean positive/negative sentiment scores of tweets") + | ||
ggsave("../nca17-sa.png", width = 8, height = 7, units = "in") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
## load rtweet | ||
library(rtweet) | ||
|
||
## load ggplot2 | ||
library(ggplot2) | ||
|
||
## Read in the streamed data | ||
d <- parse_stream("data/stream-1.json") | ||
|
||
## plot the time series of #NCA17 activity | ||
ts_plot(nca, "hours") + | ||
theme_minimal(base_family = "sans") + | ||
theme(plot.title = element_text(face = "bold")) + | ||
labs(x = NULL, y = NULL, title = "Time series of #NCA17 Twitter statuses", | ||
subtitle = "Twitter statuses aggregated by hour", | ||
caption = "\nData collected from Twitter's stream (filter) API using rtweet") + | ||
ggsave("../nca17-ts.png", width = 8, height = 6, units = "in") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
## load rtweet | ||
library(rtweet) | ||
|
||
## load ggplot2 | ||
library(ggplot2) | ||
|
||
## Read in the streamed data | ||
d <- parse_stream("data/stream-1.json") | ||
|
||
## function for cleaning text and creating word freq table | ||
clean_text_table <- function(data) { | ||
txt <- tolower(plain_tweets(data$text)) | ||
txt <- gsub("&", "", txt) | ||
txt <- gsub("#nca17", "", txt, ignore.case = TRUE) | ||
txt <- unlist(strsplit(txt, " ")) | ||
txt <- gsub("^[[:punct:]]{1,}|[[:punct:]]{1,}$", "", txt) | ||
txt <- trimws(txt) | ||
txt <- txt[txt != ""] | ||
swds <- stopwordslangs$word[stopwordslangs$lang == "en" & stopwordslangs$p > .99] | ||
txt <- txt[!txt %in% swds] | ||
sort(table(txt), decreasing = TRUE) | ||
} | ||
|
||
## create frequency table of popular words | ||
wds <- clean_text_table(nca) | ||
|
||
## calc min freq for word cloud | ||
minfreq <- quantile(as.double(wds), .75) | ||
|
||
## save word cloud | ||
png("../nca17-wc.png", height = 8, width = 8, "in", res = 300) | ||
par(bg = "black") | ||
wordcloud::wordcloud( | ||
names(wds), | ||
as.integer(wds), | ||
min.freq = minfreq, | ||
random.color = FALSE, | ||
random.order = FALSE, | ||
colors = gg_cols(6) | ||
) | ||
dev.off() |
Oops, something went wrong.