diff --git a/R/freq_tables.R b/R/freq_tables.R
new file mode 100644
index 0000000..b5f6fa2
--- /dev/null
+++ b/R/freq_tables.R
@@ -0,0 +1,52 @@
+## load rtweet
+library(rtweet)
+
+## Read in the streamed data
+nca <- parse_stream(file.path("..", "data", "nca17.json"))
+
+## function to create freq table
+tab_sort <- function(x, n = 10, mentions = FALSE) {
+  sumrow <- data.frame(
+    "screen_name" = paste(length(unique(x)), "users"),
+    "n_tweets" = length(x),
+    "prop_tweets" = 1.000,
+    stringsAsFactors = FALSE
+  )
+  x <- sort(table(x), decreasing = TRUE)
+  x <- data.frame(
+    "screen_name" = names(x),
+    "n_tweets" = as.integer(x),
+    stringsAsFactors = FALSE
+  )
+  x$prop_tweets <- x$n_tweets / sum(x$n_tweets, na.rm = TRUE)
+  x$prop_tweets <- round(x$prop_tweets, 3)
+  x <- head(x, n)
+  x <- rbind(x, sumrow)
+  row.names(x) <- c(seq_len(nrow(x) - 1L), "total")
+  if (mentions) {
+    names(x)[2:3] <- c("n_mentions", "prop_mentions")
+  }
+  x
+}
+
+## most frequent tweeters table
+usrs <- tab_sort(nca$screen_name)
+
+## save most freq tweeters table
+png("../nca17-usrs.png", height = 3.1, width = 4.25, units = "in", res = 300)
+par(bg = "white")
+gridExtra::grid.table(usrs, theme = gridExtra::ttheme_default(base_size = 9))
+dev.off()
+
+## most frequent mentions table
+naomit <- function(x) x[!is.na(x)]
+usrs <- tab_sort(naomit(unlist(nca$mentions_screen_name)), mentions = TRUE)
+
+## save most freq mentions table
+png("../nca17-ats.png", height = 3.1, width = 4.25, units = "in", res = 300)
+par(bg = "white")
+gridExtra::grid.table(usrs, theme = gridExtra::ttheme_default(base_size = 9))
+dev.off()
diff --git a/R/network_analysis.R b/R/network_analysis.R
new file mode 100644
index 0000000..66c3b98
--- /dev/null
+++ b/R/network_analysis.R
@@ -0,0 +1,145 @@
+## load rtweet
+library(rtweet)
+
+## load igraph
+library(igraph)
+
+## load dplyr (pipe and data verbs used below)
+library(dplyr)
+
+## Read in the streamed data
+d <- parse_stream(file.path("..", "data", "nca17.json"))
+
+## function to filter out missing and non-unique IDs
+uq_naomit <- function(x) unique(x[!is.na(x)])
+
+## function to create connections data frames
+connections_df <- function(user, var, interaction = NULL) {
+  connections_df_ <- function(user, var) {
+    data.frame(
+      screen_name = user,
+      connection = unlist(var, use.names = FALSE),
+      row.names = NULL,
+      check.rows = FALSE,
+      check.names = FALSE,
+      stringsAsFactors = FALSE
+    )
+  }
+  d <- Map("connections_df_", user, var)
+  d <- do.call("rbind", d)
+  d <- d[!is.na(d$connection), ]
+  if (!is.null(interaction)) {
+    d$interaction <- interaction
+  }
+  tibble::as_tibble(d)
+}
+
+##----------------------------------------------------------------------------##
+##                                retweet users                                ##
+##----------------------------------------------------------------------------##
+
+## lookup retweets
+rts <- lookup_tweets(uq_naomit(d$retweet_status_id))
+
+## select and rename columns
+rts <- dplyr::select(
+  rts, retweet_status_id = status_id, retweet_screen_name = screen_name
+)
+
+## left join with data
+d <- dplyr::left_join(d, rts, by = "retweet_status_id")
+
+##----------------------------------------------------------------------------##
+##                                 quote users                                 ##
+##----------------------------------------------------------------------------##
+
+## lookup quotes
+qts <- lookup_tweets(uq_naomit(d$quoted_status_id))
+
+## select and rename columns
+qts <- dplyr::select(
+  qts, quoted_status_id = status_id, quoted_screen_name = screen_name
+)
+
+## left join with data
+d <- dplyr::left_join(d, qts, by = "quoted_status_id")
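+
+## Hedged sketch (not part of the original pipeline): a toy call showing the
+## shape connections_df() returns; wrapped in if (FALSE) so sourcing the
+## script skips it.
+if (FALSE) {
+  connections_df(
+    user = c("a", "b"),
+    var = list(c("x", "y"), NA_character_),
+    interaction = "mention"
+  )  ## two rows (a-x and a-y); the missing connection for "b" is dropped
+}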
+
+##----------------------------------------------------------------------------##
+##                          semantic connections data                          ##
+##----------------------------------------------------------------------------##
+
+## mentions data
+md <- connections_df(d$screen_name, d$mentions_screen_name, "mention")
+
+## replies data
+td <- connections_df(d$screen_name, d$reply_to_screen_name, "reply")
+
+## retweets data
+rd <- connections_df(d$screen_name, d$retweet_screen_name, "retweet")
+
+## quotes data
+qd <- connections_df(d$screen_name, d$quoted_screen_name, "quote")
+
+## combine connections data
+snd <- do.call("rbind", list(md, td, rd, qd))
+
+## rename
+names(snd) <- c("from", "to", "interaction")
+
+## list of all users
+all_users <- c(snd$from, snd$to)
+
+## list of user screen names with at least 5 connections
+kp_users <- table(all_users)
+kp_users <- names(kp_users[kp_users > 4L])
+
+## lookup users data
+nodes <- lookup_users(kp_users)
+
+## filter kp_users and count interactions
+links <- snd %>%
+  dplyr::filter(from %in% nodes$screen_name & to %in% nodes$screen_name & from != to) %>%
+  dplyr::group_by(from, to) %>%
+  dplyr::summarise(n = n()) %>%
+  dplyr::ungroup()
+
+## keep only nodes found in links
+nodes <- dplyr::filter(nodes, screen_name %in% c(links$to, links$from))
+
+## per-user interaction counts (vertex sizes)
+size1 <- links %>%
+  dplyr::group_by(from) %>%
+  dplyr::summarise(n = sum(n)) %>%
+  dplyr::select(screen_name = from, n)
+
+size2 <- links %>%
+  dplyr::group_by(to) %>%
+  dplyr::summarise(n = sum(n)) %>%
+  dplyr::select(screen_name = to, n)
+
+sizes <- rbind(size1, size2) %>%
+  dplyr::group_by(screen_name) %>%
+  dplyr::summarise(n = sum(n))
+
+## align sizes with the vertex order used by the graph
+sizes <- sizes[match(nodes$screen_name, sizes$screen_name), ]
+
+## network graph
+net <- graph_from_data_frame(
+  d = links,
+  vertices = nodes[, c("screen_name", "statuses_count")],
+  directed = TRUE
+)
+
+## save plot
+png("../nca17-network.png", width = 20, height = 18, units = "in", res = 300)
+par(mar = c(0, 0, 0, 0), bg = "#000044")
+plot(net,
+  margin = c(-.05, -.05, -.05, -.05),
+  edge.arrow.size = 0,
+  edge.arrow.width = 0,
+  vertex.color = "#ff00ff55",
+  vertex.frame.color = "transparent",
+  vertex.label.color = "greenyellow",
+  vertex.label.cex = .35,
+  vertex.label.family = "sans",
+  vertex.size = sqrt(sizes$n) / 1.8,
+  edge.color = "#ff00ff55",
+  edge.width = .25)
+dev.off()
diff --git a/R/read_data.R b/R/read_data.R
new file mode 100644
index 0000000..cf222a0
--- /dev/null
+++ b/R/read_data.R
@@ -0,0 +1,21 @@
+## NCA17 tweets
+
+## install (if needed) and load rtweet
+if (!requireNamespace("rtweet", quietly = TRUE)) {
+  if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")
+  devtools::install_github("mkearney/rtweet")
+}
+library(rtweet)
+
+## create data folder if it doesn't already exist
+if (!dir.exists(file.path("..", "data"))) dir.create(file.path("..", "data"))
+
+## download stream data, save it to data folder
+download.file(
+  "https://www.dropbox.com/s/t0sefc0lzqbwd32/stream-1.json?dl=1",
+  file.path("..", "data", "nca17.json")
+)
+
+## read in stream data, converting it to data frame
+nca <- parse_stream(file.path("..", "data", "nca17.json"))
+
+## preview data (should be N = 3332)
+nca
diff --git a/R/sentiment_analysis.R b/R/sentiment_analysis.R
new file mode 100644
index 0000000..a41c525
--- /dev/null
+++ b/R/sentiment_analysis.R
@@ -0,0 +1,70 @@
+## load rtweet
+library(rtweet)
+
+## load tidyverse
+library(tidyverse)
+
+## Read in the streamed data
+d <- parse_stream(file.path("..", "data", "nca17.json"))
+
+## Estimate pos/neg sentiment for each tweet
+d$sentiment <- syuzhet::get_sentiment(d$text, method = "syuzhet")
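+
+## Hedged aside (not part of the original script): quick probe of how
+## syuzhet::get_sentiment() scores raw text; wrapped in if (FALSE) so
+## sourcing the script skips it.
+if (FALSE) {
+  syuzhet::get_sentiment(
+    c("I love this panel!", "What a terrible talk."), method = "syuzhet"
+  )  ## expect a positive first score and a negative second score
+}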
+
+## Function to parse time intervals into seconds
+time_intervals <- function(x) {
+  stopifnot(is.atomic(x) && length(x) == 1L)
+  if (is.numeric(x)) {
+    return(x)
+  }
+  x <- tolower(x)
+  if (grepl("year", x)) {
+    n <- 60 * 60 * 24 * 365
+  } else if (grepl("quarter", x)) {
+    n <- 60 * 60 * 24 * 365 / 4
+  } else if (grepl("month", x)) {
+    n <- 60 * 60 * 24 * 30
+  } else if (grepl("week", x)) {
+    n <- 60 * 60 * 24 * 7
+  } else if (grepl("day", x)) {
+    n <- 60 * 60 * 24
+  } else if (grepl("hour", x)) {
+    n <- 60 * 60
+  } else if (grepl("min", x)) {
+    n <- 60
+  } else if (grepl("sec", x)) {
+    n <- 1
+  } else {
+    stop("interval must be secs, mins, hours, days, weeks, months, or years",
+      call. = FALSE)
+  }
+  x <- as.double(gsub("[^[:digit:].]", "", x))
+  if (is.na(x)) {
+    x <- 1
+  }
+  n * x
+}
+
+## Function to round time stamps down into interval-sized bins
+round_time <- function(x, sec) {
+  sec <- time_intervals(sec)
+  as.POSIXct(as.numeric(x) %/% sec * sec, origin = "1970-01-01", tz = "UTC")
+}
+
+## create and save plot
+p <- d %>%
+  mutate(time = round_time(created_at, "hours")) %>%
+  group_by(time) %>%
+  summarise(sentiment = mean(sentiment, na.rm = TRUE)) %>%
+  mutate(valence = ifelse(sentiment > 0, "Positive", "Negative")) %>%
+  ggplot(aes(x = time, y = sentiment)) +
+  geom_smooth(method = "loess", span = .6, colour = "#aa11aadd", fill = "#bbbbbb11") +
+  geom_col(aes(fill = valence), alpha = .7, width = 1250) +
+  geom_point(aes(colour = valence), alpha = .9, size = 2.5) +
+  theme_minimal(base_family = "sans") +  ## stand-in for the author's theme_mwk()
+  theme(legend.position = "none") +
+  scale_fill_manual(values = c(Positive = "#2244ee", Negative = "#dd2222")) +
+  scale_colour_manual(values = c(Positive = "#0022cc", Negative = "#bb0000")) +
+  labs(x = NULL, y = NULL,
+    title = "Sentiment of #NCA17 tweets by hour",
+    subtitle = "Mean positive/negative sentiment scores of tweets")
+
+ggsave("../nca17-sa.png", p, width = 8, height = 7, units = "in")
diff --git a/R/time_series.R b/R/time_series.R
new file mode 100644
index 0000000..b600d6f
--- /dev/null
+++ b/R/time_series.R
@@ -0,0 +1,17 @@
+## load rtweet
+library(rtweet)
+
+## load ggplot2
+library(ggplot2)
+
+## Read in the streamed data
+nca <- parse_stream(file.path("..", "data", "nca17.json"))
+
+## plot the time series of #NCA17 activity
+p <- ts_plot(nca, "hours") +
+  theme_minimal(base_family = "sans") +
+  theme(plot.title = element_text(face = "bold")) +
+  labs(x = NULL, y = NULL, title = "Time series of #NCA17 Twitter statuses",
+    subtitle = "Twitter statuses aggregated by hour",
+    caption = "\nData collected from Twitter's stream (filter) API using rtweet")
+
+ggsave("../nca17-ts.png", p, width = 8, height = 6, units = "in")
diff --git a/R/word_cloud.R b/R/word_cloud.R
new file mode 100644
index 0000000..6ff57b7
--- /dev/null
+++ b/R/word_cloud.R
@@ -0,0 +1,41 @@
+## load rtweet
+library(rtweet)
+
+## Read in the streamed data
+nca <- parse_stream(file.path("..", "data", "nca17.json"))
+
+## function for cleaning text and creating word freq table
+clean_text_table <- function(data) {
+  txt <- tolower(plain_tweets(data$text))
+  txt <- gsub("&", "", txt)
+  txt <- gsub("#nca17", "", txt, ignore.case = TRUE)
+  txt <- unlist(strsplit(txt, " "))
+  txt <- gsub("^[[:punct:]]{1,}|[[:punct:]]{1,}$", "", txt)
+  txt <- trimws(txt)
+  txt <- txt[txt != ""]
+  swds <- stopwordslangs$word[stopwordslangs$lang == "en" & stopwordslangs$p > .99]
+  txt <- txt[!txt %in% swds]
+  sort(table(txt), decreasing = TRUE)
+}
+
+## create frequency table of popular words
+wds <- clean_text_table(nca)
+
+## calc min freq (75th percentile) for word cloud
+minfreq <- quantile(as.double(wds), .75)
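+
+## Hedged sketch (not part of the original script): clean_text_table() on a
+## one-row toy data frame; wrapped in if (FALSE) so sourcing skips it.
+if (FALSE) {
+  toy <- data.frame(text = "Loving the #NCA17 panels & posters today!",
+    stringsAsFactors = FALSE)
+  clean_text_table(toy)  ## lowercased words; hashtag, punctuation, stop words removed
+}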
+
+## save word cloud
+png("../nca17-wc.png", height = 8, width = 8, units = "in", res = 300)
+par(bg = "black")
+wordcloud::wordcloud(
+  names(wds),
+  as.integer(wds),
+  min.freq = minfreq,
+  random.color = FALSE,
+  random.order = FALSE,
+  colors = scales::hue_pal()(6)  ## stand-in for the author's gg_cols()
+)
+dev.off()
diff --git a/README.md b/README.md
index b78ebd9..62e7dfc 100644
--- a/README.md
+++ b/README.md
@@ -2,38 +2,44 @@
 Collecting data on Twitter statuses containing the #NCA17 hashtag.
 
 ## Importing the data
-To import the Twitter data, see the [make.R](make.R) script file.
+To import the Twitter data, see the [R/read_data.R](R/read_data.R) script file.
 
 ## NCA Twitter tracking
 Here's the live output generated by the [live.R](live.R) script file. It updates every hour.
 
 ### Status frequency
-The number of #nca17 tweets aggregated in 1-hour intervals.
+The number of #nca17 tweets aggregated in 1-hour intervals. View the
+code [here](R/time_series.R).
 
 ![](nca17-ts.png)
 
 ### Top tweeters
-Accounts that have posted the most statuses.
+Accounts that have posted the most statuses. View the code
+[here](R/freq_tables.R).
 
 ![](nca17-usrs.png)
 
 ### Top mentions
-Accounts most frequently mentioned.
+Accounts most frequently mentioned. View the code
+[here](R/freq_tables.R).
 
 ![](nca17-ats.png)
 
 ### Sentiment analysis
-Sentiment (positive/negative) of statuses over time.
+Sentiment (positive/negative) of statuses over time. View the code
+[here](R/sentiment_analysis.R).
 
 ![](nca17-sa.png)
 
 ### Network analysis
-Semantic (quotes, retweets, and mentions) network connections.
+Semantic (quotes, retweets, and mentions) network connections. View
+the code [here](R/network_analysis.R).
 
 ![](nca17-network.png)
 
 ### Word cloud
-Most popular words appearing in statuses (stop words excluded)
+Most popular words appearing in statuses (stop words excluded). View
+the code [here](R/word_cloud.R).
 
 ![](nca17-wc.png)
diff --git a/nca17-network.png b/nca17-network.png
index 944956f..1a843f5 100644
Binary files a/nca17-network.png and b/nca17-network.png differ
diff --git a/nca17-sa.png b/nca17-sa.png
index ec6f1ce..05887be 100644
Binary files a/nca17-sa.png and b/nca17-sa.png differ