updated with all code
mkearney committed Nov 21, 2017
1 parent d166301 commit d79c0d5
Showing 9 changed files with 359 additions and 7 deletions.
52 changes: 52 additions & 0 deletions R/freq_tables.R
@@ -0,0 +1,52 @@
## load rtweet
library(rtweet)

## load ggplot2
library(ggplot2)

## Read in the streamed data
nca <- parse_stream("data/stream-1.json")

## function to create freq table
tab_sort <- function (x, n = 10, mentions = FALSE) {
sumrow <- data.frame(
"screen_name" = paste(length(unique(x)), "users"),
"n_tweets" = length(x),
"prop_tweets" = 1.000,
stringsAsFactors = FALSE
)
x <- sort(table(x), decreasing = TRUE)
x <- data.frame(
"screen_name" = names(x),
"n_tweets" = as.integer(x),
stringsAsFactors = FALSE
)
x$prop_tweets <- x$n_tweets / sum(x$n_tweets, na.rm = TRUE)
x$prop_tweets <- round(x$prop_tweets, 3)
x <- head(x, n)
x <- rbind(x, sumrow)
row.names(x) <- c(seq_len(nrow(x) - 1L), "total")
if (mentions) {
names(x)[2:3] <- c("n_mentions", "prop_mentions")
}
x
}
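## e.g., tab_sort(nca$screen_name, n = 5) would return a six-row data frame:
## the five most frequent tweeters (screen_name, n_tweets, prop_tweets) plus
## a "total" summary row covering all users (prop_tweets = 1.000)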

## most frequent tweeters table
usrs <- tab_sort(nca$screen_name)

## save most freq tweeters table
png("../nca17-usrs.png", height = 3.1, width = 4.25, "in", res = 300)
par(bg = "white")
gridExtra::grid.table(usrs, theme = gridExtra::ttheme_default(base_size = 9))
dev.off()

## most frequent mentions table
naomit <- function(x) x[!is.na(x)]
usrs <- tab_sort(naomit(unlist(nca$mentions_screen_name)), mentions = TRUE)

## save most freq mentions table
png("../nca17-ats.png", height = 3.1, width = 4.25, "in", res = 300)
par(bg = "white")
gridExtra::grid.table(usrs, theme = gridExtra::ttheme_default(base_size = 9))
dev.off()
145 changes: 145 additions & 0 deletions R/network_analysis.R
@@ -0,0 +1,145 @@
## load rtweet
library(rtweet)

## load igraph
library(igraph)

## load dplyr (provides the %>% pipe and data-wrangling verbs used below)
library(dplyr)

## Read in the streamed data
d <- parse_stream("data/stream-1.json")

## function to filter out missing and non-unique IDs
uq_naomit <- function(x) unique(x[!is.na(x)])

## function to create connections data frames
connections_df <- function(user, var, interaction = NULL) {
connections_df_ <- function(user, var) {
data.frame(
screen_name = user,
connection = unlist(var, use.names = FALSE),
row.names = NULL,
check.rows = FALSE,
check.names = FALSE,
stringsAsFactors = FALSE
)
}
d <- Map("connections_df_", user, var)
d <- do.call("rbind", d)
d <- d[!is.na(d$connection), ]
if (!is.null(interaction)) {
d$interaction <- interaction
}
tibble::as_tibble(d, validate = FALSE)
}
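## e.g., connections_df(d$screen_name, d$mentions_screen_name, "mention") returns
## a tibble with one row per tweeter/mentioned-user pair (screen_name, connection,
## interaction); rows with missing connections are dropped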

##----------------------------------------------------------------------------##
## retweet users ##
##----------------------------------------------------------------------------##

## lookup retweets
rts <- lookup_tweets(uq_naomit(d$retweet_status_id))

## select and rename columns
rts <- dplyr::select(
rts, retweet_status_id = status_id, retweet_screen_name = screen_name
)

## left join with data
d <- dplyr::left_join(d, rts, by = "retweet_status_id")


##----------------------------------------------------------------------------##
## quote users ##
##----------------------------------------------------------------------------##

## lookup quotes
qts <- lookup_tweets(uq_naomit(d$quoted_status_id))

## select and rename columns
qts <- dplyr::select(
qts, quoted_status_id = status_id, quoted_screen_name = screen_name
)

## left join with data
d <- dplyr::left_join(d, qts, by = "quoted_status_id")


##----------------------------------------------------------------------------##
## semantic connections data ##
##----------------------------------------------------------------------------##

## mentions data
md <- connections_df(d$screen_name, d$mentions_screen_name, "mention")

## replies data
td <- connections_df(d$screen_name, d$reply_to_screen_name, "reply")

## retweets data
rd <- connections_df(d$screen_name, d$retweet_screen_name, "retweet")

## quotes data
qd <- connections_df(d$screen_name, d$quoted_screen_name, "quote")

## combine connections data
snd <- do.call("rbind", list(md, td, rd, qd))

## rename
names(snd) <- c("from", "to", "interaction")

## list of all users
all_users <- c(snd$from, snd$to)

## list of user screen names with at least 5 connections
kp_users <- table(all_users)
kp_users <- names(kp_users[kp_users > 4L])

## lookup users data
nodes <- lookup_users(kp_users)

## keep only connections among kp_users and count interactions per pair
links <- snd %>%
dplyr::filter(from %in% nodes$screen_name & to %in% nodes$screen_name & from != to) %>%
dplyr::group_by(from, to) %>%
dplyr::summarise(n = n()) %>%
dplyr::ungroup()
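## links now holds one row per directed from -> to pair that survives the
## filters, with n counting the interactions between the two users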

## filter only nodes found in links
nodes <- dplyr::filter(nodes, screen_name %in% c(links$to, links$from))

## size vector
size1 <- links %>%
dplyr::group_by(from) %>%
dplyr::summarise(n = sum(n)) %>%
dplyr::select(screen_name = from, n)

size2 <- links %>%
dplyr::group_by(to) %>%
dplyr::summarise(n = sum(n)) %>%
dplyr::select(screen_name = to, n)

sizes <- rbind(size1, size2) %>%
dplyr::group_by(screen_name) %>%
dplyr::summarise(n = sum(n))

## network graph
net <- graph_from_data_frame(
d = links,
vertices = nodes[, c("screen_name", "statuses_count")],
directed = TRUE
)
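## net is a directed igraph object: each edge runs from the tweeting user to
## the user they mentioned, replied to, retweeted, or quoted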

## save plot
png("../nca17-network.png", 20, 18, "in", res = 300)
par(mar = c(0, 0, 0, 0), bg = "#000044")
plot(net, edge.size = .25,
margin = c(-.05, -.05, -.05, -.05),
edge.arrow.size = 0,
edge.arrow.width = 0,
vertex.color = "#ff00ff55",
vertex.frame.color = "transparent",
vertex.label.color = "greenyellow",
vertex.label.cex = .35,
vertex.label.family = "sans",
vertex.size = sqrt(sizes$n[match(nodes$screen_name, sizes$screen_name)]) / 1.8,  ## align sizes with vertex order
edge.color = "#ff00ff55",
edge.width = .25)
dev.off()
21 changes: 21 additions & 0 deletions R/read_data.R
@@ -0,0 +1,21 @@
## NCA17 tweets

## install and load rtweet
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")
devtools::install_github("mkearney/rtweet")
library(rtweet)

## create data folder if it doesn't already exist
if (!dir.exists(file.path("..", "data"))) dir.create(file.path("..", "data"))

## download stream data, save it to data folder
download.file(
"https://www.dropbox.com/s/t0sefc0lzqbwd32/stream-1.json?dl=1",
file.path("..", "data", "nca17.json")
)

## read in stream data, converting it to data frame
nca <- parse_stream(file.path("..", "data", "nca17.json"))

## preview data (should be N = 3332)
nca
70 changes: 70 additions & 0 deletions R/sentiment_analysis.R
@@ -0,0 +1,70 @@
## load rtweet
library(rtweet)

## load tidyverse
library(tidyverse)

## Read in the streamed data
d <- parse_stream("data/stream-1.json")

## Estimate pos/neg sentiment for each tweet
d$sentiment <- syuzhet::get_sentiment(d$text, "syuzhet")
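## get_sentiment() returns one signed numeric score per tweet: values above
## zero lean positive, values below zero lean negative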

## Function to parse time intervals
time_intervals <- function(x) {
stopifnot(is.atomic(x) && length(x) == 1L)
if (is.numeric(x)) {
return(x)
}
x <- tolower(x)
if (grepl("year", x)) {
n <- 60 * 60 * 24 * 365
} else if (grepl("quarter", x)) {
n <- 60 * 60 * 24 * (365 / 4)
} else if (grepl("month", x)) {
n <- 60 * 60 * 24 * 30
} else if (grepl("week", x)) {
n <- 60 * 60 * 24 * 7
} else if (grepl("day", x)) {
n <- 60 * 60 * 24
} else if (grepl("hour", x)) {
n <- 60 * 60
} else if (grepl("min", x)) {
n <- 60
} else if (grepl("sec", x)) {
n <- 1
} else {
stop("interval must be secs, mins, hours, days, weeks, months, or years",
call. = FALSE)
}
x <- as.double(gsub("[^[:digit:]|\\.]", "", x))
if (any(is.na(x), identical(x, ""))) {
x <- 1
}
n * x
}
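## e.g., time_intervals("2 hours") returns 2 * 60 * 60 = 7200 (seconds), while
## a bare numeric input such as time_intervals(300) is returned unchanged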

## Function to round timestamps down to the given time interval
round_time <- function(x, sec) {
sec <- time_intervals(sec)
as.POSIXct(hms::hms(as.numeric(x) %/% sec * sec))
}
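## e.g., round_time(d$created_at, "hours") floors each timestamp to the top of
## its hour (seconds-since-epoch %/% 3600 * 3600)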

## create and save plot
d %>%
mutate(time = round_time(created_at, "hours")) %>%
group_by(time) %>%
summarise(sentiment = mean(sentiment, na.rm = TRUE)) %>%
mutate(valence = ifelse(sentiment > 0L, "Positive", "Negative")) %>%
ggplot(aes(x = time, y = sentiment)) +
geom_smooth(method = "loess", span = .6, colour = "#aa11aadd", fill = "#bbbbbb11") +
geom_bar(aes(fill = valence), alpha = .7, stat = "identity", width = 1250) +
geom_point(aes(colour = valence), alpha = .9, size = 2.5) +
## theme_mwk() is the author's personal ggplot2 theme; theme_minimal() is a stock substitute
theme_mwk() +
theme(legend.position = "none") +
scale_fill_manual(values = c(Positive = "#2244ee", Negative = "#dd2222")) +
scale_colour_manual(values = c(Positive = "#0022cc", Negative = "#bb0000")) +
labs(x = NULL, y = NULL,
title = "Sentiment of #NCA17 tweets by hour",
subtitle = "Mean positive/negative sentiment scores of tweets") +
ggsave("../nca17-sa.png", width = 8, height = 7, units = "in")
17 changes: 17 additions & 0 deletions R/time_series.R
@@ -0,0 +1,17 @@
## load rtweet
library(rtweet)

## load ggplot2
library(ggplot2)

## Read in the streamed data
nca <- parse_stream("data/stream-1.json")

## plot the time series of #NCA17 activity
ts_plot(nca, "hours") +
theme_minimal(base_family = "sans") +
theme(plot.title = element_text(face = "bold")) +
labs(x = NULL, y = NULL, title = "Time series of #NCA17 Twitter statuses",
subtitle = "Twitter statuses aggregated by hour",
caption = "\nData collected from Twitter's stream (filter) API using rtweet") +
ggsave("../nca17-ts.png", width = 8, height = 6, units = "in")
41 changes: 41 additions & 0 deletions R/word_cloud.R
@@ -0,0 +1,41 @@
## load rtweet
library(rtweet)

## load ggplot2
library(ggplot2)

## Read in the streamed data
nca <- parse_stream("data/stream-1.json")

## function for cleaning text and creating word freq table
clean_text_table <- function(data) {
txt <- tolower(plain_tweets(data$text))
txt <- gsub("&amp;", "", txt)
txt <- gsub("#nca17", "", txt, ignore.case = TRUE)
txt <- unlist(strsplit(txt, " "))
txt <- gsub("^[[:punct:]]{1,}|[[:punct:]]{1,}$", "", txt)
txt <- trimws(txt)
txt <- txt[txt != ""]
swds <- stopwordslangs$word[stopwordslangs$lang == "en" & stopwordslangs$p > .99]
txt <- txt[!txt %in% swds]
sort(table(txt), decreasing = TRUE)
}
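## e.g., clean_text_table(nca) lowercases the tweet text, strips "&amp;", the
## #nca17 hashtag, surrounding punctuation, and common English stop words, then
## returns a word-frequency table sorted from most to least frequent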

## create frequency table of popular words
wds <- clean_text_table(nca)

## calc min freq for word cloud
minfreq <- quantile(as.double(wds), .75)
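## minfreq is the 75th percentile of word counts, so only roughly the top
## quarter of distinct words make it into the cloud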

## save word cloud
png("../nca17-wc.png", height = 8, width = 8, "in", res = 300)
par(bg = "black")
wordcloud::wordcloud(
names(wds),
as.integer(wds),
min.freq = minfreq,
random.color = FALSE,
random.order = FALSE,
## gg_cols() is presumably the author's helper for ggplot2-style hues;
## scales::hue_pal()(6) would be a stock alternative
colors = gg_cols(6)
)
dev.off()
