live update number 1

mkearney · Nov 15, 2017 · 5efcc85 · 5efcc85
1 parent 6ead4b6
commit 5efcc85
Show file tree

Hide file tree

Showing 8 changed files with 497 additions and 29 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/README.md b/README.md
@@ -4,8 +4,21 @@ Collecting data on Twitter statuses containing the #NCA17 hashtag.
 ## Importing the data
 To import the Twitter data, see the [make.R](make.R) script file.
 
-## Twitter activity tracker
-Here's the live output generated by the [make.R](make.R) script
+## NCA Twitter tracking
+Here's the live output generated by the [live.R](live.R) script
 file. It updates every hour.
 
+### Status frequency
+The number of #nca17 tweets aggregated in 1-hour intervals.
+
 ![](nca17-ts.png)
+
+### Top users
+Most frequently appearing Twitter accounts.
+
+![](nca17-usrs.png)
+
+### Word cloud
+Most popular words appearing in statuses (stop words excluded)
+
+![](nca17-wc.png)
diff --git a/data/nca17.json b/data/nca17.json
diff --git a/live.R b/live.R
@@ -0,0 +1,98 @@
+## load packages
+library(rtweet)
+
+## install and load ggplot
+if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
+library(ggplot2)
+
+## install gridExtra
+if (!requireNamespace("gridExtra", quietly = TRUE)) install.packages("gridExtra")
+
+## table output function
+tab_sort <- function (x, n = 10) {
+  sumrow <- data.frame(
+    "screen_name" = c("...", paste(length(unique(x)), "total users")),
+    "n_tweets" = c(NA_integer_, length(x)),
+    "prop_tweets" = c(NA_real_, 1.000),
+    stringsAsFactors = FALSE
+  )
+  x <- sort(table(x), decreasing = TRUE)
+  x <- data.frame(
+    "screen_name" = names(x),
+    "n_tweets" = as.integer(x),
+    stringsAsFactors = FALSE
+  )
+  x$prop_tweets <- x$n_tweets / sum(x$n_tweets, na.rm = TRUE)
+  x$prop_tweets <- round(x$prop_tweets, 3)
+  x <- head(x, n)
+  x <- rbind(x, sumrow)
+  row.names(x) <- c(seq_len(nrow(x) - 2L), "...", "tot")
+  x
+}
+
+## function for cleaning text and creating word freq table
+clean_text_table <- function(data) {
+  txt <- tolower(plain_tweets(data$text))
+  txt <- gsub("&amp;", "", txt)
+  txt <- gsub("#nca17", "", txt, ignore.case = TRUE)
+  txt <- unlist(strsplit(txt, " "))
+  txt <- gsub("^[[:punct:]]{1,}|[[:punct:]]{1,}$", "", txt)
+  txt <- trimws(txt)
+  txt <- txt[txt != ""]
+  swds <- stopwordslangs$word[stopwordslangs$lang == "en" & stopwordslangs$p > .99]
+  txt <- txt[!txt %in% swds]
+  sort(table(txt), decreasing = TRUE)
+}
+
+## counter
+i <- 1L
+
+## while loop for live updating
+while (Sys.time() < as.POSIXct("2017-11-21")) {
+  ## download stream data, save it to data folder
+  download.file(
+    "https://www.dropbox.com/s/t0sefc0lzqbwd32/stream-1.json?dl=1",
+    "data/nca17.json"
+  )
+
+  ## read in stream data, converting it to data frame
+  nca <- parse_stream("data/nca17.json")
+
+  ## plot the time series of #NCA17 activity
+  ts_plot(nca, "3 mins") +
+    theme_minimal(base_family = "sans") +
+    theme(plot.title = element_text(face = "bold")) +
+    labs(x = NULL, y = NULL, title = "Time series of #NCA17 Twitter statuses",
+         subtitle = "Twitter statuses aggregated by minute",
+         caption = "\nData collected from Twitter's stream (filter) API using rtweet") +
+    ggsave("nca17-ts.png", width = 8, height = 6, units = "in")
+
+  ## most frequent tweeters table
+  usrs <- tab_sort(nca$screen_name)
+  png("nca17-usrs.png", height = 4, width = 4, "in", res = 300)
+  gridExtra::grid.table(usrs)
+  dev.off()
+
+  ## create frequency table for popular words
+  wds <- clean_text_table(nca)
+
+  png("nca17-wc.png", height = 8, width = 8, "in", res = 300)
+  par(bg = "black")
+  wordcloud::wordcloud(
+    names(wds),
+    as.integer(wds),
+    min.freq = 2,
+    random.color = FALSE,
+    random.order = FALSE,
+    colors = gg_cols(6)
+  )
+  dev.off()
+
+  ## update git repo (this is from my own utils R package)
+  tfse::add_to_git(paste("live update number", i))
+  1L
+  i <- i + 1L
+
+  ## sleep for hour
+  Sys.sleep(60L * 60L)
+}
diff --git a/make.R b/make.R
@@ -3,34 +3,16 @@
 ## install and load rtweet
 if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")
 devtools::install_github("mkearney/rtweet")
-
-## install and load ggplot
-if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
-library(ggplot2)
+library(rtweet)
 
 ## create data folder is it doesn't already exist
 if (!dir.exists("data")) dir.create("data")
 
-## while loop for live updating
-while (Sys.time() < as.POSIXct("2017-11-21")) {
-  ## download stream data, save it to data folder
-  download.file(
-    "https://www.dropbox.com/s/t0sefc0lzqbwd32/stream-1.json?dl=1",
-    "data/nca17.json"
-  )
-
-  ## read in stream data, converting it to data frame
-  nca <- parse_stream("data/nca17.json")
-
-  ## plot the time series of #NCA17 activity
-  ts_plot(nca, "mins") +
-    theme_minimal(base_family = "sans") +
-    theme(plot.title = element_text(face = "bold")) +
-    labs(x = NULL, y = NULL, title = "Time series of #NCA17 Twitter statuses",
-         subtitle = "Twitter statuses aggregated by minute",
-         caption = "\nData collected from Twitter's stream (filter) API using rtweet") +
-    ggsave("nca17-ts.png", width = 8, height = 6, units = "in")
+## download stream data, save it to data folder
+download.file(
+  "https://www.dropbox.com/s/t0sefc0lzqbwd32/stream-1.json?dl=1",
+  "data/nca17.json"
+)
 
-  ## sleep for hour
-  Sys.sleep(60L * 60L)
-}
+## read in stream data, converting it to data frame
+nca <- parse_stream("data/nca17.json")
diff --git a/nca17-ts.png b/nca17-ts.png
diff --git a/nca17-usrs.png b/nca17-usrs.png
diff --git a/nca17-wc.png b/nca17-wc.png