updated with all code
mkearney committed Nov 21, 2017
1 parent d166301 commit d79c0d5
Showing 9 changed files with 359 additions and 7 deletions.
52 changes: 52 additions & 0 deletions R/freq_tables.R
@@ -0,0 +1,52 @@
## load rtweet
library(rtweet)

## load ggplot2
library(ggplot2)

## Read in the streamed data
nca <- parse_stream("data/stream-1.json")

## function to create freq table
tab_sort <- function (x, n = 10, mentions = FALSE) {
sumrow <- data.frame(
"screen_name" = paste(length(unique(x)), "users"),
"n_tweets" = length(x),
"prop_tweets" = 1.000,
stringsAsFactors = FALSE
)
x <- sort(table(x), decreasing = TRUE)
x <- data.frame(
"screen_name" = names(x),
"n_tweets" = as.integer(x),
stringsAsFactors = FALSE
)
x$prop_tweets <- x$n_tweets / sum(x$n_tweets, na.rm = TRUE)
x$prop_tweets <- round(x$prop_tweets, 3)
x <- head(x, n)
x <- rbind(x, sumrow)
row.names(x) <- c(seq_len(nrow(x) - 1L), "total")
if (mentions) {
names(x)[2:3] <- c("n_mentions", "prop_mentions")
}
x
}
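## e.g., tab_sort(nca$screen_name, n = 5) would return a six-row data frame:
## the five most frequent tweeters (screen_name, n_tweets, prop_tweets) plus
## a "total" summary row covering all users (prop_tweets = 1.000)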

## most frequent tweeters table
usrs <- tab_sort(nca$screen_name)

## save most freq tweeters table
png("../nca17-usrs.png", height = 3.1, width = 4.25, "in", res = 300)
par(bg = "white")
gridExtra::grid.table(usrs, theme = gridExtra::ttheme_default(base_size = 9))
dev.off()

## most frequent mentions table
naomit <- function(x) x[!is.na(x)]
usrs <- tab_sort(naomit(unlist(nca$mentions_screen_name)), mentions = TRUE)

## save most freq mentions table
png("../nca17-ats.png", height = 3.1, width = 4.25, "in", res = 300)
par(bg = "white")
gridExtra::grid.table(usrs, theme = gridExtra::ttheme_default(base_size = 9))
dev.off()
145 changes: 145 additions & 0 deletions R/network_analysis.R
@@ -0,0 +1,145 @@
## load rtweet
library(rtweet)

## load igraph
library(igraph)

## load dplyr (provides the %>% pipe and data-wrangling verbs used below)
library(dplyr)

## Read in the streamed data
d <- parse_stream("data/stream-1.json")

## function to filter out missing and non-unique IDs
uq_naomit <- function(x) unique(x[!is.na(x)])

## function to create connections data frames
connections_df <- function(user, var, interaction = NULL) {
connections_df_ <- function(user, var) {
data.frame(
screen_name = user,
connection = unlist(var, use.names = FALSE),
row.names = NULL,
check.rows = FALSE,
check.names = FALSE,
stringsAsFactors = FALSE
)
}
d <- Map("connections_df_", user, var)
d <- do.call("rbind", d)
d <- d[!is.na(d$connection), ]
if (!is.null(interaction)) {
d$interaction <- interaction
}
tibble::as_tibble(d, validate = FALSE)
}
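## e.g., connections_df(d$screen_name, d$mentions_screen_name, "mention") returns
## a tibble with one row per tweeter/mentioned-user pair (screen_name, connection,
## interaction); rows with missing connections are dropped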

##----------------------------------------------------------------------------##
## retweet users ##
##----------------------------------------------------------------------------##

## lookup retweets
rts <- lookup_tweets(uq_naomit(d$retweet_status_id))

## select and rename columns
rts <- dplyr::select(
rts, retweet_status_id = status_id, retweet_screen_name = screen_name
)

## left join with data
d <- dplyr::left_join(d, rts, by = "retweet_status_id")


##----------------------------------------------------------------------------##
## quote users ##
##----------------------------------------------------------------------------##

## lookup quotes
qts <- lookup_tweets(uq_naomit(d$quoted_status_id))

## select and rename columns
qts <- dplyr::select(
qts, quoted_status_id = status_id, quoted_screen_name = screen_name
)

## left join with data
d <- dplyr::left_join(d, qts, by = "quoted_status_id")


##----------------------------------------------------------------------------##
## semantic connections data ##
##----------------------------------------------------------------------------##

## mentions data
md <- connections_df(d$screen_name, d$mentions_screen_name, "mention")

## replies data
td <- connections_df(d$screen_name, d$reply_to_screen_name, "reply")

## retweets data
rd <- connections_df(d$screen_name, d$retweet_screen_name, "retweet")

## quotes data
qd <- connections_df(d$screen_name, d$quoted_screen_name, "quote")

## combine connections data
snd <- do.call("rbind", list(md, td, rd, qd))

## rename
names(snd) <- c("from", "to", "interaction")

## list of all users
all_users <- c(snd$from, snd$to)

## list of user screen names with at least 5 connections
kp_users <- table(all_users)
kp_users <- names(kp_users[kp_users > 4L])

## lookup users data
nodes <- lookup_users(kp_users)

## keep only connections among kp_users and count interactions per pair
links <- snd %>%
dplyr::filter(from %in% nodes$screen_name & to %in% nodes$screen_name & from != to) %>%
dplyr::group_by(from, to) %>%
dplyr::summarise(n = n()) %>%
dplyr::ungroup()
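## links now holds one row per directed from -> to pair that survives the
## filters, with n counting the interactions between the two users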

## filter only nodes found in links
nodes <- dplyr::filter(nodes, screen_name %in% c(links$to, links$from))

## size vector
size1 <- links %>%
dplyr::group_by(from) %>%
dplyr::summarise(n = sum(n)) %>%
dplyr::select(screen_name = from, n)

size2 <- links %>%
dplyr::group_by(to) %>%
dplyr::summarise(n = sum(n)) %>%
dplyr::select(screen_name = to, n)

sizes <- rbind(size1, size2) %>%
dplyr::group_by(screen_name) %>%
dplyr::summarise(n = sum(n))

## network graph
net <- graph_from_data_frame(
d = links,
vertices = nodes[, c("screen_name", "statuses_count")],
directed = TRUE
)
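## net is a directed igraph object: each edge runs from the tweeting user to
## the user they mentioned, replied to, retweeted, or quoted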

## save plot
png("../nca17-network.png", 20, 18, "in", res = 300)
par(mar = c(0, 0, 0, 0), bg = "#000044")
plot(net, edge.size = .25,
margin = c(-.05, -.05, -.05, -.05),
edge.arrow.size = 0,
edge.arrow.width = 0,
vertex.color = "#ff00ff55",
vertex.frame.color = "transparent",
vertex.label.color = "greenyellow",
vertex.label.cex = .35,
vertex.label.family = "sans",
vertex.size = sqrt(sizes$n[match(nodes$screen_name, sizes$screen_name)]) / 1.8,  ## align sizes with vertex order
edge.color = "#ff00ff55",
edge.width = .25)
dev.off()
21 changes: 21 additions & 0 deletions R/read_data.R
@@ -0,0 +1,21 @@
## NCA17 tweets

## install and load rtweet
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")
devtools::install_github("mkearney/rtweet")
library(rtweet)

## create data folder if it doesn't already exist
if (!dir.exists(file.path("..", "data"))) dir.create(file.path("..", "data"))

## download stream data, save it to data folder
download.file(
"https://www.dropbox.com/s/t0sefc0lzqbwd32/stream-1.json?dl=1",
file.path("..", "data", "nca17.json")
)

## read in stream data, converting it to data frame
nca <- parse_stream(file.path("..", "data", "nca17.json"))

## preview data (should be N = 3332)
nca
70 changes: 70 additions & 0 deletions R/sentiment_analysis.R
@@ -0,0 +1,70 @@
## load rtweet
library(rtweet)

## load tidyverse
library(tidyverse)

## Read in the streamed data
d <- parse_stream("data/stream-1.json")

## Estimate pos/neg sentiment for each tweet
d$sentiment <- syuzhet::get_sentiment(d$text, "syuzhet")
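## get_sentiment() returns one signed numeric score per tweet: values above
## zero lean positive, values below zero lean negative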

## Function to parse time intervals
time_intervals <- function(x) {
stopifnot(is.atomic(x) && length(x) == 1L)
if (is.numeric(x)) {
return(x)
}
x <- tolower(x)
if (grepl("year", x)) {
n <- 60 * 60 * 24 * 365
} else if (grepl("quarter", x)) {
n <- 60 * 60 * 24 * (365 / 4)
} else if (grepl("month", x)) {
n <- 60 * 60 * 24 * 30
} else if (grepl("week", x)) {
n <- 60 * 60 * 24 * 7
} else if (grepl("day", x)) {
n <- 60 * 60 * 24
} else if (grepl("hour", x)) {
n <- 60 * 60
} else if (grepl("min", x)) {
n <- 60
} else if (grepl("sec", x)) {
n <- 1
} else {
stop("interval must be secs, mins, hours, days, weeks, months, or years",
call. = FALSE)
}
x <- as.double(gsub("[^[:digit:]|\\.]", "", x))
if (any(is.na(x), identical(x, ""))) {
x <- 1
}
n * x
}
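## e.g., time_intervals("2 hours") returns 2 * 60 * 60 = 7200 (seconds), while
## a bare numeric input such as time_intervals(300) is returned unchanged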

## Function to round timestamps down to the given time interval
round_time <- function(x, sec) {
sec <- time_intervals(sec)
as.POSIXct(hms::hms(as.numeric(x) %/% sec * sec))
}
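## e.g., round_time(d$created_at, "hours") floors each timestamp to the top of
## its hour (seconds-since-epoch %/% 3600 * 3600)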

## create and save plot
d %>%
mutate(time = round_time(created_at, "hours")) %>%
group_by(time) %>%
summarise(sentiment = mean(sentiment, na.rm = TRUE)) %>%
mutate(valence = ifelse(sentiment > 0L, "Positive", "Negative")) %>%
ggplot(aes(x = time, y = sentiment)) +
geom_smooth(method = "loess", span = .6, colour = "#aa11aadd", fill = "#bbbbbb11") +
geom_bar(aes(fill = valence), alpha = .7, stat = "identity", width = 1250) +
geom_point(aes(colour = valence), alpha = .9, size = 2.5) +
## theme_mwk() is the author's personal ggplot2 theme; theme_minimal() is a stock substitute
theme_mwk() +
theme(legend.position = "none") +
scale_fill_manual(values = c(Positive = "#2244ee", Negative = "#dd2222")) +
scale_colour_manual(values = c(Positive = "#0022cc", Negative = "#bb0000")) +
labs(x = NULL, y = NULL,
title = "Sentiment of #NCA17 tweets by hour",
subtitle = "Mean positive/negative sentiment scores of tweets") +
ggsave("../nca17-sa.png", width = 8, height = 7, units = "in")
17 changes: 17 additions & 0 deletions R/time_series.R
@@ -0,0 +1,17 @@
## load rtweet
library(rtweet)

## load ggplot2
library(ggplot2)

## Read in the streamed data
nca <- parse_stream("data/stream-1.json")

## plot the time series of #NCA17 activity
ts_plot(nca, "hours") +
theme_minimal(base_family = "sans") +
theme(plot.title = element_text(face = "bold")) +
labs(x = NULL, y = NULL, title = "Time series of #NCA17 Twitter statuses",
subtitle = "Twitter statuses aggregated by hour",
caption = "\nData collected from Twitter's stream (filter) API using rtweet") +
ggsave("../nca17-ts.png", width = 8, height = 6, units = "in")
41 changes: 41 additions & 0 deletions R/word_cloud.R
@@ -0,0 +1,41 @@
## load rtweet
library(rtweet)

## load ggplot2
library(ggplot2)

## Read in the streamed data
nca <- parse_stream("data/stream-1.json")

## function for cleaning text and creating word freq table
clean_text_table <- function(data) {
txt <- tolower(plain_tweets(data$text))
txt <- gsub("&amp;", "", txt)
txt <- gsub("#nca17", "", txt, ignore.case = TRUE)
txt <- unlist(strsplit(txt, " "))
txt <- gsub("^[[:punct:]]{1,}|[[:punct:]]{1,}$", "", txt)
txt <- trimws(txt)
txt <- txt[txt != ""]
swds <- stopwordslangs$word[stopwordslangs$lang == "en" & stopwordslangs$p > .99]
txt <- txt[!txt %in% swds]
sort(table(txt), decreasing = TRUE)
}
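## e.g., clean_text_table(nca) lowercases the tweet text, strips "&amp;", the
## #nca17 hashtag, surrounding punctuation, and common English stop words, then
## returns a word-frequency table sorted from most to least frequent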

## create frequency table of popular words
wds <- clean_text_table(nca)

## calc min freq for word cloud
minfreq <- quantile(as.double(wds), .75)
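## minfreq is the 75th percentile of word counts, so only roughly the top
## quarter of distinct words make it into the cloud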

## save word cloud
png("../nca17-wc.png", height = 8, width = 8, "in", res = 300)
par(bg = "black")
wordcloud::wordcloud(
names(wds),
as.integer(wds),
min.freq = minfreq,
random.color = FALSE,
random.order = FALSE,
## gg_cols() is presumably the author's helper for ggplot2-style hues;
## scales::hue_pal()(6) would be a stock alternative
colors = gg_cols(6)
)
dev.off()
