#kill scientific notation
options(scipen=999)
if (!file.exists('activity.csv')) {
unzip('activity.zip', exdir='../')
}
df <- read.csv('activity.csv')
#convert the intervals into date-time strings
#this adds today's date to every time, which is incorrect, but will be ignored
#converting into full date-time objects removes jumps on the x-axis e.g. 855 to 900
df$interval <- as.POSIXct(strptime(sprintf('%04d', df$interval), '%H%M'))
#lubridate for date-time evaluation
#loading packages near first call for to explain use
library(lubridate)
df$date <- ymd(df$date)
#chain operations with dplyr
library(dplyr)
df.daily <- df %>%
group_by(date) %>%
summarize(steps.total = sum(steps))
steps.mean <- mean(df.daily$steps.total, na.rm = T)
steps.median <- median(df.daily$steps.total, na.rm = T)
library(ggplot2)
g.daily <- ggplot(df.daily, aes(date, steps.total)) +
geom_histogram(stat = 'identity') +
xlab('Date') + ylab('Steps') +
theme(axis.text.x = element_text(angle = 300, hjust = 0)) +
#because the mean and median are so close, overlap dashed blue on top of solid red to see both
geom_hline(yintercept = steps.mean, linetype = 'solid', color = 'red') +
geom_hline(yintercept = steps.median, linetype = 'dashed', color = 'blue')
g.daily
Most devices registered between 0 and 22,000 steps per day. The mean total of steps taken per day is 10766.1886792 (red line) and the median total is 10765 (blue line).
df.intvl <- df %>%
group_by(interval) %>%
summarize(steps.mean = mean(steps, na.rm = T))
interval.max <- df.intvl[which.max(df.intvl$steps.mean),]
#Question asked for a plot, the code below works, but I like using ggplot so that is what I present.
#plot(df.intvl, type = 'l')
#scales package for better control of the x-axis labels
library(scales)
g.intvl <- ggplot(df.intvl, aes(interval, steps.mean)) +
geom_line() +
scale_x_datetime(breaks = date_breaks("2 hours"),
labels=date_format("%H:%M")) +
xlab('Time') + ylab('Steps') +
annotate('text', x = interval.max[[1]], y = interval.max[[2]] + 3,
label = paste('Maximum steps:', interval.max[[2]],'at 8:35'))
g.intvl
The interval with the maximum number of steps is 8:35 (going to work). There are also local maxima at 12:05 (lunch), 3:55 (leaving work?), and 6:50 (going out for the evening?)
df.nas <- sum(is.na(df))
There are 2304 intervals with missing values in the dataset.
#create new data frame with a NA's replaced by interval averages
df.replace <- df %>%
group_by(interval) %>%
mutate(steps = ifelse(is.na(steps),
#i've seen as.integer recommended here, but I think fractional steps are acceptable.
mean(steps, na.rm=TRUE),
steps))
If we set every missing value equal to the mean of its interval, it changes our previous average daily activity patterns.
#same code from above with new data frame
df.replace.daily <- df.replace %>%
group_by(date) %>%
summarize(steps.total = sum(steps))
steps.replace.mean <- mean(df.daily$steps.total, na.rm = T)
steps.replace.median <- median(df.daily$steps.total, na.rm = T)
g.replace.daily <- ggplot(df.replace.daily, aes(date, steps.total)) +
geom_histogram(stat = 'identity') +
xlab('Date') + ylab('Steps') +
theme(axis.text.x = element_text(angle = 300, hjust = 0)) +
geom_hline(yintercept = steps.replace.mean, linetype = 'solid', color = 'red') +
geom_hline(yintercept = steps.replace.median, linetype = 'dashed', color = 'blue')
g.replace.daily
However, this doesn't change the mean total of steps taken per day at 10766.1886792 (red line) or the median total at 10765 (blue line) since we added mean values for every interval.
To find out, first we assign whether a datapoint happened on a weekday or weekend, and then we aggregrate the means according to both interval and whether it's a weekday or weekend.
df.replace$date <- as.POSIXct(df.replace$date)
df.replace$Day <- as.factor(wday(df.replace$date, label = T))
levels(df.replace$Day) <- list('Weekend' = c('Sat', 'Sun'),
'Weekday' = c('Mon', 'Tues', 'Wed', 'Thurs', 'Fri'))
df.replace.intvl <- df.replace %>%
group_by(Day, interval) %>%
summarize(steps = mean(steps))
head(df.replace.intvl)
## Source: local data frame [6 x 3]
## Groups: Day
##
## Day interval steps
## 1 Weekend 2015-02-12 00:00:00 0.214622642
## 2 Weekend 2015-02-12 00:05:00 0.042452830
## 3 Weekend 2015-02-12 00:10:00 0.016509434
## 4 Weekend 2015-02-12 00:15:00 0.018867925
## 5 Weekend 2015-02-12 00:20:00 0.009433962
## 6 Weekend 2015-02-12 00:25:00 3.511792453
#Instructions ask for lattice plot, but I find the side by sides hard to read. Code below does work though.
#library(lattice)
#xyplot(steps ~ interval | Day, df.replace.intvl, type = 'l')
g.replace.intvl <- ggplot(df.replace.intvl, aes(x = interval, y = steps, col = Day)) +
geom_line() +
scale_x_datetime(breaks = date_breaks("2 hours"),
labels=date_format("%H:%M")) +
xlab('Time') + ylab('Steps') +
#I think this makes it harder to read, but the question really wants it.
facet_grid(. ~ Day)
g.replace.intvl
In some respects, weekend activity mirrors weekday activity. Local maxima are found around 8:30, 12:00, and 4:00. On the other hand, weekday activity skews strongly towards the morning spike, while weekend activity is distributed more evenly throughout the day.