-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
162 lines (143 loc) · 5.66 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
library(dplyr)
# Directory (relative path) that holds the extracted dataset files
DATA_DIR <- "data"

# Address of the zipped UCI HAR Dataset
DATASET_URL <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"

# Paths of the dataset files used by the analysis. The vector is named by bare
# file name, so a path can be looked up as DATASET_FILES["features.txt"].
DATASET_FILES <- local({
  file_names <- c(
    "activity_labels.txt",
    "features.txt",
    "subject_test.txt",
    "subject_train.txt",
    "X_test.txt",
    "X_train.txt",
    "y_test.txt",
    "y_train.txt"
  )
  setNames(file.path(DATA_DIR, file_names), file_names)
})

# Path of the file the summarised (tidy) data set is written to
DATASET_SUMMARY_FILE <- file.path(DATA_DIR, "summary.txt")
# dataset_exist reports whether the UCI HAR Dataset files are already present
# in the data directory.
#
# Arguments:
#   files - character vector of dataset file paths; only their base names are
#           compared against the directory listing
#   dir   - directory expected to contain the files
#
# Returns a single TRUE/FALSE: TRUE only when `dir` exists and its listing
# contains every base name from `files`.
dataset_exist <- function(files = DATASET_FILES, dir = DATA_DIR) {
  # `&&` keeps the answer scalar and skips the listing when `dir` is missing.
  # list.files() is spelled out because the `dir` parameter shadows the
  # base function of the same name.
  file.exists(dir) && all(basename(files) %in% list.files(dir))
}
# download_dataset downloads, unzips & saves the UCI HAR Dataset files unless
# they are already present.
#
# Arguments:
#   dataset_url - URL of the zipped dataset
#   dest_dir    - directory the files are extracted into (created if missing)
#
# The archive's internal folder structure is discarded (junkpaths = TRUE), so
# every file ends up directly inside `dest_dir`.
# Called for its side effects; returns NULL invisibly.
download_dataset <- function(dataset_url = DATASET_URL, dest_dir = DATA_DIR) {
  # Look in the directory the caller asked for, not only the default one
  if (dataset_exist(dir = dest_dir)) {
    message("UCI HAR Dataset found in data directory")
    return(invisible(NULL))
  }

  # Temporary file for the zip archive; removed even when a later step fails
  temp <- tempfile()
  on.exit(unlink(temp), add = TRUE)

  # Create the destination directory if it doesn't exist
  if (!file.exists(dest_dir)) {
    dir.create(dest_dir, recursive = TRUE)
  }

  message("Downloading UCI HAR Dataset")
  status <- download.file(dataset_url, destfile = temp, mode = "wb")
  # download.file() signals some failures only through a non-zero status
  if (!identical(as.integer(status), 0L)) {
    stop("Downloading ", dataset_url, " failed with status ", status,
         call. = FALSE)
  }

  # Unzip the dataset into the requested directory
  unzip(temp, overwrite = TRUE, junkpaths = TRUE, exdir = dest_dir)
  invisible(NULL)
}
# combine_data_for_means_and_std function does the following:
# - loads the training and test sets
# - extracts only the measurements on the mean and standard deviation for each
#   measurement
# - appropriately labels the data set with descriptive variable names
# - merges the training and the test sets to create one data set
# - uses descriptive activity names to name the activities in the data set
#
# Arguments:
#   dataset - character vector of file paths named by bare file name; it must
#             contain features.txt, activity_labels.txt and the X_, y_ and
#             subject_ files for both "test" and "train"
#
# Returns a data frame with the test rows followed by the train rows and the
# columns `subject`, `activity` (factor whose levels are the activity names in
# label-file order) and one numeric column per selected measurement.
combine_data_for_means_and_std <- function(dataset = DATASET_FILES) {
  message("Combining data")

  # Read variable labels: one row per column of the X_* files
  features <- read.table(
    dataset[["features.txt"]],
    col.names = c("id", "name"),
    stringsAsFactors = FALSE
  )

  # Format the variable names by removing the non alphabetic or numeric
  # characters, then start Mean and Std with uppercase
  normalized_feature_names <- gsub("[^A-Za-z0-9]", "", features$name)
  normalized_feature_names <- sub("mean", "Mean", normalized_feature_names)
  normalized_feature_names <- sub("std", "Std", normalized_feature_names)

  # Only names containing a literal "mean()" or "std()" are kept. Kept columns
  # are read as numeric; the "NULL" class makes read.table skip the others.
  selected_features_logical <- grepl("(mean|std)\\(\\)", features$name)
  selected_features_classes <- ifelse(
    selected_features_logical, "numeric", "NULL"
  )

  # Reads the subject, activity and measurement files of one split
  # ("test" or "train") and binds them column-wise.
  read_split <- function(split) {
    subject <- read.table(
      dataset[[paste0("subject_", split, ".txt")]],
      col.names = "subject"
    )
    # Activity ids stay plain text here; they are mapped to names by id below
    activity <- read.table(
      dataset[[paste0("y_", split, ".txt")]],
      colClasses = "character",
      col.names = "activity"
    )
    measurements <- read.table(
      dataset[[paste0("X_", split, ".txt")]],
      colClasses = selected_features_classes,
      col.names = normalized_feature_names
    )
    cbind(subject, activity, measurements)
  }

  # Combine the test and train data (test rows first)
  data_combined <- rbind(read_split("test"), read_split("train"))

  # Read activity labels
  activity_labels <- read.table(
    dataset[["activity_labels.txt"]],
    col.names = c("id", "name"),
    stringsAsFactors = FALSE
  )

  # Map each activity id to its descriptive name explicitly by id. Relying on
  # the sort order of the ids as strings would mislabel ids such as "10",
  # which sorts before "2".
  activity_ids <- as.character(data_combined$activity)
  data_combined$activity <- factor(
    activity_ids,
    levels = as.character(activity_labels$id),
    labels = as.character(activity_labels$name)
  )

  # An id without a label would silently become NA; fail loudly instead
  unknown <- is.na(data_combined$activity) & !is.na(activity_ids)
  if (any(unknown)) {
    stop(
      "Activity id(s) missing from activity_labels.txt: ",
      paste(unique(activity_ids[unknown]), collapse = ", "),
      call. = FALSE
    )
  }

  data_combined
}
# summarise_means groups a data frame by the variables passed through `...`
# and replaces every remaining column with its mean within each group.
#
# Arguments:
#   data - data frame to summarise
#   ...  - grouping variables, forwarded unchanged to dplyr::group_by()
#
# Returns whatever dplyr::summarise_all() produces for the grouped data.
summarise_means <- function(data, ...) {
  message("Creatting data summary")
  grouped <- group_by(data, ...)
  summarise_all(grouped, mean)
}
# write_data_file saves the given data to a text file.
#
# Arguments:
#   data - object handed to write.table() (e.g. a data frame)
#   dest - path of the file to create
#
# The file is written with write.table() defaults except that row names are
# left out.
write_data_file <- function(data, dest = DATASET_SUMMARY_FILE) {
  message("Writting data into ", dest, " file")
  write.table(
    x = data,
    file = dest,
    row.names = FALSE
  )
}
# Script entry point ----
# 1. make sure the dataset files are available locally
download_dataset()

# 2. build the combined test + train data set
data_combined <- combine_data_for_means_and_std()

# 3. average every measurement for each subject and activity
data_combined_means_summary <- data_combined %>%
  summarise_means(subject, activity)

# 4. save the summary to the default summary file
write_data_file(data_combined_means_summary)