-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproject1.R
executable file
·368 lines (282 loc) · 12.3 KB
/
project1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
################
# (1) Read data
################
#setwd('C:/Users/Eric/Documents/CSU East Bay Coursework/Spring 2015/Machine Learning/Project 1')
# NOTE(review): hard-coded absolute path -- edit to match your own machine
data <- read.csv('C:/Users/Delle/Documents/Stat6620MachLearning/1987.csv')
#unzipped with Bzip2 for Windows
####################
# (2.1) Examine data
####################
str(data) #The output from this function answers 'Part 2.1'
# Remove extraneous columns (per-cause delay breakdowns, taxi times, etc.);
# Year is dropped because all data is from 1987
data[c('TailNum', 'AirTime', 'TaxiIn', 'TaxiOut', 'CancellationCode',
       'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
       'LateAircraftDelay', 'Year')] <- NULL
# Convert the 0/1 indicators 'Cancelled' & 'Diverted' to labelled factors
data$Cancelled <- factor(data$Cancelled, levels = c('0','1'), labels = c('no','yes'))
data$Diverted <- factor(data$Diverted, levels = c('0','1'), labels = c('no','yes'))
str(data)
str(as.factor(data$Month)) #note that there is only Q4 data for 1987
####################################
#Remove Observations with NA values
####################################
data <- na.omit(data)
str(data)
########################
#Verify removal of NA's
########################
# Check that a vector is numeric and free of "bad" values.
#
# Args:
#   x:         object to check.
#   na.valid:  if TRUE, NA values are tolerated.
#   nan.valid: if TRUE, NaN values are tolerated.
#   Inf.valid: if TRUE, infinite values are tolerated.
#
# Returns TRUE when x is a non-empty numeric vector with no disallowed
# values; otherwise prints the reason via cat() and returns FALSE.
is.valid.numeric <- function(x, na.valid = FALSE, nan.valid = FALSE, Inf.valid = FALSE) {
  if (!is.numeric(x)) {
    cat("mode is not numeric\n")
    return(FALSE)
  }
  if (length(x) == 0) {
    cat("length equals 0\n")
    return(FALSE)
  }
  if (!nan.valid && any(is.nan(x))) {
    cat("NaN value(s) detected\n")
    return(FALSE)
  }
  if (!na.valid && any(is.na(x))) {
    cat("NA value(s) detected\n")
    return(FALSE)
  }
  if (!Inf.valid && any(is.infinite(x))) {
    cat("Inf value(s) detected\n")
    return(FALSE)
  }
  TRUE
}
# Run the validity check on the numeric columns only
lapply(Filter(is.numeric, data), is.valid.numeric)
###########################################
#Remove cancelled or diverted observations
###########################################
table(data$Cancelled)
table(data$Diverted)
#note that after removing all obs with NA values there are no longer any cancelled or div observations
#therefore we remove the variables
data$Cancelled <- NULL
data$Diverted <- NULL
#############################################################
# (2.2) Calculate mean & sd for numerical variables by month
#############################################################
# Performance/idiom fix: the original loop recomputed
# data[sapply(data, is.numeric)] six times per iteration (loop-invariant
# work) and grew vectors with leading NA slots that were trimmed afterwards.
# Hoist the numeric subset and build the data frame in one vectorized pass.
num_data <- data[sapply(data, is.numeric)]
# Skip the first three numeric columns (date & time variables)
stat_cols <- seq(from = 4, to = ncol(num_data))
# Row indices for each month (1987 data only contains Oct/Nov/Dec)
oct_rows <- which(data$Month == 10)
nov_rows <- which(data$Month == 11)
dec_rows <- which(data$Month == 12)
# Apply one summary statistic to every remaining numeric column for one month
month_stat <- function(rows, stat) {
  vapply(stat_cols, function(j) stat(num_data[rows, j]), numeric(1))
}
#create dataframe (same columns as the original loop produced)
summary_stats <- data.frame(var_name = names(num_data)[stat_cols],
                            mean_oct = month_stat(oct_rows, mean),
                            sd_oct   = month_stat(oct_rows, sd),
                            mean_nov = month_stat(nov_rows, mean),
                            sd_nov   = month_stat(nov_rows, sd),
                            mean_dec = month_stat(dec_rows, mean),
                            sd_dec   = month_stat(dec_rows, sd),
                            stringsAsFactors = FALSE)
#view dataframe
summary_stats
#write dataframe to disk
write.csv(summary_stats, file = "summary_stats.csv")
####################################################################################
# (2.3) Generate tables of counts & relative freq for categorical variables by month
####################################################################################
# Performance/idiom fix: the original loop recomputed
# data[sapply(data, is.factor)] many times per iteration, used the unsafe
# 1:length(...) sequence (breaks when there are zero factor columns), and
# "zeroed out" every vector at the end of each iteration even though each is
# unconditionally reassigned at the top of the next one.
fac_data <- data[sapply(data, is.factor)]
oct_rows <- which(data$Month == 10)
nov_rows <- which(data$Month == 11)
dec_rows <- which(data$Month == 12)
for (i in seq_along(fac_data))
{
  var_name <- names(fac_data)[i]
  value <- levels(fac_data[[i]])
  # counts per level, and relative frequency within each month's rows
  count_oct <- as.numeric(table(fac_data[oct_rows, i]))
  prop_oct <- count_oct / length(oct_rows)
  count_nov <- as.numeric(table(fac_data[nov_rows, i]))
  prop_nov <- count_nov / length(nov_rows)
  count_dec <- as.numeric(table(fac_data[dec_rows, i]))
  prop_dec <- count_dec / length(dec_rows)
  #create dataframe
  factor_tables <- data.frame(value, count_oct, prop_oct, count_nov, prop_nov,
                              count_dec, prop_dec, stringsAsFactors = FALSE)
  #view dataframe
  print(factor_tables)
  #write dataframe to disk (one csv per categorical variable)
  write.csv(factor_tables, file = paste(var_name, 'csv', sep = '.'))
}
###################################
# (3.1) Create 'ArrivedLate' label
###################################
# Flights more than 15 minutes behind schedule are labelled 'delayed'
ArrivedLate <- factor(as.integer(data$ArrDelay > 15),
                      levels = c(0, 1), labels = c('on_time', 'delayed'))
table(ArrivedLate)
table(ArrivedLate) / length(ArrivedLate)
#######################################
# (3.2) Verify time delay calculations
#######################################
#Just checking first 10,000 observations (checking too many kills my computer)
# BUG FIX: zero-pad the HHMM clock times to exactly 4 characters so
# substring() can split them into hour/minute. The original loop only padded
# 3-character values, so times before 1:00 AM (e.g. 45, i.e. 00:45) were
# left unpadded and split incorrectly; sprintf("%04d", ...) handles every
# width and is vectorized, so no loop is needed.
check_rows <- 1:10000
data$ArrTime[check_rows] <- sprintf("%04d", as.integer(data$ArrTime[check_rows]))
data$CRSArrTime[check_rows] <- sprintf("%04d", as.integer(data$CRSArrTime[check_rows]))
# Split "HHMM" into hour and minute columns
ActArr.split = t(sapply(data$ArrTime[check_rows], function(x) substring(x,first=c(1,3),last=c(2,4))))
CRSArr.split = t(sapply(data$CRSArrTime[check_rows], function(x) substring(x,first=c(1,3),last=c(2,4))))
#Not bothering to actually specify year/month/day since we are just comparing scheduled and actual arrival
ActArrTime.str = sprintf("1987-01-01 %s:%s:00",ActArr.split[,1],ActArr.split[,2])
ActArrTime = strptime(ActArrTime.str,"%Y-%m-%d %H:%M:%S")
CRSArrTime.str = sprintf("1987-01-01 %s:%s:00",CRSArr.split[,1],CRSArr.split[,2])
CRSArrTime = strptime(CRSArrTime.str,"%Y-%m-%d %H:%M:%S")
delay = difftime(ActArrTime,CRSArrTime,units='mins')
table(data$ArrDelay[check_rows] == delay, useNA = 'ifany')
table(data$ArrDelay[check_rows] == delay)/10000
#Looks pretty good. I'm finding ~99% to be calculated correctly.
# NOTE(review): arrivals that cross midnight still compare against the same
# calendar day, so a small fraction of rows will not match; considering the
# actual date would close that gap.
#######################################
# (4.1) kNN classification
#######################################
# randomize order of observations
set.seed(0)
#This should make results reproducible
data_r <- data[sample(nrow(data)), ]
#remove problematic predictors (raw clock times, delay fields, identifiers,
#and date parts)
data_r[c('DepTime', 'ArrTime', 'ActualElapsedTime', 'DepDelay', 'ArrDelay',
         'CRSDepTime', 'CRSArrTime', 'FlightNum', 'Month', 'DayofMonth',
         'DayOfWeek')] <- NULL
#remove categorical predictors (kNN distance needs numeric columns)
data_r[c('UniqueCarrier', 'Origin', 'Dest')] <- NULL
# Min-max scale a numeric vector to [0, 1].
# NA values are ignored when computing the range and preserved in the output.
# Robustness fix: a constant vector previously produced NaN (0/0 from a zero
# range); it now maps to all zeros, which keeps downstream kNN usable.
normalize <- function(x) {
  rng <- range(x, na.rm = TRUE)
  if (rng[1] == rng[2]) {
    return(x - rng[1]) # zero range: every value maps to 0 (NAs kept)
  }
  (x - rng[1]) / (rng[2] - rng[1])
}
# normalize the data
data_n <- as.data.frame(lapply(data_r, normalize))
str(data_n)
#note there are really only two numeric predictors appropriate for knn analysis
#also note that CRSElapsedTime is stored as minutes in the air so the variable doesn't require any special handling
# BUG FIX: data_r was shuffled (data[sample(nrow(data)), ]) but ArrivedLate
# was computed from 'data' in its ORIGINAL order, so ArrivedLate[1:100000]
# did not label rows 1:100000 of the shuffled features. Realign the labels
# to the shuffled row order via the row names preserved by the subsetting.
labels_r <- ArrivedLate[match(rownames(data_r), rownames(data))]
# create smaller training and test data (I get a "too many ties" error running on the full data set)
data_train <- data_n[1:100000, ]
data_test <- data_n[100001:130000, ]
labels_train <- labels_r[1:100000]
labels_test <- labels_r[100001:130000]
# train a model on the data
library(class)
# classify test set using knn with k=9
test_pred <- knn(train = data_train, test = data_test, cl = labels_train, k = 9)
# evaluate model performance
library(gmodels)
CrossTable(x = labels_test, y = test_pred, prop.chisq = FALSE)
####################################################
# (5.1) Correlations between quantitative variables
####################################################
#randomize the 'original' data set
data_ran <- data[sample(nrow(data)), ]
#normalization isn't needed for this algorithm
#looking at structure of the data set
str(data_ran)
#removing problematic predictors
data_ran[c('DayofMonth', 'DepTime', 'ArrTime', 'CRSArrTime', 'FlightNum',
           'Origin', 'Dest', 'DepDelay', 'ActualElapsedTime')] <- NULL
# NOTE(review): 'ElapsedTime' does not appear to be a column of this data set
# (only ActualElapsedTime / CRSElapsedTime are visible above), so the next
# line looks like a harmless no-op -- confirm against the csv header.
data_ran$ElapsedTime <- NULL
#multicollinearity with distance
data_ran$CRSElapsedTime <- NULL
#Making a new variable for the busiest flight days:
#anything other than Tue/Wed/Thu (days 2-4) counts as 'busy'
data_ran$BusyDay <- factor(!(data_ran$DayOfWeek %in% 2:4),
                           levels = c(FALSE, TRUE),
                           labels = c('not busy', 'busy'))
#Turning Month into a factor
data_ran$Month <- factor(data_ran$Month, levels = c(10, 11, 12),
                         labels = c('Oct', 'Nov', 'Dec'))
#Removing DayOfWeek since it has been replaced by BusyDay
data_ran$DayOfWeek <- NULL
#splitting the data set into training and test sets,
#only using 130000 records so my computer doesn't break
data_train <- data_ran[1:100000, ]
data_test <- data_ran[100001:130000, ]
#Creating a scatterplot/correlation matrix
library('psych')
# NOTE(review): positional indices c(4, 2, 5) depend on the exact set/order
# of columns remaining after the removals above; selecting by name would be
# safer.
pairs.panels(data_train[, c(4, 2, 5)])
###############################################
# (5.2) Regression Tree for numeric prediction
###############################################
par(mfrow=c(1,1))
# regression tree using rpart
library(rpart)
# BUG FIX: the formulas previously used 'data_train$ArrDelay ~ .'. Because
# the literal column name 'ArrDelay' never appears in that formula, the '.'
# expands to ALL columns of data_train -- including ArrDelay itself -- so the
# response leaked into the predictors and the trees could split on the
# answer. 'ArrDelay ~ .' excludes the response from the right-hand side.
air.rpart <- rpart(ArrDelay ~ ., data = data_train)
air.rpart2 <- rpart(ArrDelay ~ ., data = data_train,
                    control = rpart.control(cp = 0.001))
# use the rpart.plot package to create a visualization
library(rpart.plot)
# a basic decision tree diagram
rpart.plot(air.rpart, digits = 2)
rpart.plot(air.rpart2, digits = 2)
##Evaluating model performance
# generate predictions for the testing dataset
p.rpart <- predict(air.rpart, data_test)
p.rpart2 <- predict(air.rpart2, data_test)
#complexity parameter
printcp(air.rpart)
printcp(air.rpart2)
# compare the correlation between predictions and actual delays
cor(p.rpart, data_test$ArrDelay)
cor(p.rpart2, data_test$ArrDelay)
# Mean absolute error between two equal-length (or recyclable) numeric
# vectors: mean of the elementwise absolute differences.
MAE <- function(actual, predicted) {
  mean(abs(predicted - actual))
}
# mean absolute error between predicted and actual values
MAE(p.rpart, data_test$ArrDelay)
MAE(p.rpart2, data_test$ArrDelay)
# mean absolute error between actual values and mean value (naive baseline)
MAE(mean(data_train$ArrDelay), data_test$ArrDelay) # **** correction ****
## Improving model performance
# train a M5' Model Tree
library(RWeka)
# BUG FIX: 'data_train$ArrDelay ~ .' lets '.' expand to every column of
# data_train including ArrDelay itself, leaking the response into the
# predictors; 'ArrDelay ~ .' excludes it from the right-hand side.
m.m5p <- M5P(ArrDelay ~ ., data = data_train)
# get a summary of the model's performance
summary(m.m5p)
#m.m5p
#Number of Rules : 80
# generate predictions for the model
p.m5p <- predict(m.m5p, data_test)
# correlation between the predicted and true values
cor(p.m5p, data_test$ArrDelay)
# mean absolute error of predicted and true values
# (uses a custom function defined above)
MAE(data_test$ArrDelay, p.m5p)