B365-final-project/ashLinearModels.R at master · nelariddle/B365-final-project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
data<-read.csv("data-with-inflation.csv",stringsAsFactors = TRUE)
library(ggplot2)
library(dplyr)

## Data cleaning Nela did:
# dont care about career and thesis courses
data<-data%>%filter(LETTER!="Y"&LETTER!="G")
# only undergrad
data<-data%>%filter(COURSE.<500)

## More Data cleaning
# Select only important columns
data = data[,c("YEAR", "COURSE.", "PERCENT.MAJORS", "NEWSALARY", "AVG.SECT.GPA")]

# Only use if you want to put the course numbers into bins
#data["COURSE."] = floor(data["COURSE."] / 100)

# Test linear model with each of the variables for the best model
allSE = rep(0, 4)
for(i in 1:(length(data[1,])-1)){
  d = cbind(rep(1,length(data[,1])), data[,i])
  a = solve(t(d) %*% d, t(d) %*% data[,length(data)])
  yhat = d %*% a
  error = data[,5] - yhat
  se = sum(error ^ 2)
  allSE[i] = se
  print(paste("Squared Error for", colnames(data)[i], "is", round(se, 3)))
}

# Printing out the order of how good the predictors were
ordered = order(allSE)
print(paste("The best predictor for GPA is", colnames(data)[ordered[1]], "with squared error", allSE[ordered[1]]))
print(paste("The second best predictor for GPA is", colnames(data)[ordered[2]], "with squared error", allSE[ordered[2]]))
print(paste("The third best predictor for GPA is", colnames(data)[ordered[3]], "with squared error", allSE[ordered[3]]))
print(paste("The worst predictor for GPA is", colnames(data)[ordered[4]], "with squared error", allSE[ordered[4]]))

# Bar chart of the SSE for the single variable regression
# Not used in paper or presentation
ggplot() +
  geom_bar(aes(x = (colnames(data)[1:4]), y = allSE, fill = (colnames(data)[1:4])), stat = "identity") +
  geom_text(aes(x = (colnames(data)[1:4]), y = allSE, label = round(allSE, 2)), vjust = 2)

# Test linear model for 2nd best predictor
# Not important because the one below does the same thing but better
# Only used if you want to do the 2-variable graph because the one below
#     isn't set up to calculate yhat2 or the equation
for(i in 1:4){
  if(i != 1){
    d = cbind(rep(1,length(data[,1])), data[,i], data[,1])
    a = solve(t(d) %*% d, t(d) %*% data[,length(data)])
    yhat2 = d %*% a
    error = data[,5] - yhat2
    se = sum(error ^ 2)
    print(paste("Squared Error for year and", colnames(data)[i], "is", round(se, 3)))
  }
}

print("Second best predictor combined with year is course number")
print(paste("The linear model for the effect of year and course number on GPA is ",
            round(a[1], 7)," + ", round(a[2], 7),"[COURSE.] + ", round(a[3], 7), "[YEAR]", sep = ""))

# Not using this plot- ggplot is just better
#plot(data[,"COURSE."], data[,"AVG.SECT.GPA"])
#lines(sort(data[,"COURSE."]), sort(yhat2))

# Creating the ggplot of the linear relationship between GPA and year
#     (with course number as the color)
ggplot() +
  geom_point(data = data, aes(x = data[,"YEAR"], y = data[,"AVG.SECT.GPA"], color = data[,"COURSE."])) +
  geom_smooth(aes(x = sort(data[,"YEAR"]), y = sort(yhat)), method = "lm") +
  labs(color = "Course Number", title = "Effect of Year on GPA", x = "Year",
       y = "Average Student GPA")

# Linear relationship with both year and course number considered
ggplot() +
  geom_point(data = data, aes(x = data[,"YEAR"], y = data[,"AVG.SECT.GPA"], color = data[,"COURSE."])) +
  geom_smooth(aes(x = sort(data[,"YEAR"]), y = sort(yhat)), method = "lm") +
  geom_smooth(aes(x = sort(data[,"YEAR"]), y = sort(yhat2)), method = "lm", color = "red") +
  labs(color = "Course Number", title = "Effect of Year on GPA", x = "Year",
       y = "Average Student GPA")

# Finding the graph of the SSE as the number of attributes used increases
bestSSE = rep(0,4)
n = length(data[,1])
ncol = length(data) - 1
used = rep(0, (n-1))
prev = rep(1,length(data[,1]))
for(j in 1:ncol){
  allSE = rep(0, 4)
  for(i in 1:ncol){
    if(sum(used == i) == 0){
      d = cbind(prev, data[,i])
      a = solve(t(d) %*% d, t(d) %*% data[,length(data)])
      yhat = d %*% a
      error = data[,5] - yhat
      se = sum(error ^ 2)
      allSE[i] = se
      print(paste("Squared Error for", colnames(data)[i], "is", round(se, 3)))
    }
  }
  o = order(allSE)
  used[j] = o[j]
  prev = cbind(prev, data[,o[j]])
  bestSSE[j] = allSE[o[j]]
  print(paste("Attribute",j, "is", colnames(data)[o[j]]))
}
print(bestSSE)

# The graph of the change in SSE as more attributes are added
ggplot() +
  geom_point(aes(x = 1:4, y = bestSSE)) +
  geom_smooth(aes(x = 1:4, y = bestSSE), method = "lm") +
   xlab("Number of attributes used") +
  ylab("Sum of squared error") +
  ggtitle("Sum of squared error predicting GPA of a class")