# 614_project.R
########################## Importing data ##########################
library(readxl)
library(ggplot2)
library(MSQC)
library(factoextra)
library(qcc)
data <- read_xlsx("Project_dataset.xlsx", col_names = FALSE)
df1 <- as.matrix(data)   # original data, kept for the mCUSUM analysis below
df2 <- df1               # working copy; Phase I outliers are removed from df2
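# A quick sanity check (not part of the original script): confirm the dimensions of the
# imported data and that there are no missing values before the covariance matrix is computed.
dim(df2)
stopifnot(!anyNA(df2))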
########################## Initialize variables ##########################
UCL <- NULL
LCL <- NULL
y <- NULL
out_up <- NULL
out_low <- NULL
S <- cov(df2)         # sample covariance matrix
n <- nrow(df2)        # number of observations
p <- ncol(df2)        # number of variables
mu <- colMeans(df2)   # sample mean vector
################### Decide number of Principal components ##################
# Methods used are the scree plot and Minimum Description Length (MDL) analysis
pcaCharts <- function(x) {
  x.var <- x$sdev ^ 2
  x.pvar <- x.var / sum(x.var)
  print("Cumulative proportions of variance:")
  print(cumsum(x.pvar[1:35]))
  par(mfrow = c(2, 2))
  plot(x.pvar[1:20], xlab = "Principal component",
       ylab = "Proportion of variance explained", ylim = c(0, 1), type = 'b')
  plot(cumsum(x.pvar[1:20]), xlab = "Principal component",
       ylab = "Cumulative proportion of variance explained", ylim = c(0, 1), type = 'b')
  screeplot(x)
  screeplot(x, type = "l")
  par(mfrow = c(1, 1))
}
### Screeplot
pdf(file= "PC Analysis - screeplot.pdf" )
df2.PCA <- prcomp(df2)
var_explained = df2.PCA$sdev^2 / sum(df2.PCA$sdev^2)
pcaCharts(df2.PCA)
dev.off()
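# A complementary check (not part of the original analysis): the smallest number of principal
# components whose cumulative proportion of variance reaches a threshold. The 0.90 cutoff is
# an illustrative assumption, not a value taken from the project.
n_pc_cum <- which(cumsum(var_explained) >= 0.90)[1]
n_pc_cum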
### MDL Analysis
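# The loop below evaluates the Minimum Description Length criterion for l retained components,
#   MDL(l) = n * (p - l) * log(a_l / g_l) + l * (2p - l) * log(n) / 2,
# where a_l and g_l are the arithmetic and geometric means of the p - l smallest eigenvalues
# of S. The number of components is chosen as the l that minimizes MDL(l).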
pdf(file= "PC Analysis - MDL.pdf" )
eigen_values <- eigen(S)$values
MDL <- rep(0, 250)   # MDL values for l = 1, ..., 250 (the original hard-codes p = 251 variables)
l <- 1:250
for (k in 1:250) {
  al <- mean(eigen_values[(k + 1):251])            # arithmetic mean of the 251 - k smallest eigenvalues
  gl <- exp(mean(log(eigen_values[(k + 1):251])))  # geometric mean of the 251 - k smallest eigenvalues
  MDL[k] <- n * (251 - k) * log(al / gl) + k * (2 * 251 - k) * log(n) / 2
}
plot(l, MDL, xlab = "l", ylab = "MDL", col = "red")
optimum_MDL <- which.min(MDL)   # l minimizing MDL: suggested number of principal components
optimum_MDL
dev.off()
#---- PCA Analysis using Covariance ----
# The loop rebuilds the monitoring charts until all outliers are removed; the control charts are saved to a pdf file.
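# The scores on the i-th principal component are uncorrelated, with variance equal to the i-th
# eigenvalue of S, so the i-th score chart uses 3-sigma limits of +/- 3 * sqrt(lambda_i),
# which is exactly what UCL[i] and LCL[i] compute in the loop below.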
pdf(file= "PCA Cov1.pdf" )
par(mfrow=c(2,2))
p <- 5   # number of principal components retained (overwrites p = ncol(df2) set above)
for (j in 1:10) {
  n <- nrow(df2)         # number of remaining observations
  S <- cov(df2)
  x1 <- 1:n              # plotting index
  y <- prcomp(df2)$x     # principal component scores
  for (i in 1:p) {
    UCL[i] <- 3 * sqrt(eigen(S)$values[i])
    LCL[i] <- -3 * sqrt(eigen(S)$values[i])
    out_up[i] <- list(which(y[, i] > UCL[i]))
    out_low[i] <- list(which(y[, i] < LCL[i]))
    plot(x1, y[, i], col = "blue", main = paste("Iteration J =", j),
         xlab = "X", ylab = paste("PC", i))
    lines(x1, rep(UCL[[i]], n), col = "red", lty = 2)
    lines(x1, rep(LCL[[i]], n), col = "red", lty = 2)
  }
  outliers <- unlist(c(out_up, out_low))
  if (is.null(outliers) || length(outliers) == 0)
    break
  df2 <- df2[-outliers, ]
  print(outliers)
}
dim(df2)                      # dimensions of the cleaned Phase I data set
df2.PCA2 <- prcomp(df2)
var_explained2 = df2.PCA2$sdev^2 / sum(df2.PCA2$sdev^2)
sum(var_explained2[1:p])      # variance explained by the first p PCs after outlier removal
sum(var_explained[1:p])       # variance explained by the first p PCs before outlier removal
dev.off()
# The pdf file contains the univariate control charts for each of the selected principal components.
#---- mCUSUM Analysis ----
# mCUSUM chart for detecting a sustained mean shift.
### Choose k (the reference value, roughly half the standardized shift to be detected) and
### h (the decision interval, which controls the in-control run length).
pdf(file= "mCUSUM.pdf" )
par(mfrow=c(2,2))
k <- list(0.5, 0.7, 1.0, 1.2, 1.5, 1.7, 2)   # candidate k values tested (see the sketch after this section)
#for (j in 1:7){
df3 <- df1                                        # restart from the full data set
df3.centered <- scale(df3, scale = FALSE, center = TRUE)
df3.PCA <- prcomp(df3.centered)
for (i in 1:10) {
  df3.PCA <- prcomp(df3.centered)
  df.pca <- as.data.frame(df3.PCA$x[, 1:p])        # scores of the first p principal components
  mcusum <- mult.chart(x = df.pca, type = "mcusum2", alpha = 0.0027, k = 1, h = 20, method = "sw")
  outliers.cusum <- which(mcusum$t2 > mcusum$ucl)  # points that signal on the chart
  if (is.null(outliers.cusum) || length(outliers.cusum) == 0)
    break
  df3.centered <- df3.centered[-outliers.cusum, ]
  #print(outliers.cusum)
}
var_explained.cusum = df3.PCA$sdev^2 / sum(df3.PCA$sdev^2)
#print("iteration = ", j)
print(dim(df3.centered))               # dimensions of the data set after mCUSUM outlier removal
print(sum(var_explained.cusum[1:p]))   # variance explained by the first p PCs after removal
#}
dev.off()
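# A minimal sketch (not from the original script) of how the candidate k values defined above
# could be compared: rerun the mCUSUM chart for each k with a fixed decision interval and count
# how many points signal. Keeping h = 20 across all k is an assumption for illustration only.
for (kk in unlist(k)) {
  mc <- mult.chart(x = as.data.frame(df3.PCA$x[, 1:p]), type = "mcusum2",
                   alpha = 0.0027, k = kk, h = 20, method = "sw")
  cat("k =", kk, "-> points above UCL:", sum(mc$t2 > mc$ucl), "\n")
}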
####### Performance