# Unsupervised_kmeans
# (forked from andrew4costa/Data_Silence)
# This script clusters the wine dataset with the unsupervised k-means algorithm.
# Helper function --------------------------------------------------------------------------
# Tabulate how the rows of `dat` are spread over clusters 1 and 2.
#
# dat: data with the columns `quality` ("high"/"low"), `type` ("white"/"red")
#      and `cluster` (compared against 1 and 2).
# res: table with at least 8 rows and 2 columns; column k receives the counts
#      for cluster k.
#
# Row layout written into each column:
#   1-4  white/high, white/low, red/high, red/low
#   5-6  all white, all red
#   7-8  all high, all low
#
# The filled table is printed and then returned.
extract_results <- function(dat, res) {
  quality_levels <- c("high", "low")
  type_levels <- c("white", "red")

  # Number of positions where the mask is TRUE (NA entries are not counted).
  count_true <- function(mask) {
    length(which(mask))
  }

  for (cl in 1:2) {
    in_cluster <- dat$cluster == cl
    slot <- 1

    # type x quality combinations
    for (ty in type_levels) {
      for (qu in quality_levels) {
        res[slot, cl] <- count_true(dat$quality == qu & dat$type == ty & in_cluster)
        slot <- slot + 1
      }
    }

    # totals per type
    for (ty in type_levels) {
      res[slot, cl] <- count_true(dat$type == ty & in_cluster)
      slot <- slot + 1
    }

    # totals per quality
    for (qu in quality_levels) {
      res[slot, cl] <- count_true(dat$quality == qu & in_cluster)
      slot <- slot + 1
    }
  }

  print(res)
  return(res)
}
#-------------------------------------------------------------------------------------
##Importing & Preprocessing####
# dplyr supplies `%>%`, mutate(), across(), where() and filter(), which this
# script uses from here on; it has to be attached before the first pipe,
# otherwise a fresh session stops with "could not find function".
library(dplyr)

# NOTE(review): the two files are read with different separators ("," for red,
# ";" for white) -- confirm this matches the local copies of the CSVs.
rw <- read.table("C:\\Users\\...\\winequality-red.csv", sep = ",", header = TRUE)
ww <- read.table("C:\\Users\\...\\winequality-white.csv", sep = ";", header = TRUE)

# Tag every row with its wine colour, then stack white on top of red.
rw["type"] <- "red"
ww["type"] <- "white"
wdb <- rbind(ww, rw)

# Binarise quality (<= 5 becomes "low", everything else "high") and turn all
# character columns, i.e. `type`, into factors.
wdb <- wdb %>%
  mutate(quality = factor(ifelse(quality <= 5, "low", "high"))) %>%
  mutate(across(where(is.character), as.factor))

# Drop exact duplicate rows.
wdb <- wdb[!duplicated(wdb), ]

# k-means computes euclidean distance, so we standardize the numeric columns
std <- wdb %>%
  mutate(across(where(is.numeric), ~ as.vector(scale(.x))))
#std %>%
# DataExplorer::create_report()
#How will 2-Means Classify our dataset?####--------------------------------------------------------------
library(ggpubr)
library(factoextra)
set.seed(42)

# Cluster on every column except the two labels. Selecting by name instead of
# by position keeps this correct if the column order of `std` ever changes.
feature_cols <- setdiff(names(std), c("quality", "type"))
res.km <- kmeans(std[, feature_cols], 2, nstart = 150)
print(res.km)

std <- cbind(std, cluster = res.km$cluster)

# kmeans() numbers its clusters arbitrarily, while the result table, the
# agreement filters and the plot colours further down treat cluster 1 as the
# "white" cluster. Swap the two labels when cluster 2 holds the larger share
# of white wines so that this assumption always holds.
# Note: print(res.km) above still shows the numbering as returned by kmeans().
white_share_1 <- mean(std$type[std$cluster == 1] == "white")
white_share_2 <- mean(std$type[std$cluster == 2] == "white")
if (white_share_1 < white_share_2) {
  std$cluster <- 3L - std$cluster
}
std$cluster <- as.factor(std$cluster)

# NOTE(review): no caret function is called in this script -- confirm whether
# the package is still needed.
library(caret)
summary(std[, c("quality", "type", "cluster")])
#extracting results-----------------------------------------------------------------------------------
# Placeholder 8 x 2 table, one column per cluster; extract_results() overwrites
# every cell with a count.
r <- as.data.frame(matrix(seq_len(16), nrow = 8))
rownames(r) <- c("WH", "WL", "RH", "RL", "W", "R", "H", "L")
r <- extract_results(std, r)
colnames(r) <- c("C1", "C2")

# Row totals plus the percentage of each row that falls in cluster 1 / cluster 2.
kmeans_res <- mutate(
  r,
  tot = C1 + C2,
  pc_1 = round(C1 * 100 / tot, 2),
  pc_2 = round(C2 * 100 / tot, 2)
)
kmeans_res

# `dat`: white wines in cluster 1 and red wines in cluster 2.
dat <- filter(
  std,
  (cluster == 1 & type == "white") | (cluster == 2 & type == "red")
)
# `highlight_df`: the opposite pairing (white in cluster 2, red in cluster 1).
highlight_df <- filter(
  std,
  (cluster == 2 & type == "white") | (cluster == 1 & type == "red")
)
##Plotting Results---------------------------------------------------------------------------------------
# Jittered type-vs-quality scatter of `dat`, coloured by cluster, with the rows
# of `highlight_df` drawn on top as black points.
atop <- ggplot(dat, aes(x = type, y = quality)) +
  scale_y_discrete(limits = c("low", "high")) +
  geom_jitter(aes(color = cluster)) +
  scale_color_manual(values = c("#DBDAB5", "#94345D")) +
  geom_point(
    data = highlight_df,
    aes(x = type, y = quality),
    size = 1.5,
    position = "jitter",
    color = "black"
  ) +
  ggtitle(
    "K-means Classification of Wine",
    "Based on Eleven Features Standardized"
  )
atop