-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWGCNA.R
307 lines (278 loc) · 11.4 KB
/
WGCNA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
#WGCNA
library(WGCNA)
library(doParallel)
setwd("C:/Users/11146/Desktop/双疾病生信数据/GSE33463")
load("exp.Rdata")
#(1)导入数据
#输入数据的格式是列为样本,行为基因的矩阵格式,所以不用置换矩阵
## 因为WGCNA针对的是基因进行聚类,而一般我们的聚类是针对样本用hclust即可,所以这个时候需要转置。
#根据其方差选择了前25%的变异较大的基因,包括约5,000个基因进行WGCNA分析
exp_mt = t(exp[order(apply(exp,1,mad), decreasing = T)[1:5000],])
##(2)判断数据质量
gsg <- goodSamplesGenes(exp_mt,verbose = 3)
gsg$allOK
if (!gsg$allOK) { # 如果存在异常样本或基因
if (sum(!gsg$goodGenes)>0) # 异常的基因
printFlush(paste("Removing genes:",
paste(names(exp_mt)[!gsg$goodGenes], collapse = ",")));
if (sum(!gsg$goodSamples)>0) # 异常的样本
printFlush(paste("Removing samples:",
paste(rownames(exp_mt)[!gsg$goodSamples], collapse = ",")));
# 删除异常的样本和基因
exp_mt = exp_mt[gsg$goodSamples, gsg$goodGenes]
}
##(3)判断数据质量--离群点样本
sampleTree = hclust(dist(exp_mt), method = "average")
sizeGrWindow(12,9)
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers",
sub="", xlab="", cex.lab = 1.5,
cex.axis = 1.5, cex.main = 2)
abline(h = 70, col = "red") #根据实际情况而定
##如下图所示,存在一个显著离群点;剔除掉
clust = cutreeStatic(sampleTree, cutHeight = 70, minSize = 10)
table(clust)
# clust
# 0 1
# 1 112
keepSamples = (clust==1)
exp_mt_f = exp_mt[keepSamples, ]
exp_mt_f <- exp_mt
row <- row.names(exp_mt_f)
row <- data.frame(row)
write.csv(row,"rowgroup.csv")
#分组数据
group <- read.table("group.txt",row.names = 1,header = T, sep = "\t")
sameSample=intersect(rownames(exp_mt_f), rownames(group))
exp_mt_f=exp_mt_f[sameSample,]
datTraits=group[sameSample,]
traitColors = numbers2colors(datTraits, signed = FALSE)
sampleTree2 = hclust(dist(exp_mt_f), method = "average")
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree2, main = "Sample clustering to detect outliers",
sub="", xlab="", cex.lab = 1.5,
cex.axis = 1.5, cex.main = 2)
plotDendroAndColors(sampleTree2, traitColors,
groupLabels = names(datTraits),
main = "Sample dendrogram and trait heatmap")
#选择合适的软阈值β
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(exp_mt_f, powerVector = powers, verbose = 5)
##结果可视化:
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 = 0.85#一般是0.9,可以降低为0.8,最低是0.8
### (1)是否符合幂律分布;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
abline(h=0.85,col="red")
# Soft threshold与平均连通性
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers,
cex=cex1, col="red")
#横坐标为软阈值的梯度,第一幅图的纵坐标为无标度网络适应系数,越大越好;第二幅图的纵坐标为节点的平均连通度,越小越好。
#基因分布符合无尺度网络
#查看系统给我们推荐的软阈值
sft$powerEstimate
#4
#6
####----构建共表达网络----####
net <- blockwiseModules(
exp_mt_f,
power = 6,
minModuleSize = 20,#模块的最少基因数
corType = "pearson",#默认皮尔森
networkType="unsigned",#计算邻接矩阵时,是否考虑正负相关性,默认unsigned
TOMType = "unsigned",#计算TOM矩阵时,是否考虑正负相关性,默认unsigned
mergeCutHeight = 0.25,#合并模块的阈值
numericLabels = TRUE,#模块名是否为数字;若设置FALSE,表示映射为颜色名
saveTOMs = TRUE,
saveTOMFileBase = "PAH-TOM",
verbose = 5
)
names(net)
#聚类树状图,color注释模块
#灰色部分太多则说明由于样本中基因共表达趋势不明显,可能需要调整基因过滤的方法
mergedColors = labels2colors(net$colors)
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
# (1)计算模块特征值
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
MEs = net$MEs
geneTree = net$dendrograms[[1]]
nGenes = ncol(exp_mt_f)
nSamples = nrow(exp_mt_f)
# 用color labels重新计算MEs(Module Eigengenes:模块的第一主成分)
MEs0 = moduleEigengenes(exp_mt_f, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
# (2)计算module与表型的相关性以及对应的P值
moduleTraitCor = cor(MEs, group, use = "p")
moduleTraitCor[1:2,1:2]
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(MEs))
#(3)可视化相关性与P值
sizeGrWindow(10,6)
# #设置热图上的文字(两行数字:第一行是模块与各种表型的相关系数;第二行是p值,signif 取有效数字
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# 然后对moduleTraitCor画热图
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(group),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.8,
cex.lab = 1,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
#相似模块进行网络构建
MEList = moduleEigengenes(exp_mt_f, colors = moduleColors)#【提取相似模块】
MEs = MEList$eigengenes
MEDiss = 1-cor(MEs);
METree = hclust(as.dist(MEDiss), method = "average")
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
#筛选hub基因
#计算基因与模块的相关性
nSamples <- dim(exp_mt_f)[1]
geneModuleMembership <- cor(exp_mt_f, MEs, use = "p")
MMPvalue <- corPvalueStudent(geneModuleMembership, nSamples)
#计算基因与表型的相关性
geneSignificanceCor <- cor(exp_mt_f, group, use = "p")
geneSignificanceP <- corPvalueStudent(geneSignificanceCor, nSamples)
#模块内基因与模块和表型之间的相关性
module <- "blue"
column <- paste0("ME", module)
moduleGenes <- names(net$colors)[which(moduleColors == module)]
MM <- abs(geneModuleMembership[moduleGenes, column])
GS <- abs(geneSignificanceCor[moduleGenes, 1])
verboseScatterplot(
MM, GS,
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene significance for proliferating",
main = paste("Module membership vs. gene significance\n"),
abline = TRUE,
pch = 21,
cex.main = 1.2,
cex.lab = 1.2,
cex.axis = 1.2,
col = "black",
bg = module
)
#推荐选取散点图中右上角部分的基因,即两个相关性均较大的基因。
#brown
moduleGenes[(GS > 0.4 & MM > 0.85)]
# "SCYB10";"BAL";"FBX6";"DKFZP564A2416";"FLJ10335";"STAT1";"FABP5";"MTHFD1";"CASP3"
#cyan
moduleGenes[(GS > 0.4 & MM > 0.85)]
#"IFNG" "FIP2"
###【导出基因所在的模块】
probes = colnames(exp_mt_f)#【读取基因名】
geneInfo0 = data.frame(probes= probes,
moduleColor = moduleColors)
geneOrder =order(geneInfo0$moduleColor)#【读取模块】
geneInfo = geneInfo0[geneOrder, ]
#【保存所有模块基因】
write.table(geneInfo, file = "all_genes1.csv",sep="\t",row.names=F,quote=F)
###【输出每个模块的基因】
#【使用循环对每个模块的基因进行导出】
for (mod in 1:nrow(table(moduleColors))){
modules = names(table(moduleColors))[mod]
probes = colnames(exp_mt_f)
inModule = (moduleColors == modules)
modGenes = probes[inModule]
write.table(modGenes, file =paste0("module_",modules,".txt"),sep="\t",row.names=F,col.names=F,quote=F)
}
#挑选模块Hub基因
## (1) 模块内基因连接度
adjacency = adjacency(exp_mt_f, power = 14)
TOM = TOMsimilarity(adjacency)
TOM[1:4,1:4]
Alldegrees =intramodularConnectivity(adjacency, net$colors)
head(Alldegrees)
#### kTotal:基因在整个网络中的连接度
#### kWithin: 基因在所属模块中的连接度,即Intramodular connectivity
#### kOut: kTotal-kWithin
#### kDiff: kIn-kOut
TOM <- as.matrix(TOM)
dissTOM = 1-TOM
# Transform dissTOM with a power to make moderately strong
# connections more visible in the heatmap
plotTOM = dissTOM^7
# Set diagonal to NA for a nicer plot
diag(plotTOM) = NA
# Call the plot function
mycolor <- gplots::colorpanel(250, 'red', "orange", 'lemonchiffon')
TOMplot(
plotTOM, net$dendrograms[[1]], # 取对应块基因的颜色
moduleColors[net$blockGenes[[1]]],
main = "Network heatmap plot, all genes",
col = mycolor
)
##绘制TOM热图
# tom plot
nGenes = ncol(exp_mt_f)
nSamples = nrow(exp_mt_f)
geneTree = net$dendrograms[[1]]
dissTOM = 1 - TOMsimilarityFromExpr(exp_mt_f, power = 6)
nSelect = 400
select = sample(nGenes, size = nSelect)
selectTOM = dissTOM[select, select]
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select]
sizeGrWindow(9,9)
plotDiss = selectTOM^7
diag(plotDiss) = NA
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected
genes",col=gplots::colorpanel(250,'red',"orange",'lemonchiffon'))
##
MEs0 = moduleEigengenes(exp_mt_f, moduleColors)$eigengenes
# 计算根据模块特征向量基因计算模块相异度:
MEDiss = 1-cor(MEs0);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average")
# Plot the result
plotEigengeneNetworks(MEs0,
"Eigengene adjacency heatmap",
marHeatmap = c(3,4,2,2),
plotDendrograms = FALSE,
xLabelsAngle = 90)
####----数据导出用于Cytoscape----####
#【构建文件夹】
cytoDir="CytoscapeInput"
dir.create(cytoDir)
for (mod in 1:nrow(table(moduleColors))){ #【对每个模块进行循环】
modules = names(table(moduleColors))[mod]
probes = colnames(exp_mt_f)
inModule = (moduleColors == modules)
modProbes = probes[inModule]
modGenes = modProbes
modTOM = TOM[inModule, inModule]
dimnames(modTOM) = list(modProbes, modProbes)
edges_File = paste("CytoscapeInput-edges-", modules , ".txt", sep="")
nodes_File = paste("CytoscapeInput-nodes-", modules, ".txt", sep="")
outEdge=paste(cytoDir,edges_File,sep="\\")
outNode=paste(cytoDir,nodes_File,sep="\\")
cyt = exportNetworkToCytoscape(modTOM,
edgeFile = outEdge,
nodeFile = outNode,
weighted = TRUE,
threshold = 0.02,
nodeNames = modProbes,
altNodeNames = modGenes,
nodeAttr = moduleColors[inModule])
}