-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathDESeq2_template.R
184 lines (141 loc) · 9.83 KB
/
DESeq2_template.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env Rscript
# Variables
table <- ""
cond1 <- ""
rep1 <- "" # Number of replicates for cond1, automatically determined
cond2 <- ""
rep2 <- "" # Number of replicates for cond2, automatically determined
output <- ""
siglfc <- 1 # |Log2| fold change to be used as significant. Set at 1 by default, meaning a 2 fold change (log2(2)=1)
sigpadj <- 0.05 # Adjusted p-value used as upper threshold for significance
##
## Top part will actually be generated by the main sh_RNAseq.sh script.
## When updating this template in sh_RNAseq.sh do not forget to escape the "$" character by replacing it with "\$"
##
# This script uses DESeq2 in R to run differential-expression analysis on a reads count matrix of genes and automatically generate some graphs.
# Load libraries
library(DESeq2)
library(RColorBrewer)
library(calibrate)
library(pheatmap)
# Read in the data and make sample information table
data=read.table(table, header=TRUE, row.names=1, check.names=FALSE, stringsAsFactors=FALSE) # read the reads count matrix file that was generated by rsem-generate-data-matrix
colnames(data) <- sub('.rsem.genes.results', '', basename(colnames(data))) # fix sample names in column names
cols=c(1:ncol(data)) # get the number of columns
data[,cols]=apply(data[,cols], 2, function(x) as.numeric(as.integer(x))) # format the matrix as integer numbers for DESeq2 to be happy
conditions <- factor(c(rep(cond1, rep1), rep(cond2, rep2))) # generate a comma-separated list of conditions/treatments for each sample. Replicates should be named the same.
samples=as.data.frame((colnames(data))) # make samples list for DESeq2
samples <- cbind(samples, conditions)
colnames(samples)=c("experimental","condition")
groups=factor(samples[,2])
# Run DESeq2 to get the result for differential-expression
dds=DESeqDataSetFromMatrix(countData=as.matrix(data), colData=samples, design =~condition) # generate a DESeq2 object from the data
colnames(dds) <- colnames(data)
dds <- DESeq(dds) # run DESeq2
# Regularized log transformation for clustering/heatmaps, etc
vsd <- vst(dds, blind=FALSE)
# Save differential expression results
res <- results(dds)
table(res$padj<sigpadj)
## Order by adjusted p-value
res <- res[order(res$padj), ]
## Merge with normalized count data
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
## Write results
write.csv(resdata, file=paste0(output, cond1, "_vs_", cond2, "-ExpDiff.csv"))
# Variables
conditions_df=as.data.frame(colData(dds)[c("condition")])
colnames(conditions_df)=c("Labels")
condcol <- brewer.pal(8, "Set1")[1:length(unique(conditions))] # Distance Matrix conditions colors
names(condcol) <- unique(conditions) # Proper column names to use in the legend
condcol <- list(Labels = condcol)
dmcol <- colorRampPalette(brewer.pal(9, 'GnBu'))(100) # Distance Matrix color palette
hmcol <- colorRampPalette(brewer.pal(9, 'RdYlBu'))(100) # Heatmaps color palette
# MA plot (without and with "significant" genes labeled)
maplot <- function (res, sigthresh=0.05, labelsig=TRUE, textcx=1, ...) { # sigthresh=0.05 is the default value if ommitted when creating the graph. No need to edit here.
with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...))
with(subset(res, padj<sigthresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5))
if (labelsig) {
require(calibrate)
with(subset(res, padj<sigthresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2))
}
}
resNorm <- lfcShrink(dds, coef=2, type="normal")
try({ # May fail if apeglm is not installed ( BiocManager::install("apeglm") )
resLFC <- lfcShrink(dds, coef=resultsNames(dds)[2], type="apeglm") # Will take a while
})
# DESeq2 MA plot function and custom MA plot with significant gene names. Here we use the sigpadj variable value to generate the graph
pdf(paste0(output, cond1, "_vs_", cond2, "-MAplot.pdf"))
plotMA(res, ylim=c(-2,2), main=paste0(cond1, " vs ", cond2, " MA Plot"))
plotMA(resNorm, ylim=c(-2,2), main=paste0(cond1, " vs ", cond2, " normal shrunken log2 MA Plot"))
try({ # May fail if resLFC was not computed before
plotMA(resLFC, ylim=c(-2,2), main=paste0(cond1, " vs ", cond2, " apeglm shrunken log2 MA Plot"))
})
maplot(resdata, sigpadj, xlab="mean of normalized counts", ylab=expression(log[2]~fold~change), main=paste0(cond1, " vs ", cond2, " MA Plot"))
garbage <- dev.off() # Save to file
# Sample Distance Matrix
sampleDists <- dist(t(assay(vsd)))
pdf(paste0(output, cond1, "_vs_", cond2, "-DistanceMatrix.pdf"))
pheatmap(as.matrix(sampleDists), col=rev(dmcol), clustering_distance_rows=sampleDists, clustering_distance_cols=sampleDists, cluster_rows=TRUE, cluster_cols=TRUE, show_rownames=TRUE, show_colnames=TRUE, annotation_col=conditions_df, annotation_row=conditions_df, annotation_names_row=FALSE, annotation_names_col=FALSE, annotation_colors=condcol, legend = FALSE, main=paste0(cond1, " vs ", cond2, " Matrix"))
garbage <- dev.off() # Save to file
# Plot dispersions
pdf(paste0(output, cond1, "_vs_", cond2, "-DispersionPlot.pdf"))
plotDispEsts(dds, main=paste0(cond1, " vs ", cond2, " Dispersion Plot"))
garbage <- dev.off() # Save to file
# Principal Components Analysis
pdf(paste0(output, cond1, "_vs_", cond2, "-PCA.pdf"))
plotPCA(vsd, intgroup="condition")
garbage <- dev.off() # Save to file
# Volcano plot with "significant" genes labeled
# sigthresh=0.05 and lfcthresh=1 are the default values if ommitted when creating the graph. No need to edit here.
volcanoplot <- function (res, lfcthresh=1, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) {
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...))
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...))
if (labelsig) {
require(calibrate)
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
}
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "Both"), pch=20, col=c("red","orange","green"))
}
# Custom Volcano Plot with significant gene names. Here we use the sigpadj and siglfc variable values to generate the graph
pdf(paste0(output, cond1, "_vs_", cond2, "-VolcanoPlot.pdf"))
volcanoplot(resdata, siglfc, sigpadj, textcx=.8, xlim=c(-2, 2), main=paste0(cond1, " vs ", cond2, " Volcano Plot"))
garbage <- dev.off() # Save to file
# Heatmap of significant hits ( padj<sigpadj and |log2FoldChange|>=siglfc )
sig <- rownames(res[!is.na(res$padj) & res$padj<sigpadj & abs(res$log2FoldChange)>=siglfc,])[1:30] # Select the first 30 significant hits (sorted by padj)
sig <- sig[!is.na(sig)] # If we have less than 30 hits, clean "NA" fields
counts <- counts(dds,normalized=TRUE) # Get normalized read counts (sorted by padj)
counts <- counts[apply(counts, 1, function(row) all(row !=0 )),] # Remove genes with zero reads
sigcounts <- counts(dds,normalized=TRUE)[sig,] # Get normalized read counts (sorted by padj) for significant genes
try({ # May fail if only 0 or 1 gene is significant
sigcounts <- sigcounts[apply(sigcounts, 1, function(row) all(row !=0 )),] # Remove genes with zero reads
})
# By defaults hits are not clustered and thus stay sorted by their padj value
pdf(paste0(output, cond1, "_vs_", cond2, "-HeatmapSig.pdf"), onefile=FALSE)
try({ # May fail if only 0 or 1 gene is significant
pheatmap(log2(sigcounts), col=rev(hmcol), scale='row', cluster_rows=FALSE, cluster_cols=FALSE, annotation_col=conditions_df, annotation_names_col=FALSE, annotation_colors=condcol, main=paste0(cond1, " vs ", cond2, " Top Hits"))
})
garbage <- dev.off() # Save to file
# But we can cluster them using the following command
pdf(paste0(output, cond1, "_vs_", cond2, "-HeatmapSigClust.pdf"), onefile=FALSE)
try({ # May fail if only 0 or 1 gene is significant
pheatmap(log2(sigcounts), col=rev(hmcol), scale='row', cluster_rows=TRUE, cluster_cols=TRUE, annotation_col=conditions_df, annotation_names_col=FALSE, annotation_colors=condcol, main=paste0(cond1, " vs ", cond2, " Clustered Top Hits"))
})
garbage <- dev.off() # Save to file
# And even plot every gene
pdf(paste0(output, cond1, "_vs_", cond2, "-HeatmapAllClust.pdf"), onefile=FALSE)
pheatmap(log2(counts), col=rev(hmcol), scale='row', cluster_rows=TRUE, cluster_cols=TRUE, show_rownames=FALSE, show_colnames=FALSE, annotation_col=conditions_df, annotation_names_col=FALSE, annotation_colors=condcol, main=paste0(cond1, " vs ", cond2, " Clustered Reads Count"))
garbage <- dev.off() # Save to file
# Additional optional graphs using DESeqAnalysis https://github.com/acidgenomics/DESeqAnalysis
# Principal Components Analysis with labels
try(library(DESeqAnalysis))
pdf(paste0(output, cond1, "_vs_", cond2, "-PCAlabels.pdf"), width=20, height=20)
try(DESeqAnalysis::plotPCA(vsd, label = TRUE, interestingGroups="condition", title=paste0(cond1, " vs ", cond2, " PCA")))
garbage <- dev.off() # Save to file
# Adapted from:
# http://www.bioconductor.org/help/workflows/rnaseqGene/
# https://gist.github.com/stephenturner/f60c1934405c127f09a6
# https://dwheelerau.com/2014/02/17/how-to-use-deseq2-to-analyse-rnaseq-data/