DESeq2_template.R

#!/usr/bin/env Rscript

# Variables
table <- ""
cond1 <- ""
rep1 <- ""			# Number of replicates for cond1, automatically determined
cond2 <- ""
rep2 <- ""			# Number of replicates for cond2, automatically determined
output <- ""
siglfc <- 1			# |Log2| fold change to be used as significant. Set at 1 by default, meaning a 2 fold change (log2(2)=1)
sigpadj <- 0.05		# Adjusted p-value used as upper threshold for significance


##
## Top part will actually be generated by the main sh_RNAseq.sh script.
## When updating this template in sh_RNAseq.sh do not forget to escape the "$" character by replacing it with "\$"
##


# This script uses DESeq2 in R to run differential-expression analysis on a reads count matrix of genes and automatically generate some graphs.


# Load libraries
library(DESeq2)
library(RColorBrewer)
library(calibrate)
library(pheatmap)


# Read in the data and make sample information table
data=read.table(table, header=TRUE, row.names=1, check.names=FALSE, stringsAsFactors=FALSE)				# read the reads count matrix file that was generated by rsem-generate-data-matrix
colnames(data) <- sub('.rsem.genes.results', '', basename(colnames(data)))								# fix sample names in column names
cols=c(1:ncol(data))																					# get the number of columns
data[,cols]=apply(data[,cols], 2, function(x) as.numeric(as.integer(x)))								# format the matrix as integer numbers for DESeq2 to be happy
conditions <- factor(c(rep(cond1, rep1), rep(cond2, rep2)))												# generate a comma-separated list of conditions/treatments for each sample. Replicates should be named the same.
samples=as.data.frame((colnames(data)))																	# make samples list for DESeq2
samples <- cbind(samples, conditions)
colnames(samples)=c("experimental","condition")
groups=factor(samples[,2])


# Run DESeq2 to get the result for differential-expression
dds=DESeqDataSetFromMatrix(countData=as.matrix(data), colData=samples, design =~condition)				# generate a DESeq2 object from the data
colnames(dds) <- colnames(data)
dds <- DESeq(dds)																						# run DESeq2


# Regularized log transformation for clustering/heatmaps, etc
vsd <- vst(dds, blind=FALSE)

# Save differential expression results
res <- results(dds)
table(res$padj<sigpadj)
## Order by adjusted p-value
res <- res[order(res$padj), ]
## Merge with normalized count data
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
## Write results
write.csv(resdata, file=paste0(output, cond1, "_vs_", cond2, "-ExpDiff.csv"))


# Variables
conditions_df=as.data.frame(colData(dds)[c("condition")])
colnames(conditions_df)=c("Labels")
condcol <- brewer.pal(8, "Set1")[1:length(unique(conditions))]											# Distance Matrix conditions colors
names(condcol) <- unique(conditions)																	# Proper column names to use in the legend
condcol <- list(Labels = condcol)
dmcol <- colorRampPalette(brewer.pal(9, 'GnBu'))(100)													# Distance Matrix color palette
hmcol <- colorRampPalette(brewer.pal(9, 'RdYlBu'))(100)													# Heatmaps color palette


# MA plot (without and with "significant" genes labeled)
maplot <- function (res, sigthresh=0.05, labelsig=TRUE, textcx=1, ...) {								# sigthresh=0.05 is the default value if ommitted when creating the graph. No need to edit here.
  with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...))
  with(subset(res, padj<sigthresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5))
if (labelsig) {
    require(calibrate)
    with(subset(res, padj<sigthresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2))
  }
}
resNorm <- lfcShrink(dds, coef=2, type="normal")
try({																									# May fail if apeglm is not installed ( BiocManager::install("apeglm") )
resLFC <- lfcShrink(dds, coef=resultsNames(dds)[2], type="apeglm")										# Will take a while
})

# DESeq2 MA plot function and custom MA plot with significant gene names. Here we use the sigpadj variable value to generate the graph
pdf(paste0(output, cond1, "_vs_", cond2, "-MAplot.pdf"))
plotMA(res, ylim=c(-2,2), main=paste0(cond1, " vs ", cond2, " MA Plot"))
plotMA(resNorm, ylim=c(-2,2), main=paste0(cond1, " vs ", cond2, " normal shrunken log2 MA Plot"))
try({																									# May fail if resLFC was not computed before
plotMA(resLFC, ylim=c(-2,2), main=paste0(cond1, " vs ", cond2, " apeglm shrunken log2 MA Plot"))
})
maplot(resdata, sigpadj, xlab="mean of normalized counts", ylab=expression(log[2]~fold~change), main=paste0(cond1, " vs ", cond2, " MA Plot"))
garbage <- dev.off() # Save to file


# Sample Distance Matrix
sampleDists <- dist(t(assay(vsd)))
pdf(paste0(output, cond1, "_vs_", cond2, "-DistanceMatrix.pdf"))
pheatmap(as.matrix(sampleDists), col=rev(dmcol), clustering_distance_rows=sampleDists, clustering_distance_cols=sampleDists, cluster_rows=TRUE, cluster_cols=TRUE, show_rownames=TRUE, show_colnames=TRUE, annotation_col=conditions_df, annotation_row=conditions_df, annotation_names_row=FALSE, annotation_names_col=FALSE, annotation_colors=condcol, legend = FALSE, main=paste0(cond1, " vs ", cond2, " Matrix"))
garbage <- dev.off() # Save to file


# Plot dispersions
pdf(paste0(output, cond1, "_vs_", cond2, "-DispersionPlot.pdf"))
plotDispEsts(dds, main=paste0(cond1, " vs ", cond2, " Dispersion Plot"))
garbage <- dev.off() # Save to file


# Principal Components Analysis
pdf(paste0(output, cond1, "_vs_", cond2, "-PCA.pdf"))
plotPCA(vsd, intgroup="condition")
garbage <- dev.off() # Save to file


# Volcano plot with "significant" genes labeled
# sigthresh=0.05 and lfcthresh=1 are the default values if ommitted when creating the graph. No need to edit here.
volcanoplot <- function (res, lfcthresh=1, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) {
  with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
  with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
  with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...))
  with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...))
if (labelsig) {
    require(calibrate)
    with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
  }
  legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "Both"), pch=20, col=c("red","orange","green"))
}


# Custom Volcano Plot with significant gene names. Here we use the sigpadj and siglfc variable values to generate the graph
pdf(paste0(output, cond1, "_vs_", cond2, "-VolcanoPlot.pdf"))
volcanoplot(resdata, siglfc, sigpadj, textcx=.8, xlim=c(-2, 2), main=paste0(cond1, " vs ", cond2, " Volcano Plot"))
garbage <- dev.off() # Save to file


# Heatmap of significant hits ( padj<sigpadj and |log2FoldChange|>=siglfc )
sig <- rownames(res[!is.na(res$padj) & res$padj<sigpadj & abs(res$log2FoldChange)>=siglfc,])[1:30]	# Select the first 30 significant hits (sorted by padj)
sig <- sig[!is.na(sig)]																					# If we have less than 30 hits, clean "NA" fields
counts <- counts(dds,normalized=TRUE)																	# Get normalized read counts (sorted by padj)
counts <- counts[apply(counts, 1, function(row) all(row !=0 )),]										# Remove genes with zero reads
sigcounts <- counts(dds,normalized=TRUE)[sig,]															# Get normalized read counts (sorted by padj) for significant genes
try({																									# May fail if only 0 or 1 gene is significant
sigcounts <- sigcounts[apply(sigcounts, 1, function(row) all(row !=0 )),]								# Remove genes with zero reads
})


# By defaults hits are not clustered and thus stay sorted by their padj value
pdf(paste0(output, cond1, "_vs_", cond2, "-HeatmapSig.pdf"), onefile=FALSE)
try({																									# May fail if only 0 or 1 gene is significant
pheatmap(log2(sigcounts), col=rev(hmcol), scale='row', cluster_rows=FALSE, cluster_cols=FALSE, annotation_col=conditions_df, annotation_names_col=FALSE, annotation_colors=condcol, main=paste0(cond1, " vs ", cond2, " Top Hits"))
})
garbage <- dev.off() # Save to file


# But we can cluster them using the following command
pdf(paste0(output, cond1, "_vs_", cond2, "-HeatmapSigClust.pdf"), onefile=FALSE)
try({																									# May fail if only 0 or 1 gene is significant
pheatmap(log2(sigcounts), col=rev(hmcol), scale='row', cluster_rows=TRUE, cluster_cols=TRUE, annotation_col=conditions_df, annotation_names_col=FALSE, annotation_colors=condcol, main=paste0(cond1, " vs ", cond2, " Clustered Top Hits"))
})
garbage <- dev.off() # Save to file


# And even plot every gene
pdf(paste0(output, cond1, "_vs_", cond2, "-HeatmapAllClust.pdf"), onefile=FALSE)
pheatmap(log2(counts), col=rev(hmcol), scale='row', cluster_rows=TRUE, cluster_cols=TRUE, show_rownames=FALSE, show_colnames=FALSE, annotation_col=conditions_df, annotation_names_col=FALSE, annotation_colors=condcol, main=paste0(cond1, " vs ", cond2, " Clustered Reads Count"))
garbage <- dev.off() # Save to file


# Additional optional graphs using DESeqAnalysis https://github.com/acidgenomics/DESeqAnalysis

# Principal Components Analysis with labels
try(library(DESeqAnalysis))
pdf(paste0(output, cond1, "_vs_", cond2, "-PCAlabels.pdf"), width=20, height=20)
try(DESeqAnalysis::plotPCA(vsd, label = TRUE, interestingGroups="condition", title=paste0(cond1, " vs ", cond2, " PCA")))
garbage <- dev.off() # Save to file


# Adapted from:
# http://www.bioconductor.org/help/workflows/rnaseqGene/
# https://gist.github.com/stephenturner/f60c1934405c127f09a6
# https://dwheelerau.com/2014/02/17/how-to-use-deseq2-to-analyse-rnaseq-data/