Release v2.1.0 - COVID-19 lockdown edition

* RNA-Seq pipeline now with support for local splicing variation detection. * Initial support for [MAJIQ](https://majiq.biociphers.org) and/or [LeafCutter](https://davidaknowles.github.io/leafcutter/). * Major code cleanup for most pipelines. Co-Authored-By: Rosa Ma <[email protected]>
emc2cube · Jul 10, 2020 · c29126b · c29126b
1 parent 755d777
commit c29126b
Show file tree

Hide file tree

Showing 13 changed files with 3,939 additions and 2,633 deletions.
diff --git a/DESeq2_template.R b/DESeq2_template.R
@@ -1,17 +1,22 @@
 #!/usr/bin/env Rscript
 
-# variables
+# Variables
 table <- ""
 cond1 <- ""
+rep1 <- ""			# Number of replicates for cond1, automatically determined
 cond2 <- ""
+rep2 <- ""			# Number of replicates for cond2, automatically determined
 output <- ""
+siglfc <- 1			# |Log2| fold change to be used as significant. Set at 1 by default, meaning a 2 fold change (log2(2)=1)
+sigpadj <- 0.05		# Adjusted p-value used as upper threshold for significance
 
 
 ##
 ## Top part will actually be generated by the main sh_RNAseq.sh script.
-## When updating this template in sh_RNAseq.sh do not forget to escape the "$" character by replacing them with "\$"
+## When updating this template in sh_RNAseq.sh do not forget to escape the "$" character by replacing it with "\$"
 ##
 
+
 # This script uses DESeq2 in R to run differential-expression analysis on a reads count matrix of genes and automatically generate some graphs.
 
 
@@ -23,28 +28,28 @@ library(calibrate)
 library(pheatmap)
 
 # Read in the data and make sample information table
-data=read.table(table, header=TRUE, row.names=1, check.names=FALSE, stringsAsFactors=FALSE)		# read the reads count matrix file that was generated by rsem-generate-data-matrix
-colnames(data) <- sub('.rsem.genes.results', '', basename(colnames(data)))						# fix sample names in column names
-cols=c(1:ncol(data))																			# get the number of columns
-data[,cols]=apply(data[,cols], 2, function(x) as.numeric(as.integer(x)))						# format the matrix as integer numbers for DESeq2 to be happy
-conditions <- factor(c(rep(cond1, ncol(data)/2), rep(cond2, ncol(data)/2)))						# generate a comma-separated list of conditions/treatments for each sample. Replicates should be named the same.
-samples=as.data.frame((colnames(data)))															# make samples list for DESeq2
+data=read.table(table, header=TRUE, row.names=1, check.names=FALSE, stringsAsFactors=FALSE)				# read the reads count matrix file that was generated by rsem-generate-data-matrix
+colnames(data) <- sub('.rsem.genes.results', '', basename(colnames(data)))								# fix sample names in column names
+cols=c(1:ncol(data))																					# get the number of columns
+data[,cols]=apply(data[,cols], 2, function(x) as.numeric(as.integer(x)))								# format the matrix as integer numbers for DESeq2 to be happy
+conditions <- factor(c(rep(cond1, rep1), rep(cond2, rep2)))												# generate a comma-separated list of conditions/treatments for each sample. Replicates should be named the same.
+samples=as.data.frame((colnames(data)))																	# make samples list for DESeq2
 samples <- cbind(samples, conditions)
-colnames(samples)=c("sample","condition") 
+colnames(samples)=c("experimental","condition")
 groups=factor(samples[,2])
 
 # Run DESeq2 to get the result for differential-expression
-dds=DESeqDataSetFromMatrix(countData=as.matrix(data), colData=samples, design =~condition)		# generate a DESeq2 object from the data
+dds=DESeqDataSetFromMatrix(countData=as.matrix(data), colData=samples, design =~condition)				# generate a DESeq2 object from the data
 colnames(dds) <- colnames(data)
-dds <- DESeq(dds)																				# run DESeq2
+dds <- DESeq(dds)																						# run DESeq2
 
 # Regularized log transformation for clustering/heatmaps, etc
 rld <- rlogTransformation(dds)
 
 
 # Save differential expression results
 res <- results(dds)
-table(res$padj<0.05)
+table(res$padj<sigpadj)
 ## Order by adjusted p-value
 res <- res[order(res$padj), ]
 ## Merge with normalized count data
@@ -67,22 +72,24 @@ garbage <- dev.off() # Save to file
 
 
 # MA plot (without and with "significant" genes labeled)
-maplot <- function (res, thresh=0.05, labelsig=TRUE, textcx=1, ...) {
+maplot <- function (res, sigthresh=0.05, labelsig=TRUE, textcx=1, ...) {								# sigthresh=0.05 is the default value if ommitted when creating the graph. No need to edit here.
   with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...))
-  with(subset(res, padj<thresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5))
+  with(subset(res, padj<sigthresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5))
 if (labelsig) {
     require(calibrate)
-    with(subset(res, padj<thresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2))
+    with(subset(res, padj<sigthresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2))
   }
 }
 
+# DESeq2 MA plot function and custom MA plot with significant gene names. Here we use the sigpadj variable value to generate the graph
 pdf(paste0(output, cond1, "_vs_", cond2, "-MAplot.pdf"))
-plotMA(res, ylim=c(-2,2), main=paste0(cond1, " vs ", cond2, " MA Plot"))																# DESeq2 MA plot function
-maplot(resdata, xlab="mean of normalized counts", ylab=expression(log[2]~fold~change), main=paste0(cond1, " vs ", cond2, " MA Plot"))	# Custom MA with gene names
+plotMA(res, ylim=c(-2,2), main=paste0(cond1, " vs ", cond2, " MA Plot"))
+maplot(resdata, sigpadj, xlab="mean of normalized counts", ylab=expression(log[2]~fold~change), main=paste0(cond1, " vs ", cond2, " MA Plot"))
 garbage <- dev.off() # Save to file
 
 
 # Volcano plot with "significant" genes labeled
+# sigthresh=0.05 and lfcthresh=2 are the default values if ommitted when creating the graph. No need to edit here.
 volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) {
   with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
   with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
@@ -95,8 +102,9 @@ if (labelsig) {
   legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "Both"), pch=20, col=c("red","orange","green"))
 }
 
+# Custom Volcano Plot with significant gene names. Here we use the sigpadj and siglfc variable values to generate the graph
 pdf(paste0(output, cond1, "_vs_", cond2, "-VolcanoPlot.pdf"))
-volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2, 2), main=paste0(cond1, " vs ", cond2, " Volcano Plot"))
+volcanoplot(resdata, siglfc, sigpadj, textcx=.8, xlim=c(-2, 2), main=paste0(cond1, " vs ", cond2, " Volcano Plot"))
 garbage <- dev.off() # Save to file
 
 
@@ -109,32 +117,47 @@ pdf(paste0(output, cond1, "_vs_", cond2, "-DistanceMatrix.pdf"))
 heatmap.2(as.matrix(sampleDists), key=FALSE, symm=TRUE, trace="none", col=rev(dmcol), ColSideColors=condcol[conditions], RowSideColors=condcol[conditions], margin=c(15, 15), main=paste0(cond1, " vs ", cond2, " Matrix"))
 garbage <- dev.off() # Save to file
 
-# Heatmap of significant hits ( padj<0.05 and |log2FoldChange|>=1 )
+# Heatmap of significant hits ( padj<sigpadj and |log2FoldChange|>=siglfc )
 hmcol <- colorRampPalette(brewer.pal(9, 'RdYlBu'))(100)
-sig <- rownames(res[!is.na(res$padj) & res$padj<0.05 & abs(res$log2FoldChange)>=1,])[1:30]		# Select the first 30 significant hits (sorted by padj)
-sig <- sig[!is.na(sig)]																			# If we have less than 30 hits, clean "NA" fields
+sig <- rownames(res[!is.na(res$padj) & res$padj<sigpadj & abs(res$log2FoldChange)>=siglfc,])[1:30]	# Select the first 30 significant hits (sorted by padj)
+sig <- sig[!is.na(sig)]																					# If we have less than 30 hits, clean "NA" fields
 
-counts <- counts(dds,normalized=TRUE)															# Get normalized read counts (sorted by padj)
-counts <- counts[apply(counts, 1, function(row) all(row !=0 )),]								# remove genes with zero reads
-sigcounts <- counts(dds,normalized=TRUE)[sig,]													# Get normalized read counts (sorted by padj) for significant genes
-sigcounts <- sigcounts[apply(sigcounts, 1, function(row) all(row !=0 )),]						# remove genes with zero reads
+counts <- counts(dds,normalized=TRUE)																	# Get normalized read counts (sorted by padj)
+counts <- counts[apply(counts, 1, function(row) all(row !=0 )),]										# Remove genes with zero reads
+sigcounts <- counts(dds,normalized=TRUE)[sig,]															# Get normalized read counts (sorted by padj) for significant genes
+try({																									# May fail if only 0 or 1 gene is significant
+sigcounts <- sigcounts[apply(sigcounts, 1, function(row) all(row !=0 )),]								# Remove genes with zero reads
+})
 
 # By defaults hits are not clustered and thus stay sorted by their padj value
 pdf(paste0(output, cond1, "_vs_", cond2, "-HeatmapSig.pdf"), onefile=FALSE)
+try({																									# May fail if only 0 or 1 gene is significant
 pheatmap(log2(sigcounts), col=rev(hmcol), scale='row', cluster_rows=FALSE, cluster_cols=FALSE, main=paste0(cond1, " vs ", cond2, " Top Hits"))
+})
 garbage <- dev.off() # Save to file
 
 # But we can cluster them using the following command
 pdf(paste0(output, cond1, "_vs_", cond2, "-HeatmapSigClust.pdf"), onefile=FALSE)
+try({																									# May fail if only 0 or 1 gene is significant
 pheatmap(log2(sigcounts), col=rev(hmcol), scale='row', cluster_rows=TRUE, cluster_cols=TRUE, main=paste0(cond1, " vs ", cond2, " Clustered Top Hits"))
+})
 garbage <- dev.off() # Save to file
 
-# And even plot everything, this time using the old school Red/Green color palette
+# And even plot everything, this time using an old school Red/Green color palette
 pdf(paste0(output, cond1, "_vs_", cond2, "-HeatmapAllClust.pdf"), onefile=FALSE)
 pheatmap(log2(counts), col=redgreen(75), scale='row', cluster_rows=TRUE, cluster_cols=TRUE, main=paste0(cond1, " vs ", cond2, " Clustered Reads Count"))
 garbage <- dev.off() # Save to file
 
 
+# Additional optional graphs using DESeqAnalysis https://github.com/acidgenomics/DESeqAnalysis
+
+# Principal Components Analysis with labels
+try(library(DESeqAnalysis))
+pdf(paste0(output, cond1, "_vs_", cond2, "-PCAlabels.pdf"), width=20, height=20)
+try(DESeqAnalysis::plotPCA(rld, label = TRUE, interestingGroups="condition"))
+garbage <- dev.off() # Save to file
+
+
 # Adapted from:
 # http://www.bioconductor.org/help/workflows/rnaseqGene/
 # https://gist.github.com/stephenturner/f60c1934405c127f09a6