Skip to content

Commit

Permalink
Release v2.1.0 - COVID-19 lockdown edition
Browse files Browse the repository at this point in the history
* RNA-Seq pipeline now with support for local splicing variation detection.
* Initial support for [MAJIQ](https://majiq.biociphers.org) and/or [LeafCutter](https://davidaknowles.github.io/leafcutter/).
* Major code cleanup for most pipelines.

Co-Authored-By: Rosa Ma <[email protected]>
  • Loading branch information
emc2cube and rosaxma committed Jul 10, 2020
1 parent 755d777 commit c29126b
Show file tree
Hide file tree
Showing 13 changed files with 3,939 additions and 2,633 deletions.
75 changes: 49 additions & 26 deletions DESeq2_template.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
#!/usr/bin/env Rscript

# variables
# Variables
table <- ""
cond1 <- ""
rep1 <- "" # Number of replicates for cond1, automatically determined
cond2 <- ""
rep2 <- "" # Number of replicates for cond2, automatically determined
output <- ""
siglfc <- 1 # |Log2| fold change to be used as significant. Set at 1 by default, meaning a 2 fold change (log2(2)=1)
sigpadj <- 0.05 # Adjusted p-value used as upper threshold for significance


##
## Top part will actually be generated by the main sh_RNAseq.sh script.
## When updating this template in sh_RNAseq.sh do not forget to escape the "$" character by replacing them with "\$"
## When updating this template in sh_RNAseq.sh do not forget to escape the "$" character by replacing it with "\$"
##


# This script uses DESeq2 in R to run differential-expression analysis on a reads count matrix of genes and automatically generate some graphs.


Expand All @@ -23,28 +28,28 @@ library(calibrate)
library(pheatmap)

# Read in the data and make sample information table
data=read.table(table, header=TRUE, row.names=1, check.names=FALSE, stringsAsFactors=FALSE) # read the reads count matrix file that was generated by rsem-generate-data-matrix
colnames(data) <- sub('.rsem.genes.results', '', basename(colnames(data))) # fix sample names in column names
cols=c(1:ncol(data)) # get the number of columns
data[,cols]=apply(data[,cols], 2, function(x) as.numeric(as.integer(x))) # format the matrix as integer numbers for DESeq2 to be happy
conditions <- factor(c(rep(cond1, ncol(data)/2), rep(cond2, ncol(data)/2))) # generate a comma-separated list of conditions/treatments for each sample. Replicates should be named the same.
samples=as.data.frame((colnames(data))) # make samples list for DESeq2
data=read.table(table, header=TRUE, row.names=1, check.names=FALSE, stringsAsFactors=FALSE) # read the reads count matrix file that was generated by rsem-generate-data-matrix
colnames(data) <- sub('.rsem.genes.results', '', basename(colnames(data))) # fix sample names in column names
cols=c(1:ncol(data)) # get the number of columns
data[,cols]=apply(data[,cols], 2, function(x) as.numeric(as.integer(x))) # format the matrix as integer numbers for DESeq2 to be happy
conditions <- factor(c(rep(cond1, rep1), rep(cond2, rep2))) # generate a comma-separated list of conditions/treatments for each sample. Replicates should be named the same.
samples=as.data.frame((colnames(data))) # make samples list for DESeq2
samples <- cbind(samples, conditions)
colnames(samples)=c("sample","condition")
colnames(samples)=c("experimental","condition")
groups=factor(samples[,2])

# Run DESeq2 to get the result for differential-expression
dds=DESeqDataSetFromMatrix(countData=as.matrix(data), colData=samples, design =~condition) # generate a DESeq2 object from the data
dds=DESeqDataSetFromMatrix(countData=as.matrix(data), colData=samples, design =~condition) # generate a DESeq2 object from the data
colnames(dds) <- colnames(data)
dds <- DESeq(dds) # run DESeq2
dds <- DESeq(dds) # run DESeq2

# Regularized log transformation for clustering/heatmaps, etc
rld <- rlogTransformation(dds)


# Save differential expression results
res <- results(dds)
table(res$padj<0.05)
table(res$padj<sigpadj)
## Order by adjusted p-value
res <- res[order(res$padj), ]
## Merge with normalized count data
Expand All @@ -67,22 +72,24 @@ garbage <- dev.off() # Save to file


# MA plot (without and with "significant" genes labeled)
maplot <- function (res, thresh=0.05, labelsig=TRUE, textcx=1, ...) {
maplot <- function (res, sigthresh=0.05, labelsig=TRUE, textcx=1, ...) { # sigthresh=0.05 is the default value if ommitted when creating the graph. No need to edit here.
with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...))
with(subset(res, padj<thresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5))
with(subset(res, padj<sigthresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5))
if (labelsig) {
require(calibrate)
with(subset(res, padj<thresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2))
with(subset(res, padj<sigthresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2))
}
}

# DESeq2 MA plot function and custom MA plot with significant gene names. Here we use the sigpadj variable value to generate the graph
pdf(paste0(output, cond1, "_vs_", cond2, "-MAplot.pdf"))
plotMA(res, ylim=c(-2,2), main=paste0(cond1, " vs ", cond2, " MA Plot")) # DESeq2 MA plot function
maplot(resdata, xlab="mean of normalized counts", ylab=expression(log[2]~fold~change), main=paste0(cond1, " vs ", cond2, " MA Plot")) # Custom MA with gene names
plotMA(res, ylim=c(-2,2), main=paste0(cond1, " vs ", cond2, " MA Plot"))
maplot(resdata, sigpadj, xlab="mean of normalized counts", ylab=expression(log[2]~fold~change), main=paste0(cond1, " vs ", cond2, " MA Plot"))
garbage <- dev.off() # Save to file


# Volcano plot with "significant" genes labeled
# sigthresh=0.05 and lfcthresh=2 are the default values if ommitted when creating the graph. No need to edit here.
volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) {
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
Expand All @@ -95,8 +102,9 @@ if (labelsig) {
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "Both"), pch=20, col=c("red","orange","green"))
}

# Custom Volcano Plot with significant gene names. Here we use the sigpadj and siglfc variable values to generate the graph
pdf(paste0(output, cond1, "_vs_", cond2, "-VolcanoPlot.pdf"))
volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2, 2), main=paste0(cond1, " vs ", cond2, " Volcano Plot"))
volcanoplot(resdata, siglfc, sigpadj, textcx=.8, xlim=c(-2, 2), main=paste0(cond1, " vs ", cond2, " Volcano Plot"))
garbage <- dev.off() # Save to file


Expand All @@ -109,32 +117,47 @@ pdf(paste0(output, cond1, "_vs_", cond2, "-DistanceMatrix.pdf"))
heatmap.2(as.matrix(sampleDists), key=FALSE, symm=TRUE, trace="none", col=rev(dmcol), ColSideColors=condcol[conditions], RowSideColors=condcol[conditions], margin=c(15, 15), main=paste0(cond1, " vs ", cond2, " Matrix"))
garbage <- dev.off() # Save to file

# Heatmap of significant hits ( padj<0.05 and |log2FoldChange|>=1 )
# Heatmap of significant hits ( padj<sigpadj and |log2FoldChange|>=siglfc )
hmcol <- colorRampPalette(brewer.pal(9, 'RdYlBu'))(100)
sig <- rownames(res[!is.na(res$padj) & res$padj<0.05 & abs(res$log2FoldChange)>=1,])[1:30] # Select the first 30 significant hits (sorted by padj)
sig <- sig[!is.na(sig)] # If we have less than 30 hits, clean "NA" fields
sig <- rownames(res[!is.na(res$padj) & res$padj<sigpadj & abs(res$log2FoldChange)>=siglfc,])[1:30] # Select the first 30 significant hits (sorted by padj)
sig <- sig[!is.na(sig)] # If we have less than 30 hits, clean "NA" fields

counts <- counts(dds,normalized=TRUE) # Get normalized read counts (sorted by padj)
counts <- counts[apply(counts, 1, function(row) all(row !=0 )),] # remove genes with zero reads
sigcounts <- counts(dds,normalized=TRUE)[sig,] # Get normalized read counts (sorted by padj) for significant genes
sigcounts <- sigcounts[apply(sigcounts, 1, function(row) all(row !=0 )),] # remove genes with zero reads
counts <- counts(dds,normalized=TRUE) # Get normalized read counts (sorted by padj)
counts <- counts[apply(counts, 1, function(row) all(row !=0 )),] # Remove genes with zero reads
sigcounts <- counts(dds,normalized=TRUE)[sig,] # Get normalized read counts (sorted by padj) for significant genes
try({ # May fail if only 0 or 1 gene is significant
sigcounts <- sigcounts[apply(sigcounts, 1, function(row) all(row !=0 )),] # Remove genes with zero reads
})

# By defaults hits are not clustered and thus stay sorted by their padj value
pdf(paste0(output, cond1, "_vs_", cond2, "-HeatmapSig.pdf"), onefile=FALSE)
try({ # May fail if only 0 or 1 gene is significant
pheatmap(log2(sigcounts), col=rev(hmcol), scale='row', cluster_rows=FALSE, cluster_cols=FALSE, main=paste0(cond1, " vs ", cond2, " Top Hits"))
})
garbage <- dev.off() # Save to file

# But we can cluster them using the following command
pdf(paste0(output, cond1, "_vs_", cond2, "-HeatmapSigClust.pdf"), onefile=FALSE)
try({ # May fail if only 0 or 1 gene is significant
pheatmap(log2(sigcounts), col=rev(hmcol), scale='row', cluster_rows=TRUE, cluster_cols=TRUE, main=paste0(cond1, " vs ", cond2, " Clustered Top Hits"))
})
garbage <- dev.off() # Save to file

# And even plot everything, this time using the old school Red/Green color palette
# And even plot everything, this time using an old school Red/Green color palette
pdf(paste0(output, cond1, "_vs_", cond2, "-HeatmapAllClust.pdf"), onefile=FALSE)
pheatmap(log2(counts), col=redgreen(75), scale='row', cluster_rows=TRUE, cluster_cols=TRUE, main=paste0(cond1, " vs ", cond2, " Clustered Reads Count"))
garbage <- dev.off() # Save to file


# Additional optional graphs using DESeqAnalysis https://github.com/acidgenomics/DESeqAnalysis

# Principal Components Analysis with labels
try(library(DESeqAnalysis))
pdf(paste0(output, cond1, "_vs_", cond2, "-PCAlabels.pdf"), width=20, height=20)
try(DESeqAnalysis::plotPCA(rld, label = TRUE, interestingGroups="condition"))
garbage <- dev.off() # Save to file


# Adapted from:
# http://www.bioconductor.org/help/workflows/rnaseqGene/
# https://gist.github.com/stephenturner/f60c1934405c127f09a6
Expand Down
Loading

0 comments on commit c29126b

Please sign in to comment.