adjust differential gene expression

NYU-Molecular-Pathology · Sep 6, 2017 · 99699fa · 99699fa
1 parent 8b5d1ba
commit 99699fa
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 35 deletions.
diff --git a/scripts/deseq2-heatmap.R b/scripts/deseq2-heatmap.R
@@ -4,7 +4,7 @@
 
 
 
-deseq2_heatmap = function(mat, genes, samples, title, file_suffix) {
+deseq2_heatmap = function(mat, genes, samples, title, file_prefix = "plot.heatmap") {
 
   library(pheatmap)
 
@@ -14,8 +14,8 @@ deseq2_heatmap = function(mat, genes, samples, title, file_suffix) {
   # labels
   size_text = paste0(length(genes), "x", length(samples))
   title = paste0(title, " - ", size_text)
-  filename_png = paste0("plot.heatmap.", file_suffix, ".", size_text, ".png")
-  filename_pdf = paste0("plot.heatmap.", file_suffix, ".", size_text, ".pdf")
+  filename_png = paste0(file_prefix, ".", size_text, ".png")
+  filename_pdf = paste0(file_prefix, ".", size_text, ".pdf")
 
   # heatmap cells color range: blue-white-red
   cell_colors = colorRampPalette(c("#043177", "#244B88", "#FAFAFA", "#C62E2E", "#BF0F0F"))(50)
@@ -30,15 +30,15 @@ deseq2_heatmap = function(mat, genes, samples, title, file_suffix) {
     show_rownames = FALSE
   }
 
-  # heatmap with clustering
+  # png heatmap with clustering
   message("generate heatmap: ", filename_png)
   pheatmap(mat, color = cell_colors, border_color = NA, scale = "row", cluster_rows = TRUE, cluster_cols = TRUE,
   clustering_distance_rows = "euclidean", clustering_distance_cols = "euclidean", clustering_method = "complete",
   main = title, fontsize_row = fontsize_row, fontsize_col = 12, show_rownames = show_rownames,
   filename = filename_png, width = 12, height = 8)
   Sys.sleep(1)
 
-  # heatmap with clustering
+  # pdf heatmap with clustering
   message("generate heatmap: ", filename_pdf)
   pheatmap(mat, color = cell_colors, border_color = NA, scale = "row", cluster_rows = TRUE, cluster_cols = TRUE,
   clustering_distance_rows = "euclidean", clustering_distance_cols = "euclidean", clustering_method = "complete",

diff --git a/scripts/deseq2-pca.R b/scripts/deseq2-pca.R
@@ -30,9 +30,9 @@ deseq2_pca = function(object, intgroup, ntop = 500) {
   }
 
   # add the intgroup factors together to create a new grouping factor
-  intgroup_df = as.data.frame(colData(object)[, intgroup, drop=FALSE])
+  intgroup_df = as.data.frame(colData(object)[, intgroup, drop = FALSE])
   if (length(intgroup) > 1) {
-    fac = factor(apply( intgroup.df, 1, paste, collapse=" : "))
+    fac = factor(apply(intgroup_df, 1, paste, collapse = " : "))
   } else {
     fac = colData(object)[[intgroup]]
   }

diff --git a/scripts/deseq2-results.R b/scripts/deseq2-results.R
@@ -10,6 +10,7 @@ deseq2_results = function(deseq_dataset, contrast = NULL, name = NULL) {
   library(xlsx)
 
   # calculate results (using contrast or name, depending on what is given)
+  # since v1.16 (11/2016), lfcShrink function performs fold change shrinkage and addMLE is for backward compatibility
   if(!is.null(contrast)) {
     res = results(deseq_dataset, contrast = contrast, cooksCutoff = FALSE, addMLE = TRUE)
     # extract results name
@@ -27,12 +28,12 @@ deseq2_results = function(deseq_dataset, contrast = NULL, name = NULL) {
   # sort results so most significant are first
   res = res[order(res$padj, res$pvalue, -res$baseMean), ]
 
-  # save results object
+  # save unmodified results object
   res_rds = paste0("deseq2.res.", file_suffix, ".rds")
   saveRDS(dds, file = res_rds)
   message("save results object: ", res_rds)
 
-  # save results as csv
+  # save unmodified results as csv
   res_csv = paste0("dge.", file_suffix, ".csv")
   write.csv(as.data.frame(res), file = res_csv)
   message("save results csv: ", res_csv)
@@ -51,58 +52,83 @@ deseq2_results = function(deseq_dataset, contrast = NULL, name = NULL) {
   names(res_df)[names(res_df) == "log2FoldChange"] = "log2FC"
   names(res_df)[names(res_df) == "lfcMLE"] = "log2FCunshrunk"
 
-  # save results as xlsx
+  message("num genes padj<0.90: ", nrow(subset(res_df, padj < 0.9)))
+  message("num genes padj<0.20: ", nrow(subset(res_df, padj < 0.2)))
+  message("num genes padj<0.05: ", nrow(subset(res_df, padj < 0.05)))
+  message("num genes padj<0.01: ", nrow(subset(res_df, padj < 0.01)))
+
+  # save differential expression results in Excel format
   res_xlsx = paste0("dge.", gsub(pattern = " ", replacement = "-", x = res_name), ".xlsx")
   write.xlsx2(x = res_df, file = res_xlsx, sheetName = res_name, col.names = TRUE, row.names = FALSE)
   message("results genes: ", nrow(res_df))
   message("save results xlsx: ", res_xlsx)
 
-  # save results as xlsx (padj<0.05)
+  # save significant (padj<0.05) differential expression results in Excel format
   res_padj005_xlsx = gsub(pattern = ".xlsx", replacement = ".q005.xlsx", x = res_xlsx)
   res_padj005_df = subset(res_df, padj < 0.05)
   write.xlsx2(x = res_padj005_df, file = res_padj005_xlsx, sheetName = res_name, col.names = TRUE, row.names = FALSE)
-  message("num genes padj<0.9:  ", nrow(subset(res_df, padj < 0.9)))
-  message("num genes padj<0.2:  ", nrow(subset(res_df, padj < 0.2)))
-  message("num genes padj<0.05: ", nrow(subset(res_df, padj < 0.05)))
-  message("num genes padj<0.01: ", nrow(subset(res_df, padj < 0.01)))
   message("save filtered results xlsx: ", res_padj005_xlsx)
 
-  # heatmap values matrix
+  # heatmap variance stabilized values matrix
   vsd = assay(varianceStabilizingTransformation(dds, blind = TRUE))
 
   # all samples and the subset used for the comparison
   samples_all = colnames(deseq_dataset)
   samples_comp = samples_all
   if(!is.null(contrast)) samples_comp = rownames(subset(deseq_dataset@colData, group %in% contrast[2:3]))
 
+  # create a separate directory for heatmaps
+  heatmaps_dir = "heatmaps"
+  if (!dir.exists(heatmaps_dir)) dir.create(heatmaps_dir)
+
   # heatmap gene subsets (list with genes, plot title, and file suffix)
   hmg = list()
-  hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:50], title = "50 Most Significant", file_suffix = "top")
-  hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:100], title = "100 Most Significant", file_suffix = "top")
-  hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:1000], title = "1000 Most Significant", file_suffix = "top")
-  hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.10)), title = "q < 0.1", file_suffix = "q010")
-  hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.05)), title = "q < 0.05", file_suffix = "q005")
-  hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.01)), title = "q < 0.01", file_suffix = "q001")
-  hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.001)), title = "q < 0.001", file_suffix = "q0001")
-  hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, pvalue < 0.05)), title = "p < 0.05", file_suffix = "p005")
-  hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, pvalue < 0.01)), title = "p < 0.01", file_suffix = "p001")
+  hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:50],
+                                title = "50 Most Significant",
+                                file_suffix = "top")
+  hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:100],
+                                title = "100 Most Significant",
+                                file_suffix = "top")
+  hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:1000],
+                                title = "1000 Most Significant",
+                                file_suffix = "top")
+  hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.10)),
+                                title = "q < 0.1",
+                                file_suffix = "q010")
+  hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.05)),
+                                title = "q < 0.05",
+                                file_suffix = "q005")
+  hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.01)),
+                                title = "q < 0.01",
+                                file_suffix = "q001")
+  hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.001)),
+                                title = "q < 0.001",
+                                file_suffix = "q0001")
+  hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, pvalue < 0.05)),
+                                title = "p < 0.05",
+                                file_suffix = "p005")
+  hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, pvalue < 0.01)),
+                                title = "p < 0.01",
+                                file_suffix = "p001")
 
   # process every gene subset
   for (i in 1:length(hmg)) {
 
     # generate title and file suffix
     hm_title = paste0(res_name, " - ", hmg[[i]]$title)
-    hm_file_suffix = paste0(file_suffix, ".", hmg[[i]]$file_suffix)
+    hm_file_prefix = paste0(heatmaps_dir, "/plot.heatmap.", file_suffix, ".", hmg[[i]]$file_suffix)
 
     # generate heatmaps if gene list is not too small or big
     if (length(hmg[[i]]$genes) > 5 && length(hmg[[i]]$genes) < 3000) {
 
       # generate heatmap using all samples
-      deseq2_heatmap(mat = vsd, genes = hmg[[i]]$genes, samples = samples_all, title = hm_title, file_suffix = hm_file_suffix)
+      deseq2_heatmap(mat = vsd, genes = hmg[[i]]$genes, samples = samples_all,
+        title = hm_title, file_prefix = hm_file_prefix)
 
       # generate heatmap using a subset of samples used for the comparison
       if (length(samples_comp) < length(samples_all)) {
-        deseq2_heatmap(mat = vsd, genes = hmg[[i]]$genes, samples = samples_comp, title = hm_title, file_suffix = hm_file_suffix)
+        deseq2_heatmap(mat = vsd, genes = hmg[[i]]$genes, samples = samples_comp,
+          title = hm_title, file_prefix = hm_file_prefix)
       }
 
     }

diff --git a/scripts/dge-deseq2.R b/scripts/dge-deseq2.R
@@ -64,22 +64,28 @@ message("GTF median gene length: ", median(gene_lengths))
 # import groups table
 groups_table = read.csv(file = groups_table_file, header = TRUE, row.names = 1, colClasses = "factor")
 message("groups table sample num: ", nrow(groups_table))
-message("groups table groups: ", colnames(groups_table))
+message("groups table samples: ", paste(rownames(groups_table), collapse = ", "))
+message("groups table groups: ", paste(colnames(groups_table), collapse = ", "))
 
 # import counts table
 counts_table = read.delim(file = counts_table_file, header = TRUE, row.names = 1, check.names = FALSE, stringsAsFactors = FALSE)
 message("full counts table gene num: ", nrow(counts_table))
 message("full counts table sample num: ", ncol(counts_table))
+message("full counts table samples: ", paste(colnames(counts_table), collapse = ", "))
+
+# check that all samples from the groups table are found in the counts table
+diff_samples = setdiff(rownames(groups_table), colnames(counts_table))
+if (length(diff_samples)) stop("some samples not in counts table: ", paste(diff_samples, collapse = ", "))
 
 # subset to samples in groups table (also sets samples to be in the same order)
-counts_table = counts_table[,rownames(groups_table)]
+counts_table = counts_table[, rownames(groups_table)]
 message("group-subset counts table gene num: ", nrow(counts_table))
 message("group-subset counts table sample num: ", ncol(counts_table))
 
-# group info
+# group info (use the first column for grouped comparisons)
 group_name = colnames(groups_table)[1]
 message("group name: ", group_name)
-group_levels = levels(groups_table[,group_name])
+group_levels = levels(groups_table[, group_name])
 message("group levels: ", toString(group_levels))
 
 # design formula
@@ -89,8 +95,9 @@ message("design formula: ", design_formula)
 message(" ========== normalize ========== ")
 
 # import raw counts and create DESeq object
+# since v1.16 (11/2016), betaPrior is set to FALSE and shrunken LFCs are obtained afterwards using lfcShrink
 dds = DESeqDataSetFromMatrix(countData = counts_table, colData = groups_table, design = design_formula)
-dds = DESeq(dds, parallel = TRUE, BPPARAM = BiocParallel::MulticoreParam(workers = 4))
+dds = DESeq(dds, betaPrior = TRUE, parallel = TRUE, BPPARAM = BiocParallel::MulticoreParam(workers = 4))
 
 # add gene lengths (used to generate FPKM values)
 if (identical(sort(names(gene_lengths)), sort(rownames(dds)))) {
@@ -131,7 +138,7 @@ print(plotSparsity(dds, normalized = TRUE))
 dev.off()
 
 # PCA plot
-pca_plot = deseq2_pca(vsd, intgroup = c("group"), ntop = 1000)
+pca_plot = deseq2_pca(vsd, intgroup = group_name, ntop = 1000)
 save_plot("plot.pca.pdf", pca_plot, base_width = 8, base_height = 5, units = "in", dpi = 300)
 save_plot("plot.pca.png", pca_plot, base_width = 8, base_height = 5, units = "in", dpi = 300)
 

diff --git a/scripts/load-install-packages.R b/scripts/load-install-packages.R
@@ -26,7 +26,7 @@ load_install_packages = function(package_list) {
     }
 
     # load package
-    library(p, character.only = TRUE, quietly = TRUE)
+    suppressPackageStartupMessages(library(p, character.only = TRUE))
 
   }
-Original file line number
+Diff line change
@@ Expand Up / @@ -26,7 +26,7 @@ load_install_packages = function(package_list) { @@
         }
         # load package
-        library(p, character.only = TRUE, quietly = TRUE)
+        suppressPackageStartupMessages(library(p, character.only = TRUE))
       }
@@ Expand Down @@