From 99699fa55d75117494f8b4df9131359764f0f6ad Mon Sep 17 00:00:00 2001 From: igor Date: Wed, 6 Sep 2017 18:32:39 -0400 Subject: [PATCH] adjust differential gene expression --- scripts/deseq2-heatmap.R | 10 ++--- scripts/deseq2-pca.R | 4 +- scripts/deseq2-results.R | 68 +++++++++++++++++++++++---------- scripts/dge-deseq2.R | 19 ++++++--- scripts/load-install-packages.R | 2 +- 5 files changed, 68 insertions(+), 35 deletions(-) diff --git a/scripts/deseq2-heatmap.R b/scripts/deseq2-heatmap.R index f74d99e..1563bcd 100755 --- a/scripts/deseq2-heatmap.R +++ b/scripts/deseq2-heatmap.R @@ -4,7 +4,7 @@ -deseq2_heatmap = function(mat, genes, samples, title, file_suffix) { +deseq2_heatmap = function(mat, genes, samples, title, file_prefix = "plot.heatmap") { library(pheatmap) @@ -14,8 +14,8 @@ deseq2_heatmap = function(mat, genes, samples, title, file_suffix) { # labels size_text = paste0(length(genes), "x", length(samples)) title = paste0(title, " - ", size_text) - filename_png = paste0("plot.heatmap.", file_suffix, ".", size_text, ".png") - filename_pdf = paste0("plot.heatmap.", file_suffix, ".", size_text, ".pdf") + filename_png = paste0(file_prefix, ".", size_text, ".png") + filename_pdf = paste0(file_prefix, ".", size_text, ".pdf") # heatmap cells color range: blue-white-red cell_colors = colorRampPalette(c("#043177", "#244B88", "#FAFAFA", "#C62E2E", "#BF0F0F"))(50) @@ -30,7 +30,7 @@ deseq2_heatmap = function(mat, genes, samples, title, file_suffix) { show_rownames = FALSE } - # heatmap with clustering + # png heatmap with clustering message("generate heatmap: ", filename_png) pheatmap(mat, color = cell_colors, border_color = NA, scale = "row", cluster_rows = TRUE, cluster_cols = TRUE, clustering_distance_rows = "euclidean", clustering_distance_cols = "euclidean", clustering_method = "complete", @@ -38,7 +38,7 @@ deseq2_heatmap = function(mat, genes, samples, title, file_suffix) { filename = filename_png, width = 12, height = 8) Sys.sleep(1) - # heatmap with clustering + # pdf heatmap with clustering message("generate heatmap: ", filename_pdf) pheatmap(mat, color = cell_colors, border_color = NA, scale = "row", cluster_rows = TRUE, cluster_cols = TRUE, clustering_distance_rows = "euclidean", clustering_distance_cols = "euclidean", clustering_method = "complete", diff --git a/scripts/deseq2-pca.R b/scripts/deseq2-pca.R index 5cd33f5..d207e9c 100755 --- a/scripts/deseq2-pca.R +++ b/scripts/deseq2-pca.R @@ -30,9 +30,9 @@ deseq2_pca = function(object, intgroup, ntop = 500) { } # add the intgroup factors together to create a new grouping factor - intgroup_df = as.data.frame(colData(object)[, intgroup, drop=FALSE]) + intgroup_df = as.data.frame(colData(object)[, intgroup, drop = FALSE]) if (length(intgroup) > 1) { - fac = factor(apply( intgroup.df, 1, paste, collapse=" : ")) + fac = factor(apply(intgroup_df, 1, paste, collapse = " : ")) } else { fac = colData(object)[[intgroup]] } diff --git a/scripts/deseq2-results.R b/scripts/deseq2-results.R index b576dec..9cc939b 100755 --- a/scripts/deseq2-results.R +++ b/scripts/deseq2-results.R @@ -10,6 +10,7 @@ deseq2_results = function(deseq_dataset, contrast = NULL, name = NULL) { library(xlsx) # calculate results (using contrast or name, depending on what is given) + # since v1.16 (11/2016), lfcShrink function performs fold change shrinkage and addMLE is for backward compatibility if(!is.null(contrast)) { res = results(deseq_dataset, contrast = contrast, cooksCutoff = FALSE, addMLE = TRUE) # extract results name @@ -27,12 +28,12 @@ deseq2_results = function(deseq_dataset, contrast = NULL, name = NULL) { # sort results so most significant are first res = res[order(res$padj, res$pvalue, -res$baseMean), ] - # save results object + # save unmodified results object res_rds = paste0("deseq2.res.", file_suffix, ".rds") saveRDS(dds, file = res_rds) message("save results object: ", res_rds) - # save results as csv + # save unmodified results as csv res_csv = paste0("dge.", file_suffix, ".csv") write.csv(as.data.frame(res), file = res_csv) message("save results csv: ", res_csv) @@ -51,23 +52,24 @@ deseq2_results = function(deseq_dataset, contrast = NULL, name = NULL) { names(res_df)[names(res_df) == "log2FoldChange"] = "log2FC" names(res_df)[names(res_df) == "lfcMLE"] = "log2FCunshrunk" - # save results as xlsx + message("num genes padj<0.90: ", nrow(subset(res_df, padj < 0.9))) + message("num genes padj<0.20: ", nrow(subset(res_df, padj < 0.2))) + message("num genes padj<0.05: ", nrow(subset(res_df, padj < 0.05))) + message("num genes padj<0.01: ", nrow(subset(res_df, padj < 0.01))) + + # save differential expression results in Excel format res_xlsx = paste0("dge.", gsub(pattern = " ", replacement = "-", x = res_name), ".xlsx") write.xlsx2(x = res_df, file = res_xlsx, sheetName = res_name, col.names = TRUE, row.names = FALSE) message("results genes: ", nrow(res_df)) message("save results xlsx: ", res_xlsx) - # save results as xlsx (padj<0.05) + # save significant (padj<0.05) differential expression results in Excel format res_padj005_xlsx = gsub(pattern = ".xlsx", replacement = ".q005.xlsx", x = res_xlsx) res_padj005_df = subset(res_df, padj < 0.05) write.xlsx2(x = res_padj005_df, file = res_padj005_xlsx, sheetName = res_name, col.names = TRUE, row.names = FALSE) - message("num genes padj<0.9: ", nrow(subset(res_df, padj < 0.9))) - message("num genes padj<0.2: ", nrow(subset(res_df, padj < 0.2))) - message("num genes padj<0.05: ", nrow(subset(res_df, padj < 0.05))) - message("num genes padj<0.01: ", nrow(subset(res_df, padj < 0.01))) message("save filtered results xlsx: ", res_padj005_xlsx) - # heatmap values matrix + # heatmap variance stabilized values matrix vsd = assay(varianceStabilizingTransformation(dds, blind = TRUE)) # all samples and the subset used for the comparison @@ -75,34 +77,58 @@ deseq2_results = function(deseq_dataset, contrast = NULL, name = NULL) { samples_comp = samples_all if(!is.null(contrast)) samples_comp = rownames(subset(deseq_dataset@colData, group %in% contrast[2:3])) + # create a separate directory for heatmaps + heatmaps_dir = "heatmaps" + if (!dir.exists(heatmaps_dir)) dir.create(heatmaps_dir) + # heatmap gene subsets (list with genes, plot title, and file suffix) hmg = list() - hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:50], title = "50 Most Significant", file_suffix = "top") - hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:100], title = "100 Most Significant", file_suffix = "top") - hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:1000], title = "1000 Most Significant", file_suffix = "top") - hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.10)), title = "q < 0.1", file_suffix = "q010") - hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.05)), title = "q < 0.05", file_suffix = "q005") - hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.01)), title = "q < 0.01", file_suffix = "q001") - hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.001)), title = "q < 0.001", file_suffix = "q0001") - hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, pvalue < 0.05)), title = "p < 0.05", file_suffix = "p005") - hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, pvalue < 0.01)), title = "p < 0.01", file_suffix = "p001") + hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:50], + title = "50 Most Significant", + file_suffix = "top") + hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:100], + title = "100 Most Significant", + file_suffix = "top") + hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:1000], + title = "1000 Most Significant", + file_suffix = "top") + hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.10)), + title = "q < 0.1", + file_suffix = "q010") + hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.05)), + title = "q < 0.05", + file_suffix = "q005") + hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.01)), + title = "q < 0.01", + file_suffix = "q001") + hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.001)), + title = "q < 0.001", + file_suffix = "q0001") + hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, pvalue < 0.05)), + title = "p < 0.05", + file_suffix = "p005") + hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, pvalue < 0.01)), + title = "p < 0.01", + file_suffix = "p001") # process every gene subset for (i in 1:length(hmg)) { # generate title and file suffix hm_title = paste0(res_name, " - ", hmg[[i]]$title) - hm_file_suffix = paste0(file_suffix, ".", hmg[[i]]$file_suffix) + hm_file_prefix = paste0(heatmaps_dir, "/plot.heatmap.", file_suffix, ".", hmg[[i]]$file_suffix) # generate heatmaps if gene list is not too small or big if (length(hmg[[i]]$genes) > 5 && length(hmg[[i]]$genes) < 3000) { # generate heatmap using all samples - deseq2_heatmap(mat = vsd, genes = hmg[[i]]$genes, samples = samples_all, title = hm_title, file_suffix = hm_file_suffix) + deseq2_heatmap(mat = vsd, genes = hmg[[i]]$genes, samples = samples_all, + title = hm_title, file_prefix = hm_file_prefix) # generate heatmap using a subset of samples used for the comparison if (length(samples_comp) < length(samples_all)) { - deseq2_heatmap(mat = vsd, genes = hmg[[i]]$genes, samples = samples_comp, title = hm_title, file_suffix = hm_file_suffix) + deseq2_heatmap(mat = vsd, genes = hmg[[i]]$genes, samples = samples_comp, + title = hm_title, file_prefix = hm_file_prefix) } } diff --git a/scripts/dge-deseq2.R b/scripts/dge-deseq2.R index 5d95fe7..69bb75e 100755 --- a/scripts/dge-deseq2.R +++ b/scripts/dge-deseq2.R @@ -64,22 +64,28 @@ message("GTF median gene length: ", median(gene_lengths)) # import groups table groups_table = read.csv(file = groups_table_file, header = TRUE, row.names = 1, colClasses = "factor") message("groups table sample num: ", nrow(groups_table)) -message("groups table groups: ", colnames(groups_table)) +message("groups table samples: ", paste(rownames(groups_table), collapse = ", ")) +message("groups table groups: ", paste(colnames(groups_table), collapse = ", ")) # import counts table counts_table = read.delim(file = counts_table_file, header = TRUE, row.names = 1, check.names = FALSE, stringsAsFactors = FALSE) message("full counts table gene num: ", nrow(counts_table)) message("full counts table sample num: ", ncol(counts_table)) +message("full counts table samples: ", paste(colnames(counts_table), collapse = ", ")) + +# check that all samples from the groups table are found in the counts table +diff_samples = setdiff(rownames(groups_table), colnames(counts_table)) +if (length(diff_samples)) stop("some samples not in counts table: ", paste(diff_samples, collapse = ", ")) # subset to samples in groups table (also sets samples to be in the same order) -counts_table = counts_table[,rownames(groups_table)] +counts_table = counts_table[, rownames(groups_table)] message("group-subset counts table gene num: ", nrow(counts_table)) message("group-subset counts table sample num: ", ncol(counts_table)) -# group info +# group info (use the first column for grouped comparisons) group_name = colnames(groups_table)[1] message("group name: ", group_name) -group_levels = levels(groups_table[,group_name]) +group_levels = levels(groups_table[, group_name]) message("group levels: ", toString(group_levels)) # design formula @@ -89,8 +95,9 @@ message("design formula: ", design_formula) message(" ========== normalize ========== ") # import raw counts and create DESeq object +# since v1.16 (11/2016), betaPrior is set to FALSE and shrunken LFCs are obtained afterwards using lfcShrink dds = DESeqDataSetFromMatrix(countData = counts_table, colData = groups_table, design = design_formula) -dds = DESeq(dds, parallel = TRUE, BPPARAM = BiocParallel::MulticoreParam(workers = 4)) +dds = DESeq(dds, betaPrior = TRUE, parallel = TRUE, BPPARAM = BiocParallel::MulticoreParam(workers = 4)) # add gene lengths (used to generate FPKM values) if (identical(sort(names(gene_lengths)), sort(rownames(dds)))) { @@ -131,7 +138,7 @@ print(plotSparsity(dds, normalized = TRUE)) dev.off() # PCA plot -pca_plot = deseq2_pca(vsd, intgroup = c("group"), ntop = 1000) +pca_plot = deseq2_pca(vsd, intgroup = group_name, ntop = 1000) save_plot("plot.pca.pdf", pca_plot, base_width = 8, base_height = 5, units = "in", dpi = 300) save_plot("plot.pca.png", pca_plot, base_width = 8, base_height = 5, units = "in", dpi = 300) diff --git a/scripts/load-install-packages.R b/scripts/load-install-packages.R index dc8b776..0d9d458 100755 --- a/scripts/load-install-packages.R +++ b/scripts/load-install-packages.R @@ -26,7 +26,7 @@ load_install_packages = function(package_list) { } # load package - library(p, character.only = TRUE, quietly = TRUE) + suppressPackageStartupMessages(library(p, character.only = TRUE)) }