Skip to content

Commit

Permalink
adjust differential gene expression
Browse files Browse the repository at this point in the history
  • Loading branch information
igordot committed Sep 6, 2017
1 parent 8b5d1ba commit 99699fa
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 35 deletions.
10 changes: 5 additions & 5 deletions scripts/deseq2-heatmap.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@



deseq2_heatmap = function(mat, genes, samples, title, file_suffix) {
deseq2_heatmap = function(mat, genes, samples, title, file_prefix = "plot.heatmap") {

library(pheatmap)

Expand All @@ -14,8 +14,8 @@ deseq2_heatmap = function(mat, genes, samples, title, file_suffix) {
# labels
size_text = paste0(length(genes), "x", length(samples))
title = paste0(title, " - ", size_text)
filename_png = paste0("plot.heatmap.", file_suffix, ".", size_text, ".png")
filename_pdf = paste0("plot.heatmap.", file_suffix, ".", size_text, ".pdf")
filename_png = paste0(file_prefix, ".", size_text, ".png")
filename_pdf = paste0(file_prefix, ".", size_text, ".pdf")

# heatmap cells color range: blue-white-red
cell_colors = colorRampPalette(c("#043177", "#244B88", "#FAFAFA", "#C62E2E", "#BF0F0F"))(50)
Expand All @@ -30,15 +30,15 @@ deseq2_heatmap = function(mat, genes, samples, title, file_suffix) {
show_rownames = FALSE
}

# heatmap with clustering
# png heatmap with clustering
message("generate heatmap: ", filename_png)
pheatmap(mat, color = cell_colors, border_color = NA, scale = "row", cluster_rows = TRUE, cluster_cols = TRUE,
clustering_distance_rows = "euclidean", clustering_distance_cols = "euclidean", clustering_method = "complete",
main = title, fontsize_row = fontsize_row, fontsize_col = 12, show_rownames = show_rownames,
filename = filename_png, width = 12, height = 8)
Sys.sleep(1)

# heatmap with clustering
# pdf heatmap with clustering
message("generate heatmap: ", filename_pdf)
pheatmap(mat, color = cell_colors, border_color = NA, scale = "row", cluster_rows = TRUE, cluster_cols = TRUE,
clustering_distance_rows = "euclidean", clustering_distance_cols = "euclidean", clustering_method = "complete",
Expand Down
4 changes: 2 additions & 2 deletions scripts/deseq2-pca.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ deseq2_pca = function(object, intgroup, ntop = 500) {
}

# add the intgroup factors together to create a new grouping factor
intgroup_df = as.data.frame(colData(object)[, intgroup, drop=FALSE])
intgroup_df = as.data.frame(colData(object)[, intgroup, drop = FALSE])
if (length(intgroup) > 1) {
fac = factor(apply( intgroup.df, 1, paste, collapse=" : "))
fac = factor(apply(intgroup_df, 1, paste, collapse = " : "))
} else {
fac = colData(object)[[intgroup]]
}
Expand Down
68 changes: 47 additions & 21 deletions scripts/deseq2-results.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ deseq2_results = function(deseq_dataset, contrast = NULL, name = NULL) {
library(xlsx)

# calculate results (using contrast or name, depending on what is given)
# since v1.16 (11/2016), lfcShrink function performs fold change shrinkage and addMLE is for backward compatibility
if(!is.null(contrast)) {
res = results(deseq_dataset, contrast = contrast, cooksCutoff = FALSE, addMLE = TRUE)
# extract results name
Expand All @@ -27,12 +28,12 @@ deseq2_results = function(deseq_dataset, contrast = NULL, name = NULL) {
# sort results so most significant are first
res = res[order(res$padj, res$pvalue, -res$baseMean), ]

# save results object
# save unmodified results object
res_rds = paste0("deseq2.res.", file_suffix, ".rds")
saveRDS(dds, file = res_rds)
message("save results object: ", res_rds)

# save results as csv
# save unmodified results as csv
res_csv = paste0("dge.", file_suffix, ".csv")
write.csv(as.data.frame(res), file = res_csv)
message("save results csv: ", res_csv)
Expand All @@ -51,58 +52,83 @@ deseq2_results = function(deseq_dataset, contrast = NULL, name = NULL) {
names(res_df)[names(res_df) == "log2FoldChange"] = "log2FC"
names(res_df)[names(res_df) == "lfcMLE"] = "log2FCunshrunk"

# save results as xlsx
message("num genes padj<0.90: ", nrow(subset(res_df, padj < 0.9)))
message("num genes padj<0.20: ", nrow(subset(res_df, padj < 0.2)))
message("num genes padj<0.05: ", nrow(subset(res_df, padj < 0.05)))
message("num genes padj<0.01: ", nrow(subset(res_df, padj < 0.01)))

# save differential expression results in Excel format
res_xlsx = paste0("dge.", gsub(pattern = " ", replacement = "-", x = res_name), ".xlsx")
write.xlsx2(x = res_df, file = res_xlsx, sheetName = res_name, col.names = TRUE, row.names = FALSE)
message("results genes: ", nrow(res_df))
message("save results xlsx: ", res_xlsx)

# save results as xlsx (padj<0.05)
# save significant (padj<0.05) differential expression results in Excel format
res_padj005_xlsx = gsub(pattern = ".xlsx", replacement = ".q005.xlsx", x = res_xlsx)
res_padj005_df = subset(res_df, padj < 0.05)
write.xlsx2(x = res_padj005_df, file = res_padj005_xlsx, sheetName = res_name, col.names = TRUE, row.names = FALSE)
message("num genes padj<0.9: ", nrow(subset(res_df, padj < 0.9)))
message("num genes padj<0.2: ", nrow(subset(res_df, padj < 0.2)))
message("num genes padj<0.05: ", nrow(subset(res_df, padj < 0.05)))
message("num genes padj<0.01: ", nrow(subset(res_df, padj < 0.01)))
message("save filtered results xlsx: ", res_padj005_xlsx)

# heatmap values matrix
# heatmap variance stabilized values matrix
vsd = assay(varianceStabilizingTransformation(dds, blind = TRUE))

# all samples and the subset used for the comparison
samples_all = colnames(deseq_dataset)
samples_comp = samples_all
if(!is.null(contrast)) samples_comp = rownames(subset(deseq_dataset@colData, group %in% contrast[2:3]))

# create a separate directory for heatmaps
heatmaps_dir = "heatmaps"
if (!dir.exists(heatmaps_dir)) dir.create(heatmaps_dir)

# heatmap gene subsets (list with genes, plot title, and file suffix)
hmg = list()
hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:50], title = "50 Most Significant", file_suffix = "top")
hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:100], title = "100 Most Significant", file_suffix = "top")
hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:1000], title = "1000 Most Significant", file_suffix = "top")
hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.10)), title = "q < 0.1", file_suffix = "q010")
hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.05)), title = "q < 0.05", file_suffix = "q005")
hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.01)), title = "q < 0.01", file_suffix = "q001")
hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.001)), title = "q < 0.001", file_suffix = "q0001")
hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, pvalue < 0.05)), title = "p < 0.05", file_suffix = "p005")
hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, pvalue < 0.01)), title = "p < 0.01", file_suffix = "p001")
hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:50],
title = "50 Most Significant",
file_suffix = "top")
hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:100],
title = "100 Most Significant",
file_suffix = "top")
hmg[[length(hmg) + 1]] = list(genes = rownames(res_df)[1:1000],
title = "1000 Most Significant",
file_suffix = "top")
hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.10)),
title = "q < 0.1",
file_suffix = "q010")
hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.05)),
title = "q < 0.05",
file_suffix = "q005")
hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.01)),
title = "q < 0.01",
file_suffix = "q001")
hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, padj < 0.001)),
title = "q < 0.001",
file_suffix = "q0001")
hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, pvalue < 0.05)),
title = "p < 0.05",
file_suffix = "p005")
hmg[[length(hmg) + 1]] = list(genes = rownames(subset(res_df, pvalue < 0.01)),
title = "p < 0.01",
file_suffix = "p001")

# process every gene subset
for (i in 1:length(hmg)) {

# generate title and file suffix
hm_title = paste0(res_name, " - ", hmg[[i]]$title)
hm_file_suffix = paste0(file_suffix, ".", hmg[[i]]$file_suffix)
hm_file_prefix = paste0(heatmaps_dir, "/plot.heatmap.", file_suffix, ".", hmg[[i]]$file_suffix)

# generate heatmaps if gene list is not too small or big
if (length(hmg[[i]]$genes) > 5 && length(hmg[[i]]$genes) < 3000) {

# generate heatmap using all samples
deseq2_heatmap(mat = vsd, genes = hmg[[i]]$genes, samples = samples_all, title = hm_title, file_suffix = hm_file_suffix)
deseq2_heatmap(mat = vsd, genes = hmg[[i]]$genes, samples = samples_all,
title = hm_title, file_prefix = hm_file_prefix)

# generate heatmap using a subset of samples used for the comparison
if (length(samples_comp) < length(samples_all)) {
deseq2_heatmap(mat = vsd, genes = hmg[[i]]$genes, samples = samples_comp, title = hm_title, file_suffix = hm_file_suffix)
deseq2_heatmap(mat = vsd, genes = hmg[[i]]$genes, samples = samples_comp,
title = hm_title, file_prefix = hm_file_prefix)
}

}
Expand Down
19 changes: 13 additions & 6 deletions scripts/dge-deseq2.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,22 +64,28 @@ message("GTF median gene length: ", median(gene_lengths))
# import groups table
groups_table = read.csv(file = groups_table_file, header = TRUE, row.names = 1, colClasses = "factor")
message("groups table sample num: ", nrow(groups_table))
message("groups table groups: ", colnames(groups_table))
message("groups table samples: ", paste(rownames(groups_table), collapse = ", "))
message("groups table groups: ", paste(colnames(groups_table), collapse = ", "))

# import counts table
counts_table = read.delim(file = counts_table_file, header = TRUE, row.names = 1, check.names = FALSE, stringsAsFactors = FALSE)
message("full counts table gene num: ", nrow(counts_table))
message("full counts table sample num: ", ncol(counts_table))
message("full counts table samples: ", paste(colnames(counts_table), collapse = ", "))

# check that all samples from the groups table are found in the counts table
diff_samples = setdiff(rownames(groups_table), colnames(counts_table))
if (length(diff_samples)) stop("some samples not in counts table: ", paste(diff_samples, collapse = ", "))

# subset to samples in groups table (also sets samples to be in the same order)
counts_table = counts_table[,rownames(groups_table)]
counts_table = counts_table[, rownames(groups_table)]
message("group-subset counts table gene num: ", nrow(counts_table))
message("group-subset counts table sample num: ", ncol(counts_table))

# group info
# group info (use the first column for grouped comparisons)
group_name = colnames(groups_table)[1]
message("group name: ", group_name)
group_levels = levels(groups_table[,group_name])
group_levels = levels(groups_table[, group_name])
message("group levels: ", toString(group_levels))

# design formula
Expand All @@ -89,8 +95,9 @@ message("design formula: ", design_formula)
message(" ========== normalize ========== ")

# import raw counts and create DESeq object
# since v1.16 (11/2016), betaPrior is set to FALSE and shrunken LFCs are obtained afterwards using lfcShrink
dds = DESeqDataSetFromMatrix(countData = counts_table, colData = groups_table, design = design_formula)
dds = DESeq(dds, parallel = TRUE, BPPARAM = BiocParallel::MulticoreParam(workers = 4))
dds = DESeq(dds, betaPrior = TRUE, parallel = TRUE, BPPARAM = BiocParallel::MulticoreParam(workers = 4))

# add gene lengths (used to generate FPKM values)
if (identical(sort(names(gene_lengths)), sort(rownames(dds)))) {
Expand Down Expand Up @@ -131,7 +138,7 @@ print(plotSparsity(dds, normalized = TRUE))
dev.off()

# PCA plot
pca_plot = deseq2_pca(vsd, intgroup = c("group"), ntop = 1000)
pca_plot = deseq2_pca(vsd, intgroup = group_name, ntop = 1000)
save_plot("plot.pca.pdf", pca_plot, base_width = 8, base_height = 5, units = "in", dpi = 300)
save_plot("plot.pca.png", pca_plot, base_width = 8, base_height = 5, units = "in", dpi = 300)

Expand Down
2 changes: 1 addition & 1 deletion scripts/load-install-packages.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ load_install_packages = function(package_list) {
}

# load package
library(p, character.only = TRUE, quietly = TRUE)
suppressPackageStartupMessages(library(p, character.only = TRUE))

}

Expand Down

0 comments on commit 99699fa

Please sign in to comment.