Transcripts_to_DESeq2.Rmd

---
title: "Guide for the Differential Expression Analysis of RNAseq data using DESeq2"
author: "David Requena & Daniel Guevara"
date: "2024-06-20"
output: html_document
---

# 1. Set up
## Install libraries
```{r , eval=FALSE, echo=FALSE}
# One-time installation of the packages used in this guide.
# This chunk is not evaluated when knitting (eval=FALSE): run it manually.
# Base packages ('tools', 'stats') ship with R and must not be installed.

# CRAN packages
cran_packages <- c(
  "knitr", "matrixStats", "RColorBrewer", "gplots", "ggplot2", "tsne",
  "BatchJobs", "gridExtra", "ggrepel", "dendextend", "xlsx", "openxlsx",
  "randomcoloR", "readr", "ggsignif",
  # Attached in "Call required libraries" or called with `::` further down
  "plotly", "svglite", "VennDiagram", "cowplot", "gtable", "scales",
  "tibble", "htmlwidgets", "remotes"
)
install.packages(cran_packages)

if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# Bioconductor packages (BiocManager also resolves CRAN packages)
# In Linux, first do: sudo apt-get install libxml2-dev (needed by XML)
bioc_packages <- c(
  "XML", "BiocParallel", "DESeq2", "biomaRt", "pheatmap", "preprocessCore",
  "sva", "vioplot", "ggdendro", "tidyr", "dplyr", "umap", "ensembldb",
  "tximeta",
  # Attached in "Call required libraries"
  "tximport", "GenomicFeatures", "MatrixGenerics", "clusterProfiler",
  "org.Hs.eg.db", "AnnotationDbi", "DOSE"
)
BiocManager::install(bioc_packages, dependencies = TRUE)

# OmicsKit is installed from GitHub, after its CRAN/Bioconductor dependencies
remotes::install_github(repo = 'BigMindLab/OmicsKit')
```


## Call required libraries
```{r, message = FALSE}
# Attach every package used by the chunks below.
# `quietly = TRUE` is spelled out (T is an ordinary, reassignable variable).
library("OmicsKit", quietly = TRUE)
library("knitr", quietly = TRUE)
library("tidyr", quietly = TRUE)
library("dplyr", quietly = TRUE)
library("openxlsx", quietly = TRUE)
library("tximport", quietly = TRUE)
library("ensembldb", quietly = TRUE)
library("GenomicFeatures", quietly = TRUE)
library("RColorBrewer", quietly = TRUE)
library("gplots", quietly = TRUE)
library("ggplot2", quietly = TRUE)
library("ggrepel", quietly = TRUE)
library("tsne", quietly = TRUE)
library("umap", quietly = TRUE)
library("MatrixGenerics", quietly = TRUE)
library("DESeq2", quietly = TRUE)
library("BatchJobs", quietly = TRUE)
library("BiocParallel", quietly = TRUE)
library("biomaRt", quietly = TRUE)
library("pheatmap", quietly = TRUE)
library("randomcoloR", quietly = TRUE)
library("plotly", quietly = TRUE)
library("svglite", quietly = TRUE)
library("VennDiagram", quietly = TRUE)
library("ggsignif", quietly = TRUE)
library("gridExtra", quietly = TRUE)
library("cowplot", quietly = TRUE)
library("gtable", quietly = TRUE)
library("scales", quietly = TRUE)
library("clusterProfiler", quietly = TRUE)
library("org.Hs.eg.db", quietly = TRUE)
library("AnnotationDbi", quietly = TRUE)
library("DOSE", quietly = TRUE)
```

## Set working directory
```{r}
# library() stops with an error if knitr is missing; require() would only
# return FALSE and let the chunk fail later with a less helpful message.
library("knitr")

# Echo the code of every chunk and resolve relative paths from this folder
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = "./")

# Inputs of the workflow
sample_data <- "samples_data.xlsx" # File of the samples' metadata
counts_dir <- "./Counts" # Indicate directory where quantification files are stored
```


# 2. Pre-processing workflow
## Sample metadata
```{r}
# Read the samples' metadata. The columns id, group, gp, sex and age_cat
# are used below and must be present in the spreadsheet.
sampledata <- data.frame(read.xlsx(sample_data))

# `group` is an ordered factor: N < T < M
sampledata$group <- factor(sampledata$group, ordered = TRUE, levels = c("N", "T", "M"))

# Remaining categorical variables
sampledata$gp <- factor(sampledata$gp)
sampledata$sex <- factor(sampledata$sex)
sampledata$age_cat <- factor(sampledata$age_cat)

# Index the table by sample ID and add a running sample number
rownames(sampledata) <- sampledata$id
sampledata$num <- seq_len(nrow(sampledata)) # safe even for an empty table
```

## Managing quantification files
```{r}
# Choose the tool that produced the quantification files in `counts_dir`:
#   "salmon"   -> <id>_quant.sf
#   "kallisto" -> <id>_abundance.tsv
# Only the selected tool is imported. Importing both in sequence would
# overwrite the first result and fail if the other tool's files are absent.
quant_tool <- "salmon"

quant_suffix <- switch(quant_tool,
                       salmon = "_quant.sf",
                       kallisto = "_abundance.tsv",
                       stop("quant_tool must be \"salmon\" or \"kallisto\"", call. = FALSE))

count_files <- file.path(counts_dir, paste0(sampledata$id, quant_suffix))
names(count_files) <- sampledata$id

# Fail early, listing every missing file
missing_files <- count_files[!file.exists(count_files)]
if (length(missing_files) > 0) {
  stop("Quantification files not found: ",
       paste(missing_files, collapse = ", "), call. = FALSE)
}

# Transcript-level import (txOut = TRUE), keeping the original counts
txi <- tximport(count_files, type = quant_tool, txOut = TRUE, countsFromAbundance = "no")

# Filtering out null counts
counts.tx <- txi$counts
counts.tx <- counts.tx[rowSums(counts.tx) > 0, , drop = FALSE]

# Columns follow the order of `count_files`, i.e. the rows of sampledata
sampledata$tx_counts <- colSums(counts.tx)

# Output (overwrite = TRUE so that the chunk can be re-run)
# NOTE(review): other chunks write "Samples_with_Counts.xlsx" (different case);
# consider unifying the file name.
write.xlsx(sampledata, file = "samples_with_counts.xlsx", colNames = TRUE, rowNames = FALSE,
           append = FALSE, overwrite = TRUE)
```

### Counts per sample
```{r}
# Fill colours for the sample groups (reused by later chunks)
my_colors <- c("paleturquoise1", "lightpink", "#6691C7", "#E54155")

# The output folder must exist before saving
dir.create("plots", showWarnings = FALSE, recursive = TRUE)

# Bar plot of the total transcript counts per sample, coloured by group
p.counts <- ggplot(data = sampledata, aes(x = id, y = tx_counts, fill = group)) +
  geom_bar(stat = "identity", color = "black", linewidth = 0.3) +
  scale_fill_manual(values = my_colors) +
  scale_y_continuous(expand = expansion(mult = c(0, .01)), labels = scales::comma,
                     breaks = seq(0, 7e+7, by = 0.5e+7)) +
  xlab("Sample ID") + ylab("Counts") +
  theme_bw() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.y = element_blank(),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        axis.title = element_text(size = 16))

p.counts

ggsave("plots/Counts-Transcript_per_Sample.jpg", p.counts, width = 10, height = 8, dpi = 300)
```

## [OPTIONAL] Get the annotation of the transcript IDs, including transcript names, gene names and their description:
```{r, echo=FALSE, eval=FALSE}
# Annotate the transcript IDs found in the imported counts.
# This chunk is not evaluated when knitting (eval=FALSE).
tx2gene <- get_annotations(
  rownames(txi$counts),
  version = "103",
  format = "xlsx"
)
```

## Transcripts into genes
```{r}
# Transcript-to-gene table; it must contain the columns transcriptID, geneID
# and the annotation columns listed in `annotations` below.
tx2gene <- data.frame(read.xlsx("tx2gene_GRCh38v103.xlsx"))

# Collapse the transcript-level quantification to gene level
txi.toGene <- summarizeToGene(txi, tx2gene[, c("transcriptID", "geneID")])

# Add gene counts (genes with zero counts in all samples are dropped)
counts.gene <- as.data.frame(txi.toGene$counts)
counts.gene <- counts.gene[rowSums(counts.gene) > 0, ]
sampledata$gene_counts <- colSums(counts.gene)

write.xlsx(sampledata, file = "Samples_with_Counts.xlsx", colNames = TRUE, rowNames = FALSE,
           append = FALSE, overwrite = TRUE)

# Load geneID annotations: one row per gene
annotations <- c("symbol", "biotype", "chromosome", "gene_start", "gene_end", "gene_length", "description")
geneID.details <- tx2gene[, c("geneID", annotations)]
geneID.details <- geneID.details[!duplicated(geneID.details), ]

# Add annotations to the raw gene counts
counts.gene_annotations <- counts.gene
counts.gene_annotations$geneID <- rownames(counts.gene_annotations)
counts.gene_annotations <- add_annotations(counts.gene_annotations, geneID.details, variables = annotations)

write.xlsx(counts.gene_annotations, file = "Raw_Gene_Counts_Annotated.xlsx", colNames = TRUE,
           rowNames = FALSE, append = FALSE, overwrite = TRUE)
```

## Filter out blacklists
```{r}
# Blacklists: tab-separated files with a header; the first column holds the
# gene IDs that are matched against the row names of the count tables.
blacklist_nRibo <- read.delim("blacklist_nuclear-ribo.tsv", header = TRUE)
blacklist_mtRibo <- read.delim("blacklist_mito-ribo.tsv", header = TRUE)
blacklist_allRibo <- read.delim("blacklist_all-ribo.tsv", header = TRUE)

# Keep the genes in each blacklist
genecounts.nRibo <- counts.gene[rownames(counts.gene) %in% blacklist_nRibo[, 1], ]
genecounts.mtRibo <- counts.gene[rownames(counts.gene) %in% blacklist_mtRibo[, 1], ]
# ... and the genes outside the combined blacklist
genecounts.filtered <- counts.gene[!(rownames(counts.gene) %in% blacklist_allRibo[, 1]), ]

# Per-sample totals of each subset
sampledata$nRibo <- colSums(genecounts.nRibo)
sampledata$mtRibo <- colSums(genecounts.mtRibo)
sampledata$filtered <- colSums(genecounts.filtered)

# overwrite = TRUE: an earlier chunk has already written this file
write.xlsx(sampledata, file = "samples_with_counts.xlsx", colNames = TRUE, rowNames = FALSE,
           append = FALSE, overwrite = TRUE)

# Filter the txi object: drop the blacklisted genes from every matrix.
# drop = FALSE keeps each element a matrix, even with a single sample.
keep <- !(rownames(txi.toGene$counts) %in% blacklist_allRibo[, 1])

txi.toGene$abundance <- txi.toGene$abundance[keep, , drop = FALSE]
txi.toGene$counts <- txi.toGene$counts[keep, , drop = FALSE]
txi.toGene$length <- txi.toGene$length[keep, , drop = FALSE]
```

### Counts per blacklist
```{r}
# The output folder must exist before saving
dir.create("plots", showWarnings = FALSE, recursive = TRUE)

# One bar plot per blacklist column of sampledata.
# `.data[[bl]]` selects the column by name; it replaces the deprecated
# aes_string() interface.
for (bl in c("nRibo", "mtRibo")) {
  p.sub <- ggplot(data = sampledata, aes(x = id, y = .data[[bl]], fill = group)) +
    theme_bw() + theme(panel.grid.major.x = element_blank(), panel.grid.minor.y = element_blank()) +
    geom_bar(stat = "identity", color = "black", linewidth = 0.3) + scale_fill_manual(values = my_colors) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    scale_y_continuous(expand = expansion(mult = c(0, .01)), labels = scales::comma) +
    xlab("Sample ID") + ylab(bl) + theme(axis.title = element_text(size = 16))

  print(p.sub)

  ggsave(paste0("plots/Counts-Transcript_per_Sample_", bl, ".jpg"),
         p.sub, width = 10, height = 8, dpi = 300)
}

# Filtered: counts left after removing the genes of the combined blacklist
p.filt <- ggplot(data = sampledata, aes(x = id, y = filtered, fill = group)) +
  theme_bw() + theme(panel.grid.major.x = element_blank(), panel.grid.minor.y = element_blank()) +
  geom_bar(stat = "identity", color = "black", linewidth = 0.3) +
  scale_fill_manual(values = my_colors) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_y_continuous(expand = expansion(mult = c(0, .05)), labels = scales::comma,
                     breaks = seq(0, 7e+7, by = 0.5e+7)) +
  xlab("Sample ID") + ylab("Filtered") + theme(axis.title = element_text(size = 16))

p.filt

ggsave("plots/Counts-Transcript_per_Sample_filtered.jpg", p.filt,
       width = 10, height = 8, dpi = 300)
```

## Scaled TPMs
```{r}
# Calculate the TPM. Each column should then add up to 1 million.
# NOTE(review): this assumes add_annotations() kept the rows of
# counts.gene_annotations in the same order as counts.gene - confirm.
gene.tpm <- tpm(counts.gene, counts.gene_annotations$gene_length)
gene.tpm <- data.frame(gene.tpm)
gene.tpm.annotated <- add_annotations(gene.tpm, geneID.details, variables = annotations)

write.xlsx(gene.tpm.annotated, file = "Gene_Counts_TPM.xlsx", colNames = TRUE, rowNames = FALSE,
           append = FALSE, overwrite = TRUE)

# Keep the genes in each blacklist
geneTPM.nRibo <- gene.tpm[rownames(gene.tpm) %in% blacklist_nRibo[, 1], ]
geneTPM.mtRibo <- gene.tpm[rownames(gene.tpm) %in% blacklist_mtRibo[, 1], ]
geneTPM.allRibo <- gene.tpm[rownames(gene.tpm) %in% blacklist_allRibo[, 1], ]

# Per-sample TPM totals of each blacklist. All columns are summed directly:
# the previous `[1:length(geneTPM.nRibo)]` selected every column anyway and
# indexed two of the three tables with the wrong object's length.
sampledata$TPM_nRibo <- colSums(geneTPM.nRibo)
sampledata$TPM_mtRibo <- colSums(geneTPM.mtRibo)
sampledata$TPM_allRibo <- colSums(geneTPM.allRibo)

write.xlsx(sampledata, file = "Samples_with_Counts.xlsx", colNames = TRUE, rowNames = FALSE,
           append = FALSE, overwrite = TRUE)
```


# 3. Differential Expression Analysis with DESeq2
```{r}
# Build the DESeq2 dataset from the gene-level tximport object.
# Note: Salmon counts are going to be rounded
ds.deseq <- DESeqDataSetFromTximport(txi.toGene, sampledata, ~gp) # Add other variables depending on identified confounders (i.e ~gp+sex)

# Fix the level order so that "N" is the reference level of the design
colData(ds.deseq)$gp <- factor(colData(ds.deseq)$gp,
                               levels = c("N", "T", "M"))

# Note that read quantifications are rounded in DESeq, so Salmon counts < 0.5 are now 0
nozero <- rowSums(counts(ds.deseq)) > 0
ds.deseq <- ds.deseq[nozero, ] # Remove genes with zero counts
ds.deseq <- estimateSizeFactors(ds.deseq)

######################
# Normalized gene counts
normalized.counts <- as.data.frame(counts(ds.deseq, normalized = TRUE))

# Sample IDs of each group. %in% never returns NA, so samples without a
# group are skipped instead of producing NA column names.
N.ids <- as.character(sampledata$id[sampledata$gp %in% "N"])
T.ids <- as.character(sampledata$id[sampledata$gp %in% "T"])
M.ids <- as.character(sampledata$id[sampledata$gp %in% "M"])

# Per-group means. drop = FALSE keeps a data frame even when a group has a
# single sample (rowMeans() fails on a plain vector).
normalized.counts$Mean.N <- rowMeans(normalized.counts[, N.ids, drop = FALSE])
normalized.counts$Mean.T <- rowMeans(normalized.counts[, T.ids, drop = FALSE])
normalized.counts$Mean.M <- rowMeans(normalized.counts[, M.ids, drop = FALSE])

# Adding gene name and description
normalized.counts.annotated <- add_annotations(normalized.counts, geneID.details, variables = annotations)
write.xlsx(normalized.counts.annotated, file = "Normalized_Gene_counts.xlsx", colNames = TRUE,
           rowNames = FALSE, append = FALSE, overwrite = TRUE)
######################

# Variance Stabilizing Transformation
transf.data <- varianceStabilizingTransformation(ds.deseq)
```

## Explore confounders
### PCA
```{r}
# The output folder must exist before saving
dir.create("plots/dim_reduction", showWarnings = FALSE, recursive = TRUE)

# PCA (PC1 vs PC2) using all the genes of the transformed data.
# Fill colour encodes `group`, point shape encodes `sex`, and each point is
# labelled with the sample number.
pca12 <- nice_PCA(transf.data, PCs = c(1, 2), ntop = nrow(assay(transf.data)),
                  variables = c("group", "sex"),
                  legend_names = c("Group", "Sex"),
                  size = 9, alpha = 1, colors = c(my_colors), shapes = 21:22, # Add violet to colors when grouping by age
                  legend_title = 10, legend_elements = 8, legend_pos = NULL,
                  labels = c(var = "num", size = 3))#, name_tags = c(var = "num", size = 2, minlen = 2, box = 0.6))

pca12

# NOTE(review): the PNG is saved with height 6 and the SVG with height 5 -
# confirm that the difference is intended.
ggsave("plots/dim_reduction/PCA12_sex.png", pca12, width = 7, height = 6, dpi = 600)
ggsave("plots/dim_reduction/PCA12_sex.svg", pca12, width = 7, height = 5, dpi = 600)

#ggsave("plots/dim_reduction/PCA12_age.png", pca12, width = 7, height = 6, dpi = 300)
#ggsave("plots/dim_reduction/PCA12_age.svg", pca12, width = 7, height = 5, dpi = 600)
```

### tSNE
```{r}
# tSNE parameters (also encoded in the output file names)
iter <- 10000 # Maximum number of iterations
seed <- 0     # Seed passed to nice_tSNE()
plex <- 3     # Perplexity

# The output folder must exist before saving
dir.create("plots/dim_reduction", showWarnings = FALSE, recursive = TRUE)

p.tsne <- nice_tSNE(object = transf.data, seed = seed, perplexity = plex,
                    max_iterations = iter, returnData = FALSE,
                    variables = c("group", "sex"), colors = my_colors,
                    shapes = 21:22, size = 9, alpha = 1,
                    legend_names = c("Group", "Sex"),
                    legend_title = 10, legend_elements = 8, legend_pos = NULL,
                    #name_tags = c(var = "id", size = 2, minlen = 2, box = 0.6),
                    labels = c(var = "num", size = 3))
p.tsne

# Shared file name stem, e.g. "tSNE_s0_p3_i10k"
tsne_stem <- paste0("plots/dim_reduction/tSNE_s", seed, "_p", plex, "_i", (iter/1000), "k")

ggsave(paste0(tsne_stem, "_sex.png"), p.tsne, width = 6, height = 5, dpi = 600)
ggsave(paste0(tsne_stem, "_sex.svg"), p.tsne, width = 6, height = 5, dpi = 600)

#ggsave(paste0(tsne_stem, "_age.png"), p.tsne, width = 6, height = 5, dpi = 300)
#ggsave(paste0(tsne_stem, "_age.svg"), p.tsne, width = 6, height = 5, dpi = 600)
```

### UMAP
```{r}
# UMAP configuration, shared by the 2D and 3D sections below
umap.params <- umap.defaults
umap.params$n_neighbors <- 4
umap.params$n_components <- 3
umap.params$n_epochs <- 20000
umap.params$random_state <- 1
umap.params$transform_state <- 1
umap.params$verbose <- TRUE

# The output folder must exist before saving
dir.create("plots/dim_reduction", showWarnings = FALSE, recursive = TRUE)

# Shared file name stem, e.g. "Umap_n4_3D". It is built from `umap.params`
# because `umap_data` does not exist yet when the first plots are saved.
umap_stem <- paste0("plots/dim_reduction/Umap_n", umap.params$n_neighbors, "_",
                    umap.params$n_components, "D")

############
# 2D UMAP
############
p.umap <- nice_UMAP(object = transf.data,
                    neighbors = umap.params$n_neighbors,
                    components = umap.params$n_components,
                    epochs = umap.params$n_epochs,
                    returnData = FALSE,
                    variables = c("group", "sex"), colors = my_colors,
                    shapes = 21:22, size = 9, alpha = 1,
                    legend_names = c("Group", "Sex"),
                    legend_title = 10, legend_elements = 8, legend_pos = NULL,
                    #name_tags = c(var = "id", size = 2, minlen = 2, box = 0.6),
                    labels = c(var = "num", size = 3))
p.umap

ggsave(paste0(umap_stem, ".png"), p.umap, width = 6, height = 5.5, dpi = 600)
ggsave(paste0(umap_stem, ".svg"), p.umap, width = 6, height = 5.5, dpi = 600)

############
# 3D UMAP
############
# Samples must be in rows, hence the transposition
umap_data <- umap(t(assay(transf.data)), config = umap.params)

### my_colors <- c("paleturquoise1", "khaki1", "lightpink")
u_shapes <- c("circle", "square")
u_font <- list(family = "arial", size = 10, color = toRGB("grey10"))
u_strokes <- c("black", "red", "blue") # Color-blind safe

# Embedding coordinates (X1, X2, X3) joined with the sample metadata by id
df.umap <- data.frame(umap_data$layout) %>%
  tibble::rownames_to_column("id") %>%
  dplyr::inner_join(sampledata, by = "id")


## 3D UMAP plot
# "middle center" is the valid plotly value for centred text;
# "center" is not an accepted textposition.
umap_plot <- plot_ly(data = df.umap, x = ~X1, y = ~X2, z = ~X3,
                     type = "scatter3d", mode = "markers",
                     color = ~gp, colors = c(my_colors),
                     marker = list(size = 10),
                     stroke = I("black"),
                     symbol = ~sex, symbols = u_shapes,
                     size = I(750), strokes = u_strokes) %>%
  add_text(text = ~num, textfont = u_font,
           textposition = "middle center", showlegend = FALSE) %>%
  layout(scene = list(xaxis = list(title = 'UMAP 1'),
                      yaxis = list(title = 'UMAP 2'),
                      zaxis = list(title = 'UMAP 3')))

htmlwidgets::saveWidget(umap_plot, paste0(umap_stem, ".html"))
```

### Heatmap of samples
```{r}
######################
# Samples Heatmap
######################
# Distances between samples on the transformed data
sampleDists <- dist(t(assay(transf.data)))
sampleDistMatrix <- as.matrix(sampleDists)

# The output folder must exist before saving
dir.create("plots/dim_reduction", showWarnings = FALSE, recursive = TRUE)

# The same heatmap is drawn four times; only the sample labels change
heatmap_labels <- list(
  v1 = paste(transf.data$id, transf.data$group, sep = "_"),  # Sample IDs + complete phenotypes
  v2 = paste(transf.data$id, transf.data$gp, sep = "_"),     # Sample IDs + short phenotypes
  v3 = paste(transf.data$num, transf.data$group, sep = "_"), # Numbers + complete phenotypes
  v4 = paste(transf.data$num, transf.data$gp, sep = "_")     # Numbers + short phenotypes
)

for (version in names(heatmap_labels)) {
  rownames(sampleDistMatrix) <- heatmap_labels[[version]]
  colnames(sampleDistMatrix) <- heatmap_labels[[version]]

  # pheatmap() draws the plot when called, so it also shows up inside a loop
  heatmap_samples <- pheatmap(sampleDistMatrix, clustering_method = "complete", angle_col = 90)

  # Heatmap_samples_<version>.svg / .png
  for (ext in c("svg", "png")) {
    ggsave(paste0("plots/dim_reduction/Heatmap_samples_", version, ".", ext),
           plot = heatmap_samples, width = 7, height = 7, dpi = 600)
  }
}
```

## Obtain comparisons
```{r}
# Thresholds used by the results, tables and plots below
cutoff_alpha <- 0.25 # Cut off p-value
cutoff_fold <- 1 # Cut off fold-change

# Fit the model with a standard model matrix and without a beta prior
ds.deseq <- DESeq(ds.deseq,
                  modelMatrixType = "standard",
                  betaPrior = FALSE)

# Diagnostic plot of DESeq2's dispersion estimates
plotDispEsts(ds.deseq)

# Coefficients (possible comparisons) available for the design provided
resultsNames(ds.deseq)
```


# 3. Results
```{r}
# For each comparison: extract the results with BH (FDR) adjustment, print
# the summary and preview up to 10 genes with padj below cutoff_alpha.
# which() drops the NA padj values, like the explicit !is.na() test.

# T vs N
res.T_N <- results(ds.deseq,
                   name = "gp_T_vs_N",
                   altHypothesis = "greaterAbs",
                   alpha = cutoff_alpha,
                   # independentFiltering = FALSE, # Automatic filtering
                   pAdjustMethod = "BH")

summary(res.T_N)
head(res.T_N[which(res.T_N$padj < cutoff_alpha), ], n = 10)

# M vs N
res.M_N <- results(ds.deseq,
                   name = "gp_M_vs_N",
                   altHypothesis = "greaterAbs",
                   alpha = cutoff_alpha,
                   pAdjustMethod = "BH")

summary(res.M_N)
head(res.M_N[which(res.M_N$padj < cutoff_alpha), ], n = 10)

# M vs T, as the contrast of the two coefficients against N
res.M_T <- results(ds.deseq,
                   contrast = list("gp_M_vs_N", "gp_T_vs_N"),
                   altHypothesis = "greaterAbs",
                   alpha = cutoff_alpha,
                   pAdjustMethod = "BH")

summary(res.M_T)
head(res.M_T[which(res.M_T$padj < cutoff_alpha), ], n = 10)

```

## Annotate results
```{r}
# Add the gene annotations to each results table and rename column 7.
# NOTE(review): column 7 is presumably the gene ID column that
# add_annotations() places after the six DESeq2 result columns - confirm.

# T vs N
res.T_N <- add_annotations(res.T_N, geneID.details, variables = annotations)
colnames(res.T_N)[7] <- "ensembl"

# M vs N
res.M_N <- add_annotations(res.M_N, geneID.details, variables = annotations)
colnames(res.M_N)[7] <- "ensembl"

# M vs T
res.M_T <- add_annotations(res.M_T, geneID.details, variables = annotations)
colnames(res.M_T)[7] <- "ensembl"

# Significance: genes with a non-missing padj below cutoff_alpha
res.T_N.sig <- subset(res.T_N, ((padj < cutoff_alpha) & !is.na(padj)))
res.M_N.sig <- subset(res.M_N, ((padj < cutoff_alpha) & !is.na(padj)))
res.M_T.sig <- subset(res.M_T, ((padj < cutoff_alpha) & !is.na(padj)))

# NOTE(review): the file names hard-code 0.25 (they do not follow
# cutoff_alpha) and contain "<", which is not allowed in Windows file names.
# They are kept because a later chunk reads "res_M_T_padj<0.25.xlsx".
write.xlsx(res.T_N.sig, file = "res_T_N_padj<0.25.xlsx", colNames = TRUE, rowNames = FALSE,
           append = FALSE, overwrite = TRUE)
write.xlsx(res.M_N.sig, file = "res_M_N_padj<0.25.xlsx", colNames = TRUE, rowNames = FALSE,
           append = FALSE, overwrite = TRUE)
write.xlsx(res.M_T.sig, file = "res_M_T_padj<0.25.xlsx", colNames = TRUE, rowNames = FALSE,
           append = FALSE, overwrite = TRUE)
```

## Exploring DEGs
```{r}
# Define the sets: significant genes of each comparison
sets_dys <- list("T vs N" = res.T_N.sig$ensembl,
                 "M vs N" = res.M_N.sig$ensembl,
                 "M vs T" = res.M_T.sig$ensembl)

# The output folder must exist before the PNG is written
dir.create("plots", showWarnings = FALSE, recursive = TRUE)

# Create the Venn diagram for the three sets (written directly to file)
venn.diagram(sets_dys, filename = "plots/venn_dysregulated.png",
             main.fontface = "bold", main.pos = c(0.5, 0.95),
             fill = c("paleturquoise1", "khaki1", "lightpink"), alpha = 0.5,
             cat.pos = c(0, -30, 360), cat.dist = c(0.2, 0.2, 0.2),
             height = 1000, width = 1000, resolution = 150,
             imagetype = "png", units = "px")

```

## Detectability
```{r}
# Detectable differentially expressed genes.
# NOTE(review): the column range 1:21 and the sample positions 1:3 / 4:6 /
# 7:9 are hard-coded for one particular dataset; they presumably refer to
# columns of `norm.counts` - adapt them to your own sample layout.
detect_list <- detect_filter(
  norm.counts = normalized.counts[, 1:21],
  samples.baseline = 1:3,
  samples.condition1 = 4:6,
  samples.condition2 = 7:9,
  samples.condition3 = NULL,
  df.BvsA = res.T_N.sig,
  df.CvsA = res.M_N.sig,
  df.DvsA = NULL,
  cutoffs = c(50, 50, 0)
)

# Show the genes flagged as detectable
detect_list$DetectGenes
```

## Split cases
```{r}
# Exclusive cases
# The significance cut-off follows cutoff_alpha (0.25) instead of repeating
# the literal value.
# NOTE(review): `df.BvsC` receives the M vs T results; confirm that
# split_cases() expects the comparison in this direction.
DEGs_sig <- split_cases(df.BvsA = res.T_N,
                        df.CvsA = res.M_N,
                        df.BvsC = res.M_T,
                        unique_id = "ensembl",
                        significance_var = "padj",
                        significance_cutoff = cutoff_alpha,
                        change_var = "log2FoldChange",
                        change_cutoff = 0)

# Consider applying this loop in order to filter the whole detectability list by a new threshold
for (i in names(DEGs_sig)) {
  case_df <- DEGs_sig[[i]]

  # Keep only the detectable genes
  case_df <- case_df[rownames(case_df) %in% detect_list$DetectGenes, ]

  # which() skips NA padj values; a bare logical subset would add all-NA rows
  DEGs_sig[[i]] <- case_df[which(case_df$padj < 0.05), ]
}
```

## Filter results data frames
```{r}
# Show the ID, symbol and trend of the genes in the selected cases
case_columns <- c("ensembl", "symbol", "trend")

DEGs_sig$Case4[, case_columns]
DEGs_sig$Case5[, case_columns]
DEGs_sig$Case6[, case_columns]
DEGs_sig$Case8[, case_columns]
DEGs_sig$Case9[, case_columns]

# Create detectable data frames: significant genes whose row names are
# listed in detect_list$DetectGenes
detectable_ids <- detect_list$DetectGenes

res.M_T.det <- res.M_T.sig[is.element(rownames(res.M_T.sig), detectable_ids), ]
res.M_N.det <- res.M_N.sig[is.element(rownames(res.M_N.sig), detectable_ids), ]
res.T_N.det <- res.T_N.sig[is.element(rownames(res.T_N.sig), detectable_ids), ]
```


# 4. Plotting
## BSV (Box-Scatter-Violin) plots
```{r}
#############
# Apply BSV #
#############
# Box-Scatter-Violin plot of the log2 normalized counts of one gene per plot.
# Example applied to detectable genes in comparison M vs T (Metastasis vs Tumor)

# The output folder must exist before saving
dir.create("plots/BSV", showWarnings = FALSE, recursive = TRUE)

# The normalized counts matrix is the same for every gene: compute it once
norm_counts_all <- counts(ds.deseq, normalized = TRUE)

# seq_len() gives an empty loop when there are no detectable genes
# (1:length(x) would iterate over 1 and 0)
for (i in seq_len(nrow(res.M_T.det))) {

    gene_id <- res.M_T.det[i, "ensembl"]
    gene_symbol <- as.character(res.M_T.det[i, "symbol"])

    # Genes without a symbol are labelled (and saved) with their gene ID,
    # so that they do not overwrite each other as "NA.jpg"
    gene_label <- if (is.na(gene_symbol) || !nzchar(gene_symbol)) gene_id else gene_symbol

    # Extracting the vector of counts for that gene
    gene_counts <- norm_counts_all[gene_id, ]
    log2_gc <- log2(gene_counts)

    # Zero counts become -Inf: keep only the finite values for the axis limits
    finite_vals <- log2_gc[is.finite(log2_gc)]
    if (length(finite_vals) == 0) {
      warning("Skipping ", gene_label, ": no finite log2 counts", call. = FALSE)
      next
    }

    # Making a dataframe for the plot
    df.box <- data.frame(colData(ds.deseq)[, c("id", "group", "sex",
                                               "age", "age_cat", "num")], log2_gc)

    # Re-ordering sample_type for the plot
    df.box$group <- factor(df.box$group,
                           levels = c("N", "T", "M"),
                           labels = c("N", "T", "M"))

    # Plot
    p.bs <- ggplot(df.box, aes(x = group, y = log2_gc)) + theme_bw() +

      # Violin plot
      geom_violin(alpha = 0.1, scale = "width", fill = "yellow", colour = "peru",
                  show.legend = FALSE, trim = TRUE) +

      # Scatter plot
      geom_point(data = df.box, aes(fill = sex), shape = 21,
                 size = 5, alpha = 1, position = position_jitter(width = 0.15)) +

      # Box plot with error bar (a thick segment marking the median)
      geom_boxplot(width = 0.1, fill = "gray90") +
      geom_errorbar(stat = "boxplot", width = 0.4, linewidth = 1, color = "black",
                    aes(ymin = after_stat(middle), ymax = after_stat(middle))) +

      # Axis labels
      labs(x = NULL, y = expression("log"[2]*"(Norm. Counts)"), title = paste0(gene_label)) +

      # Axis text size
      theme(axis.text.x = element_text(size = 24),
            axis.text.y = element_text(size = 16),
            axis.title.y = element_text(size = 22),
            title = element_text(size = 26)) +

      # Legend setting (override.aes expects a plain list)
      scale_fill_manual(name = "Sex", values = c("M" = "#619CFF", "F" = "#F8766D"),
                        guide = guide_legend(override.aes = list(shape = 21, size = 7))) +

      # Legend size
      theme(legend.title = element_text(size = 14), legend.text = element_text(size = 13),
            panel.grid.major = element_line(colour = "gray80", linetype = "dashed", linewidth = 0.25),
            panel.grid.minor = element_line(colour = "gray80", linetype = "dashed", linewidth = 0.25))


    # Set max and min values without considering +/- Inf values
    min_value <- min(finite_vals)
    max_value <- max(finite_vals)

    # Generating y position for the significance bars
    ## Objective: maintain same proportion in distance for each plot
    y_range <- max_value - min_value
    prop <- 0.08*y_range

    # Significance labels of the three pairwise comparisons for this gene
    sig_data <- data.frame(gene = rep(gene_label, 3),
                           group1 = c("N",
                                      "T",
                                      "N"),
                           group2 = c("T",
                                      "M",
                                      "M"),
                           ypos = c(max_value + prop, max_value + prop,
                                    max_value + prop*2.2),
                           label = c(get_stars(gene_id, res.T_N),
                                     get_stars(gene_id, res.M_T),
                                     get_stars(gene_id, res.M_N)))

    # Ticks by same unit
    p.bs <- p.bs + scale_y_continuous(breaks = seq(floor(min_value), ceiling(max_value), by = 1))
    p.bs <- p.bs + geom_signif(data = sig_data, manual = TRUE, size = 2, colour = "grey5",
                               aes(xmin = group1, xmax = group2, annotations = label, y_position = ypos),
                               textsize = 6.5, vjust = 0, tip_length = 0.015)

    print(p.bs)

    # Pass the plot explicitly instead of relying on the last plot displayed
    ggsave(filename = paste0("plots/BSV/", gene_label, ".jpg"),
           plot = p.bs, width = 7, height = 7.5, dpi = 300)

}
```

## Volcano plots
```{r}
# Parameters
## padj and FC tresholds
ctf_nice <- 0.01
fld_nice <- cutoff_fold * 4

##########
## M vs T
##########
d.volcano <- data.frame(res.M_T)
d.volcano <- d.volcano[!is.na(d.volcano$padj), ]
d.volcano$log2FoldChange[d.volcano$log2FoldChange > 8] <- 8
d.volcano$log2FoldChange[d.volcano$log2FoldChange < -8] <- -8
d.volcano$padj[d.volcano$padj < 10**-3] <- 10**-3
d.volcano$padj[d.volcano$padj > 10**8] <- 10**8

# d.volcano$symbol[is.na(d.volcano$symbol) | (d.volcano$symbol == "")] <- "NS"

d.volcano$colors <- rep("other", nrow(d.volcano))
d.volcano$colors[(d.volcano$log2FoldChange >= cutoff_fold & d.volcano$padj < cutoff_alpha)] <- "over"
d.volcano$colors[(d.volcano$log2FoldChange <= -cutoff_fold & d.volcano$padj < cutoff_alpha)] <- "under"

d.volcano$shapes <- rep("nohits", nrow(d.volcano))
d.volcano$shapes[(abs(d.volcano$log2FoldChange) >= fld_nice & d.volcano$padj == 10**-3)] <- "hits"

cond1 <- !( (abs(d.volcano$log2FoldChange) <= cutoff_fold) | (d.volcano$padj > cutoff_alpha) )
cond2 <- !( (abs(d.volcano$log2FoldChange) <= fld_nice) | (d.volcano$padj > ctf_nice) )
cond3 <- d.volcano$ensembl %in% detect_list$DetectGenes & d.volcano$padj < 0.1 & abs(d.volcano$log2FoldChange) > 1
cond <- cond3 | (cond1 & cond2)


p.volcano <- ggplot() + theme_bw() +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 3.07),
                     breaks = seq(1, 5, by = 1),
                     minor_breaks = seq(0, 5, by = 1)) +
  scale_x_continuous(expand = c(0,0),
                     limits = c(-9.5, 9.5),
                     breaks = seq(-9, 9, by = 3),
                     minor_breaks = seq(-9, 9, by = 1)) +
  
  # Vertical lines and labels:
  geom_vline(xintercept = c(-cutoff_fold, cutoff_fold), color = "red2", alpha = 0.8, linetype = 2, linewidth = 1.2) +
  geom_text(aes(x = -cutoff_fold - 1.3, y = 2.75, label = paste("x =", -cutoff_fold)), color = "red2", size = 6.2) +
  geom_text(aes(x = cutoff_fold + 1.3, y = 2.75, label = paste("x =", cutoff_fold)), color = "red2", size = 6.2) +
  
  # Horizontal line and label:
  geom_hline(yintercept = -log10(cutoff_alpha), color = "red2", alpha = 0.8, linetype = 2, linewidth = 1.2) +
  geom_text(aes(x = -7.35, y = -log10(cutoff_alpha) + 0.15, label = paste("q =", cutoff_alpha)), color = "red2", size = 6.2) +
  
  # Datapoints, in different colors:
  geom_point(data = d.volcano,
             aes(x = log2FoldChange, y = -log10(padj), fill = colors, shape = shapes),
             size = 5.5, color = "gray10", alpha = 0.8, show.legend = TRUE) +
  
  # Color setting
  ## "over" = "coral", "under" = "deepskyblue"
  scale_fill_manual(name = "Expression",
                    values = c("red", "grey50", "blue"),
                    guide = guide_legend(override.aes = aes(shape = 21, size = 5)),
                    breaks = c("over", "other", "under"),
                    labels = c("Up-regulated", "Not changed", "Down-regulated")) +
  
  # Change outliers shapes
  scale_shape_manual(values = c("hits" = 24, "nohits" = 21),
                     guide = "none") +
  
  # Labels for the datapoints selected:
  #geom_text_repel(aes(x = d.volcano$log2FoldChange[cond1 & cond2 & !cond3],
  #                    y = -log10(d.volcano$padj[cond1 & cond2 & !cond3]),
  #                    label = d.volcano$symbol[cond1 & cond2 & !cond3]),
  #                inherit.aes = FALSE, parse = FALSE, max.iter = 5000, color = "black", cex = 4.3,
  #                nudge_x = 0.2, nudge_y = -0.15, segment.alpha = 0,
  #                box.padding = unit(0.1, "lines"), min.segment.length = unit(0.01, "lines")) +
  
  # Labels for detectable genes
  geom_label_repel(aes(x = d.volcano$log2FoldChange[(d.volcano$log2FoldChange > 0) & cond3],
                       y = -log10(d.volcano$padj[(d.volcano$log2FoldChange > 0) & cond3]),
                       label = d.volcano$symbol[(d.volcano$log2FoldChange > 0) & cond3]),
                   inherit.aes = FALSE, parse = FALSE, max.iter = 5000, color = "black", cex = 8.2,
                   nudge_x = 0.2, nudge_y = -0.15, segment.alpha = 1,
                   box.padding = unit(0.8, "lines"), min.segment.length = unit(0.01, "lines")) +
  
  geom_label_repel(aes(x = d.volcano$log2FoldChange[(d.volcano$log2FoldChange < 0) & cond3],
                       y = -log10(d.volcano$padj[(d.volcano$log2FoldChange < 0) & cond3]),
                       label = d.volcano$symbol[(d.volcano$log2FoldChange < 0) & cond3]),
                   inherit.aes = FALSE, parse = FALSE, max.iter = 5000, color = "black", cex = 8.2,
                   nudge_x = -0.2, nudge_y = -0.15, segment.alpha = 1,
                   box.padding = unit(0.8, "lines"), min.segment.length = unit(0.01, "lines")) +
  
  # Axis labels:
  labs(x = expression("log"[2]*"(Fold Change)"),
       y = expression("-log"[10]*"FDR"),
       title = "M over B/Tc-") +
  
  # Labels size
  theme(plot.title = element_text(size = 26, face = "bold", hjust = 0.5),
        axis.text = element_text(size = 22),
        axis.title = element_text(size = 26),
        legend.text = element_text(size = 14),
        legend.title = element_text(size= 16))

# Optional global text-size tweak, currently disabled:
# p.volcano <- p.volcano + theme(element_text(20))

# Hide the legend (this overrides the `show.legend = TRUE` set on the point layer).
p.volcano <- p.volcano + theme(legend.position = "none")
# Display the final volcano plot.
p.volcano

# Export the plot as 8 x 8 JPEG and SVG files.
# NOTE(review): assumes the "plots/" directory already exists; ggsave() does not
# appear to be given anything here that would create it - confirm.
ggsave("plots/M_T_VolcanoPlot.jpg", p.volcano, width = 8, height = 8, dpi = 300)
ggsave("plots/M_T_VolcanoPlot.svg", p.volcano, width = 8, height = 8, dpi = 300)

```

## Heatmap samples vs genes
```{r}
# Variance-stabilized expression matrix taken from the fitted DESeq2 object.
varst.data <- getVarianceStabilizedData(ds.deseq)

# Annotation colors, one per sample group.
colors_samples <- list(Group = c("N" = "aquamarine2",
                                 "T" = "#6691C7",
                                 "M" = "#E54155"))

## Genes to plot
# Alternative gene source. It used to be loaded here and was then immediately
# overwritten by the rbind() below, so it is kept only as a disabled option:
# genestumor <- read.xlsx("res_M_T_padj<0.25.xlsx")

# Detectable genes with padj < 0.1 in either comparison.
# which() discards rows whose padj is NA; a bare logical index would turn each
# of them into an all-NA row.
genestumor <- rbind(detect_list$Comparison1[which(detect_list$Comparison1$padj < 0.1), c("ensembl", "symbol")],
                    detect_list$Comparison2[which(detect_list$Comparison2$padj < 0.1), c("ensembl", "symbol")])
genestumor <- unique(genestumor)

# Keep the selected genes. drop = FALSE preserves the matrix shape even when a
# single gene is selected.
varst.data.sig <- varst.data[rownames(varst.data) %in% genestumor$ensembl, , drop = FALSE]

# Label each row with the symbol of ITS OWN Ensembl ID. The rows follow the
# order of `varst.data`, not the order of `genestumor`, so the symbols must be
# looked up with match() instead of being assigned positionally.
gene_symbols <- as.character(genestumor$symbol)[match(rownames(varst.data.sig), genestumor$ensembl)]
# Fall back to the Ensembl ID when no symbol is available.
no_symbol <- is.na(gene_symbols) | gene_symbols == ""
gene_symbols[no_symbol] <- rownames(varst.data.sig)[no_symbol]
rownames(varst.data.sig) <- gene_symbols

# Column annotation: one row per sample, holding its group.
# NOTE(review): this assumes the rows of `sampledata` are in the same order as
# the columns of the expression matrix - confirm.
anot <- data.frame(sampledata$group, row.names = colnames(varst.data.sig))
colnames(anot) <- c("Group")
anot$Group <- factor(anot$Group)
# Sort the samples by group so that each group forms a contiguous column block.
anot <- anot[order(anot$Group, decreasing = FALSE), , drop = FALSE]
varst.data.sig <- varst.data.sig[, rownames(anot)]

# Replace the sample names by their position in `sampledata$id`.
sample_index <- match(colnames(varst.data.sig), sampledata$id)
rownames(anot) <- sample_index
colnames(varst.data.sig) <- sample_index

# Row-scaled heatmap; rows are clustered, columns keep the group order.
# NOTE(review): `gaps_col` is hard-coded; adjust it to the actual group sizes.
heatmap_M_T <- pheatmap(varst.data.sig, cluster_cols = FALSE, cluster_rows = TRUE, show_rownames = TRUE,
                        clustering_method = "mcquitty", cellwidth = 9, cellheight = 9, treeheight_col = 50,
                        treeheight_row = 50, scale = "row", annotation = anot, angle_col = 90,
                        annotation_colors = colors_samples, kmeans_k = NA, dpi = 100, drop_levels = FALSE,
                        cutree_rows = 2, gaps_col =  c(3, 6, 9))

# NOTE(review): the file names mention padj 0.25 while the genes above are
# selected with padj < 0.1 - confirm which threshold is intended.
ggsave("plots/HeatMap_samples_detect_(padj_0.25)_WPGMA.svg",
       plot = heatmap_M_T, width = 6.2, height = 5, dpi = 600, units = "in")
ggsave("plots/HeatMap_samples_detect_(padj_0.25)_WPGMA.png",
       plot = heatmap_M_T, width = 6.2, height = 5, dpi = 300, units = "in")
```

## More plots
```{r}
## DESeq2's dispersion estimates
# Diagnostic plot of the dispersion estimates stored in the fitted object.
plotDispEsts(ds.deseq)

## Histogram of p-values
# One histogram per comparison (T vs N, M vs N, M vs T).
# NOTE(review): the `.sig` suffix suggests these tables are already filtered to
# significant genes; if so, the histograms only show the low end of the p-value
# distribution - confirm whether the unfiltered results were intended.
hist(res.T_N.sig$pvalue, breaks=10, col="grey50", border="white")
hist(res.M_N.sig$pvalue, breaks=8, col="grey50", border="white")
hist(res.M_T.sig$pvalue, breaks=8, col="grey50", border="white")

## MA plots
# One MA plot per comparison, with the y axis limited to [-5, 5].
plotMA(res.T_N, ylim = c(-5, 5))
plotMA(res.M_N, ylim = c(-5, 5))
plotMA(res.M_T, ylim = c(-5, 5))
```


# 5. Pathway Enrichment Analysis
## ORA with clusterProfiler
```{r}
# Prepare the list of significant genes
# Extract only the genes that have a significant p-value (e.g., p < 0.05).
# The IDs are read directly from `res.M_T`, so this chunk does not depend on
# `ranked.M_T`, which is only built in the GSEA chunk further down.
# which() discards genes whose p-value is NA instead of propagating NA IDs.
significant_genes <- res.M_T$ensembl[which(res.M_T$pvalue < 0.05)]
significant_genes <- unique(significant_genes[!is.na(significant_genes)])

# The three calls below run the same over-representation test on each GO
# ontology. `keyType = "ENSEMBL"` declares the ID type of `significant_genes`.

# BP (Biological Process)
enrichBP.M_T <- enrichGO(gene = significant_genes,
                         OrgDb = "org.Hs.eg.db",
                         keyType = "ENSEMBL",
                         ont = "BP",
                         pvalueCutoff = 0.05,
                         qvalueCutoff = 0.2,
                         readable = TRUE)

enrichBP.M_T.df <- as.data.frame(enrichBP.M_T)
# View() needs an interactive session; skip it when the document is knitted.
if (interactive()) View(enrichBP.M_T.df)

dotplot(enrichBP.M_T, showCategory = 20)
barplot(enrichBP.M_T, showCategory = 20)

# CC (Cellular Component)
enrichCC.M_T <- enrichGO(gene = significant_genes,
                         OrgDb = "org.Hs.eg.db",
                         keyType = "ENSEMBL",
                         ont = "CC",
                         pvalueCutoff = 0.05,
                         qvalueCutoff = 0.2,
                         readable = TRUE)

enrichCC.M_T.df <- as.data.frame(enrichCC.M_T)
if (interactive()) View(enrichCC.M_T.df)

dotplot(enrichCC.M_T, showCategory = 20)
barplot(enrichCC.M_T, showCategory = 20)

# MF (Molecular Function)
enrichMF.M_T <- enrichGO(gene = significant_genes,
                         OrgDb = "org.Hs.eg.db",
                         keyType = "ENSEMBL",
                         ont = "MF",
                         pvalueCutoff = 0.05,
                         qvalueCutoff = 0.2,
                         readable = TRUE)

enrichMF.M_T.df <- as.data.frame(enrichMF.M_T)
if (interactive()) View(enrichMF.M_T.df)

dotplot(enrichMF.M_T, showCategory = 20)
barplot(enrichMF.M_T, showCategory = 20)
```

## GSEA with clusterProfiler
```{r}
##########
# M vs T #
##########
# Build the ranked gene list: the `stat` column of the M vs T results, sorted
# in decreasing order, without missing values, and named by Ensembl ID.
ranked.M_T <- res.M_T[order(res.M_T$stat, decreasing = TRUE), ]
ranked.M_T <- ranked.M_T[!is.na(ranked.M_T$stat), ]
ranked.list.M_T <- ranked.M_T$stat
names(ranked.list.M_T) <- ranked.M_T$ensembl

# The three calls below run the same analysis on each GO ontology.

# BP
gseBP.M_T <- gseGO(geneList = ranked.list.M_T,
                     ont = "BP",
                     OrgDb = "org.Hs.eg.db",
                     keyType = "ENSEMBL",
                     eps = 1e-300,
                     verbose = TRUE)

gseBP.M_T.df <- as.data.frame(gseBP.M_T)
# View() needs an interactive session; skip it when the document is knitted.
if (interactive()) View(gseBP.M_T.df)

# Enrichment plot of the first gene set in the result, then a summary dot plot.
gseaplot(x = gseBP.M_T, geneSetID = 1)
dotplot(gseBP.M_T)

# CC
gseCC.M_T <- gseGO(geneList = ranked.list.M_T,
                     ont = "CC",
                     OrgDb = "org.Hs.eg.db",
                     keyType = "ENSEMBL",
                     eps = 1e-300,
                     verbose = TRUE)

gseCC.M_T.df <- as.data.frame(gseCC.M_T)
if (interactive()) View(gseCC.M_T.df)

gseaplot(x = gseCC.M_T, geneSetID = 1)
dotplot(gseCC.M_T)

# MF
gseMF.M_T <- gseGO(geneList = ranked.list.M_T,
                     ont = "MF",
                     OrgDb = "org.Hs.eg.db",
                     keyType = "ENSEMBL",
                     eps = 1e-300,
                     verbose = TRUE)

gseMF.M_T.df <- as.data.frame(gseMF.M_T)
if (interactive()) View(gseMF.M_T.df)

gseaplot(x = gseMF.M_T, geneSetID = 1)
dotplot(gseMF.M_T)
```

## GSEA with Broad Institute's CLI
### Prepare inputs
```{r}
# Make sure the output folder exists before writing into it.
dir.create("GSEA", showWarnings = FALSE, recursive = TRUE)

# Expression data
# The table is reduced to two leading columns, renamed NAME and DESCRIPTION,
# followed by the sample columns.
expression_data_CCC <- normalized.counts.annotated
expression_data_CCC <- expression_data_CCC[, c(26, 33, 1:21)] # Change indexes according to the amount of samples
colnames(expression_data_CCC)[1:2] <- c("NAME", "DESCRIPTION")
# The description is not used: blank it out (written as "NA" in the file).
expression_data_CCC$DESCRIPTION <- rep(NA, nrow(expression_data_CCC))

# Write to a tab-separated file
write.table(expression_data_CCC, file = "GSEA/expression_data_CCC.txt",
            sep = "\t", row.names = FALSE, quote = FALSE)

# Phenotype info
## One class label per sample, in sample order
# NOTE(review): other chunks read the group from `sampledata$group`; confirm
# that `sampledata$gp` is the intended column.
sample_classes <- as.character(sampledata$gp)
# Stop early instead of silently writing a malformed .cls file when the column
# is missing or incomplete.
stopifnot(length(sample_classes) == nrow(sampledata), !anyNA(sample_classes))

## Class names and number of classes
# In a .cls file the names on the second line are paired with the labels on
# the third line by order of first appearance, so both are derived from the
# labels themselves rather than hard-coded or taken from factor levels.
class_names <- unique(sample_classes)
n_classes <- length(class_names)

## Create .cls file content
cls_header <- paste(nrow(sampledata), n_classes, "1")
cls_classes <- paste("#", paste(class_names, collapse = " "))
cls_labels <- paste(sample_classes, collapse = " ")

cls_content <- paste(cls_header, cls_classes, cls_labels, sep = "\n")

## Write to .cls file
write(cls_content, file = "GSEA/phenotype_CCC.cls")
```

### Manage results
```{r}
########################################
# Function to manage GSEA results data #
########################################
# Tidy one GSEA report table and tag it with its origin.
#
# Arguments:
#   gseaD      Data frame holding a GSEA report. Columns 2-3 and 10-12 are
#              dropped; of the remaining ones, the 5th to 7th are renamed to
#              NOM_pval, FDR_qval and FWER_pval.
#   collection Label stored in the new leading COLLECTION column.
#   phenotype  Label stored in the new PHENOTYPE column.
#   comparison Label stored in the new COMPARISON column.
#   fdr_floor  Smallest FDR used when computing "-Log10FDR". An FDR q-value of
#              0 would otherwise give -log10(0) = Inf, which cannot be mapped
#              to a point size in a plot. Use fdr_floor = 0 to get the raw,
#              unbounded transformation.
#
# Returns:
#   The data frame with COLLECTION as first column, followed by the kept report
#   columns, then "-Log10FDR", PHENOTYPE and COMPARISON. FDR_qval itself is
#   left untouched.
sortGSEAdata <- function (gseaD, collection, phenotype, comparison, fdr_floor = 1e-4) {
  # Remove the report columns that are not needed downstream.
  gseaD <- gseaD[, -c(2:3, 10:12)]

  # Append the collection label, then move it to the first position.
  # rep() keeps the assignments valid for a report with zero rows.
  gseaD$COLLECTION <- rep(collection, nrow(gseaD))
  gseaD <- gseaD[, c(ncol(gseaD), seq_len(ncol(gseaD) - 1))]

  colnames(gseaD)[6:8] <- c("NOM_pval", "FDR_qval", "FWER_pval")

  # Bounded -log10 transformation of the FDR (NA values stay NA).
  gseaD[, "-Log10FDR"] <- -log10(pmax(gseaD$FDR_qval, fdr_floor))

  gseaD$PHENOTYPE <- rep(phenotype, nrow(gseaD))
  gseaD$COMPARISON <- rep(comparison, nrow(gseaD))

  gseaD
}

## Example: retrieve the results of a GSEA run on the Hallmarks collection ("H")
## for the M vs T comparison.
# The tab-separated report is read into a data frame and passed to
# sortGSEAdata(), which tags every row with the labels given below.
# NOTE(review): the report file name ends in "_T" while the phenotype label is
# "Metastasis" - confirm that this is the intended report.
M_T_gsea <- sortGSEAdata(data.frame(read.csv("GSEA/H/gsea_report_T.tsv", sep = "\t")),
                         collection = "H",
                         phenotype = "Metastasis",
                         comparison = "M_vs_T")
```

### Balloon plots
```{r}
# Keep the gene sets with FDR q-value < 0.25.
# which() discards rows whose q-value is NA; a bare logical index would turn
# each of them into an all-NA row.
balloon_data <- M_T_gsea[which(M_T_gsea$FDR_qval < 0.25), ]

# Balloon plot: one point per gene set, colored by NES and sized by -log10(FDR),
# with one facet per collection (rows) and comparison (columns).
p.balloon <- ggplot(balloon_data, aes(x = COMPARISON, y = NAME, color = NES, size = `-Log10FDR`)) +
  geom_point() +
  #scale_size_continuous(range = c(0, 1.5), breaks = c(0, 0.5, 1, 1.5)) +  # Adjust the circle size
  # Diverging palette centered at NES = 0.
  scale_color_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, na.value = "gray90") +
  facet_grid(COLLECTION ~ COMPARISON, space = "free", scales = "free") +
  theme_classic() +
  theme(
    # The x axis text and title are hidden: the facet strips already name the comparison.
    axis.text.x = element_blank(),#element_text(angle = 30, hjust = 1),
    strip.background = element_rect(fill = "gray90", colour = "white", linewidth = 1),
    strip.text.x = element_text(face = "bold", size = 10),
    strip.text.y = element_text(face = "bold", size = 10, angle = 90, hjust = 0.5),
    panel.spacing = unit(0.5, "lines"),
    panel.grid.major = element_line(linewidth = 0.2, linetype = 'solid', color = "grey90"),
    panel.grid.minor = element_blank(),
    axis.title.x = element_blank(), #element_text(face = "bold"),
    axis.title.y = element_text(face = "bold")
  ) +
  labs(size = "-Log10FDR", color = "NES", y = "PATHWAYS", x = "PHENOTYPE") +
  # Draw the size legend with hollow black circles instead of NES-colored points.
  guides(size = guide_legend(override.aes = list(shape = 21, fill = "white", color = "black")))

# Display the plot.
p.balloon

# Save the revised Balloon Plot
# The plot object is passed explicitly so the saved figure does not depend on
# which plot happened to be drawn last.
ggsave("GSEA/H_(qval<0.25).jpg", plot = p.balloon, height = 13, width = 10, dpi = 600, limitsize = FALSE)
```