From 446f10119ff0af808b941321849438caac160704 Mon Sep 17 00:00:00 2001
From: Francois Kroll <francois@kroll.be>
Date: Fri, 15 Apr 2022 23:02:49 +0100
Subject: [PATCH] Add files via upload

---
 oprlengths/oprlengths.R | 392 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 392 insertions(+)
 create mode 100644 oprlengths/oprlengths.R

diff --git a/oprlengths/oprlengths.R b/oprlengths/oprlengths.R
new file mode 100644
index 0000000..a98bdf7
--- /dev/null
+++ b/oprlengths/oprlengths.R
@@ -0,0 +1,392 @@
+# histogram of OPR lengths
+
+library(here)
+library(tidyr)
+library(dplyr)
+library(ggplot2)
+library(ggpubr)
+library(openxlsx)
+
+
+
+# import metadata ---------------------------------------------------------
+
+# will mainly use for the colours to use
+meta <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='samples')
+
+
+
+# small functions ---------------------------------------------------------
+
+
+beforeFirstChar <- function(stri, cha) {
+  as.character(sapply(stri, function(st){
+    substr(st,
+           1,
+           which(strsplit(st, '')[[1]] == cha) [length(which(strsplit(st, '')[[1]] == cha))] - 1)
+  }))
+}
+# eg. beforeFirstChar('1906_counts.txt', '_') returns '1906'
+
+# preallocate length vs count matrix --------------------------------------
+
+# minimum length = 0 = 5 OPRD (deletion of the entire OPR region)
+# maximum length = 123 + 100 * 3 * 8 = 2523 = 100 OPRI (can crop later)
+
+minl <- 0
+maxl <- 2523
+
+# file paths we will loop through
+fils <- c( list.files(here('haplotypephasing', 'bam_notag', 'readlengths'), full.names=TRUE) , 
+           list.files(here('haplotypephasing', 'bam_haplotagged', 'readlengths'), full.names=TRUE))
+
+# PDG # in the same order
+pdgs <- c( list.files(here('haplotypephasing', 'bam_notag', 'readlengths')) , 
+           list.files(here('haplotypephasing', 'bam_haplotagged', 'readlengths')))
+pdgs <- beforeFirstChar(pdgs, '_')
+
+# number of samples
+numsamples <- length(fils)
+
+# preallocate length vs count matrix
+opr <- as.data.frame(matrix(nrow=length(minl:maxl), ncol=numsamples+1)) # +1 as first column will be read length
+
+# add column names = sample's PDG#
+colnames(opr) <- c('length', pdgs)
+
+# fill in length column
+opr$length <- minl:maxl
+
+
+# fill in counts for each sample ------------------------------------------
+
+# small dummy dataframe which will allow us to pad the counts table
+pad <- as.data.frame(matrix(nrow=length(minl:maxl), ncol=1))
+colnames(pad) <- 'length'
+pad$length <- minl:maxl
+
+for (s in 1:length(fils)) { # for sample s
+  
+  # read the table
+  spl <- read.table(fils[s])
+  colnames(spl) <- c('count', 'length')
+  
+  # pad it with NA, so have one row for every possible length
+  splpad <- left_join(pad, spl, by='length')
+  
+  # now number of rows in sample counts should be same as OPR dataframe
+  if (nrow(opr) != nrow(splpad)) stop('\t \t \t \t >>> Error')
+  
+  # if yes -- can add the counts to the OPR dataframe
+    # match PDG# to find which column we need to fill
+  opr[,match(pdgs[s], colnames(opr))] <- splpad$count
+  
+}
+
+# now change the column names
+# (numbers as column names creates issues)
+colnames(opr)[2:ncol(opr)] <- paste0('pdg', colnames(opr)[2:ncol(opr)])
+
+
+# trim OPR ----------------------------------------------------------------
+
+# trim OPR to realistic lengths for plot
+# minimum boundary = 123 bp (WT OPR) - 48 bp (2 OPRD) - 10 bp (to safely account for minION noise) = 65 bp
+# maximum boundary = 123 bp (WT OPR) + 192 bp (8 OPRI) + 10 bp (minION noise) = 325 bp
+
+lowbound <- 65
+topbound <- 325
+
+oprt <- opr[which(opr$length==lowbound) : which(opr$length==topbound) ,] # OPR trimmed
+
+
+
+# convert to frequencies --------------------------------------------------
+
+# loop through samples, compute total number of reads, divide all counts by that
+
+count2freq <- function(sp) { # for one column of OPRt
+  freq <- sp / sum(sp, na.rm=TRUE)
+  return(freq)
+}
+
+oprtf <- as.data.frame(cbind(oprt$length, # OPR trimmed / frequencies
+                             apply(oprt[,2:ncol(oprt)], 2, count2freq)))
+colnames(oprtf)[1] <- 'length'
+
+
+
+# compute possible lengths of OPR variants --------------------------------
+# so can add some marks to plot
+
+ref <- 9 * 1 * 3 + 8 * 4 * 3 # reference = 1 nonapeptide + 4 octapeptide
+# OPR I / D are simply +/- full octapeptides, so +/- multiples of 8 * 3 = 24 bp
+oct <- 8 * 3
+
+possiblelengths <- c(ref-2*oct,
+                     ref-1*oct,
+                     ref,
+                     ref+1*oct,
+                     ref+2*oct,
+                     ref+3*oct,
+                     ref+4*oct,
+                     ref+5*oct,
+                     ref+6*oct,
+                     ref+7*oct,
+                     ref+8*oct,
+                     ref+9*oct,
+                     ref+10*oct)
+
+# matching labels
+oprlabels <- c('2OPRD',
+               '1OPRD',
+               'reference',
+               '1OPRI',
+               '2OPRI',
+               '3OPRI',
+               '4OPRI',
+               '5OPRI',
+               '6OPRI',
+               '7OPRI',
+               '8OPRI',
+               '9OPRI',
+               '10OPRI')
+
+
+# plot for one sample -----------------------------------------------------
+
+oprTrace <- ggplot(oprtf, aes(x=length, y=pdg1906)) +
+  geom_col() +
+  geom_vline(xintercept=possiblelengths, linetype=2) +
+  theme_minimal() +
+  theme(
+    #panel.grid.major.x=element_blank(),
+    #panel.grid.major.y=element_blank(),
+    panel.grid.minor.x=element_blank(),
+    panel.grid.minor.y=element_blank(),
+    axis.title.x=element_blank(),
+    axis.title.y=element_blank(),
+    
+  )
+oprTrace
+
+
+
+# plot for all samples ----------------------------------------------------
+
+oprTracer <- function(samplenum) {
+  
+  # create subset of OPR to plot
+  oprsub <- oprtf[, c(1,samplenum+1)]
+  colnames(oprsub) <- c('length', 'freq')
+  
+  # find the colour we are supposed to use
+  pdgcol <- meta$colour[which(meta$sample == pdgs[samplenum])]
+  
+  oprTrace <- ggplot(oprsub, aes(x=length, y=freq)) +
+    geom_col(fill=pdgcol) +
+    geom_vline(xintercept=possiblelengths, linetype=2) +
+    theme_minimal() +
+    theme(
+      #panel.grid.major.x=element_blank(),
+      #panel.grid.major.y=element_blank(),
+      panel.grid.minor.x=element_blank(),
+      panel.grid.minor.y=element_blank(),
+      axis.title.x=element_blank(),
+      axis.title.y=element_blank(),
+      axis.text.y=element_blank()
+      
+    ) +
+    coord_cartesian(xlim=c(lowbound-20, topbound+20),
+                    ylim=c(0, 0.1)) +
+    ggtitle(pdgs[samplenum])
+  
+  return(oprTrace)
+  
+}
+
+# preallocate list where to put all the plots
+oprtraces <- vector(mode='list', length=numsamples)
+names(oprtraces) <- pdgs
+
+# fill it
+for (s in 1:length(pdgs)) {
+  oprtraces[[s]] <- oprTracer(s)
+}
+
+# list of plots is now in PDG order (from smallest PDG number to biggest)
+# need to order according to order in Meta file
+oprtraces <- oprtraces[as.character(meta$sample)]
+
+# arrange in a grid & save
+oprGrid <- ggarrange(plotlist=oprtraces, nrow=13, ncol=2)
+ggsave(filename=here('oprlengths', 'oprtrace_grid.pdf'), plot=oprGrid, height=1000, width=240, units='mm')
+
+
+
+# grid but actual number of reads -----------------------------------------
+
+
+oprTracerReads <- function(samplenum) {
+  
+  # create subset of OPR to plot
+  oprsub <- oprt[, c(1,samplenum+1)]
+  colnames(oprsub) <- c('length', 'freq')
+  
+  # find the colour we are supposed to use
+  pdgcol <- meta$colour[which(meta$sample == pdgs[samplenum])]
+  
+  oprTrace <- ggplot(oprsub, aes(x=length, y=freq)) +
+    geom_col(fill=pdgcol) +
+    geom_vline(xintercept=possiblelengths, linetype=2) +
+    theme_minimal() +
+    theme(
+      #panel.grid.major.x=element_blank(),
+      #panel.grid.major.y=element_blank(),
+      panel.grid.minor.x=element_blank(),
+      panel.grid.minor.y=element_blank(),
+      axis.title.x=element_blank(),
+      axis.title.y=element_blank(),
+      axis.text.y=element_blank()
+      
+    ) +
+    coord_cartesian(xlim=c(lowbound-20, topbound+20),
+                    ylim=c(0, 100)) +
+    ggtitle(pdgs[samplenum])
+  
+  return(oprTrace)
+  
+}
+
+# preallocate list where to put all the plots
+oprreads <- vector(mode='list', length=numsamples)
+names(oprreads) <- pdgs
+
+# fill it
+for (s in 1:length(pdgs)) {
+  oprreads[[s]] <- oprTracerReads(s)
+}
+
+# list of plots is now in PDG order (from smallest PDG number to biggest)
+# need to order according to order in Meta file
+oprreads <- oprreads[as.character(meta$sample)]
+
+# arrange in a grid & save
+oprReads <- ggarrange(plotlist=oprreads, nrow=13, ncol=2)
+ggsave(filename=here('oprlengths', 'oprtrace_reads.pdf'), plot=oprReads, height=1000, width=240, units='mm')
+
+
+
+
+
+# prepare overlay plots ---------------------------------------------------
+
+# melt the counts
+oprtfm <- oprtf %>%
+  pivot_longer(cols=-length,
+               names_to='sample',
+               values_to='freq')
+
+# fix factors in oprtfm so same order of samples as in meta
+sampleorder <- paste0('pdg', as.character(unlist(meta$sample)))
+
+oprtfm$sample <- factor(oprtfm$sample,
+                        levels=sampleorder)
+
+# parameters common for all 3 plots
+pdfwidth <- 170
+pdfheight <- 42
+
+
+# overlay plot – inherited part1 ------------------------------------------
+
+samples2plot <- paste0('pdg', as.character(unlist(subset(meta, cohort=='inherited' & prion_disease!='control', sample))))
+colours2use <- as.character(unlist(subset(meta, cohort=='inherited' & prion_disease!='control' , colour)))
+
+
+opr_inherited <- ggplot(subset(oprtfm, sample %in% samples2plot),
+                        aes(x=length, y=freq, fill=sample)) +
+  geom_col(width=1, alpha=0.7, colour=NA) +
+  #geom_vline(xintercept=possiblelengths, linetype=2) +
+  #geom_vline(xintercept=c(lowbound-10, topbound+10)) +
+  scale_fill_manual(values=colours2use) +
+  scale_x_continuous(breaks=seq(100, 350, 50), labels=rep('', length(seq(100, 350, 50)))) +
+  theme_minimal() +
+  theme(
+    panel.background = element_rect(fill = 'white', colour=NA),
+    panel.grid.minor.x=element_blank(),
+    panel.grid.minor.y=element_blank(),
+    axis.title.x=element_blank(),
+    axis.title.y=element_blank(),
+    axis.text.y=element_blank(),
+    legend.position='none'
+  ) +
+  coord_cartesian(xlim=c(lowbound-10, topbound+10),
+                  ylim=c(0, 0.2))
+
+opr_inherited
+
+ggsave(filename=here('oprlengths', 'opr_inherited.pdf'), plot=opr_inherited, height=pdfheight, width=pdfwidth, units='mm')
+
+
+
+# overlay plot – sCJD -----------------------------------------------------
+
+samples2plot <- paste0('pdg', as.character(unlist(subset(meta, cohort=='sporadic', sample))))
+colours2use <- as.character(unlist(subset(meta, cohort=='sporadic', colour)))
+
+
+opr_sporadic <- ggplot(subset(oprtfm, sample %in% samples2plot),
+                        aes(x=length, y=freq, fill=sample)) +
+  geom_col(width=1, alpha=0.7, colour=NA) +
+  #geom_vline(xintercept=possiblelengths, linetype=2) +
+  #geom_vline(xintercept=c(lowbound-10, topbound+10)) +
+  scale_fill_manual(values=colours2use) +
+  scale_x_continuous(breaks=seq(100, 350, 50), labels=rep('', length(seq(100, 350, 50)))) +
+  theme_minimal() +
+  theme(
+    panel.background = element_rect(fill = 'white', colour=NA),
+    panel.grid.minor.x=element_blank(),
+    panel.grid.minor.y=element_blank(),
+    axis.title.x=element_blank(),
+    axis.title.y=element_blank(),
+    axis.text.y=element_blank(),
+    legend.position='none'
+  ) +
+  coord_cartesian(xlim=c(lowbound-10, topbound+10),
+                  ylim=c(0, 0.2))
+
+opr_sporadic
+
+ggsave(filename=here('oprlengths', 'opr_sporadic.pdf'), plot=opr_sporadic, height=pdfheight, width=pdfwidth, units='mm')
+
+
+
+# overlay plot – inherited x tissue ---------------------------------------
+
+samples2plot <- paste0('pdg', as.character(unlist(subset(meta, cohort=='inherited_tissuecomparison', sample))))
+colours2use <- as.character(unlist(subset(meta, cohort=='inherited_tissuecomparison', colour)))
+
+opr_tissue <- ggplot(subset(oprtfm, sample %in% samples2plot),
+                   aes(x=length, y=freq, fill=sample)) +
+  geom_col(width=1, alpha=0.7, colour=NA) +
+  #geom_vline(xintercept=possiblelengths, linetype=2) +
+  #geom_vline(xintercept=c(lowbound-10, topbound+10)) +
+  scale_fill_manual(values=colours2use) +
+  scale_x_continuous(breaks=seq(100, 350, 50), labels=rep('', length(seq(100, 350, 50)))) +
+  theme_minimal() +
+  theme(
+    panel.background = element_rect(fill = 'white', colour=NA),
+    panel.grid.minor.x=element_blank(),
+    panel.grid.minor.y=element_blank(),
+    axis.title.x=element_blank(),
+    axis.title.y=element_blank(),
+    axis.text.y=element_blank(),
+    legend.position='none'
+  ) +
+  coord_cartesian(xlim=c(lowbound-10, topbound+10),
+                  ylim=c(0, 0.2))
+
+opr_tissue
+
+ggsave(filename=here('oprlengths', 'opr_tissue.pdf'), plot=opr_tissue, height=pdfheight, width=pdfwidth, units='mm')
\ No newline at end of file