diff --git a/needleSam/needleSam_exploration_v2.R b/needleSam/needleSam_exploration_v2.R
deleted file mode 100644
index e38df17..0000000
--- a/needleSam/needleSam_exploration_v2.R
+++ /dev/null
@@ -1,2734 +0,0 @@
-# after needleSam.R
-  # needleSam.R creates the dataset allOPRreads.csv
-  # needleSam_exploration.R: exploring the dataset for possible OPR somatic mutations
-
-exportOrNo <- FALSE # Do you want to export the plots (useful to put FALSE when just want to load everything)
-
-# packages ----------------------------------------------------------------
-
-library(here)
-library(tidyr)
-library(dplyr)
-library(ggplot2)
-library(openxlsx)
-library(ggbeeswarm)
-library(ggpubr)
-
-
-
-# small functions ---------------------------------------------------------
-
-# function to pool columns from a dataframe into one column
-  # ! assumes that each row, there is only one element from the columns to pool which is not NA
-    # eg. at row 5, the columns we want to pool are NA, 9, NA >> gives 9
-    # but if 2, 9, NA >> Error! (as more than one element)
-
-# usage example; poolColumns(df, c('col2', 'col5'))
-poolColumns <- function(dataframe, vectorOfColNames) {
-  apply(dataframe, 1, function(row) {
-    tmp <- as.vector(unlist(c(row[vectorOfColNames]))) # concatenate the columns we need to pool at this row
-    # check there is only one not NA
-    if (sum(!is.na(tmp)) != 1) stop ('\t \t \t \t >>> Error: One or more rows has more than one element in the columns to be pooled \n')
-    # if ok, remove the NA and we are left with a single element
-    return(tmp[!is.na(tmp)]) # return for within the apply
-  })
-}
-
-###
-
-# counts & frequencies within bins of a continuous variable (each bin = one range of values) per group
-# e.g. data is number of mismatches for each read
-# and 2 groups: reference / mutated
-# for each group; how many reads in bin 0--10 mismatches; how many reads in bin 11-20 etc.
-# both in counts and frequencies (% of total within each group)
-# Note; bins represent the right boundary, e.g. a case where we have proportions, 0.1 means the bin that contains the data from 0 to 0.1
-# same in the dataframe it returns
-# example usage/ freqInBins(df = mydata, datcol = 'mismatchpercentage', grpcol = 'treated', bins=seq(0, 1, by=0.1))
-# by default grpcol = NA, meaning it does do it by any group, just takes all the data in column datcol
-freqInBins <- function(df, datcol, grpcol=NA, bins) {
-  
-  ###
-  # if no grouping
-  if (is.na(grpcol)) {
-    dat <- as.numeric(df[, datcol]) # data we need to bin
-    cos <- hist(dat, breaks=bins, include.lowest=TRUE, plot=FALSE)$counts # counts
-    fre <- cos / sum(cos) # frequencies
-    gdf <- data.frame(bin=bins[-1], counts=cos, freq=fre) # small dataframe for this group, two columns counts and frequencies
-    
-    return(gdf)
-    
-  }
-  
-  ###
-  # if grouping
-  
-  else {
-    
-    ngrps <- length(unique(df[,grpcol])) # number of groups
-    grpnms <- unique(df[,grpcol]) # group names
-    
-    cfpg <- vector(mode='list', length=ngrps) # preallocate list counts + frequencies per group
-    
-    for(i in 1:length(grpnms)) { # loop through groups
-      dat <- as.numeric(df[ which(df[,grpcol] == grpnms[i]) , datcol]) # data for this group (the data we need to bin)
-      cos <- hist(dat, breaks=bins, include.lowest=TRUE, plot=FALSE)$counts # counts
-      fre <- cos / sum(cos) # frequencies
-      
-      gdf <- data.frame(grp=grpnms[i], bin=bins[-1], counts=cos, freq=fre) # small dataframe for this group, two columns counts and frequencies
-      
-      # group column should be called the same as grpcol
-      colnames(gdf)[1] <- grpcol
-      
-      # add it to the list
-      cfpg[[i]] <- gdf
-    }
-    
-    # stick the list together and return it
-    fbing <- as.data.frame(rbindlist(cfpg)) # frequencies in each bin by group
-    return(fbing)
-  }
-  
-}
-
-
-# plot colours ------------------------------------------------------------
-
-# somatic call TRUE/FALSE
-somcols <- c('#f1876b', '#B3BDC4')
-
-# strand
-  # will use same colours as IGV
-  # forward = pink; reverse = blue
-strcols <- c('#ebb0af', '#aeafd8') # ! Forward, Reverse
-
-# haplotypes
-hapcols <- c('#f1876b', '#4D4D4D', '#B3BDC4') # ! 1, 2, 0
-
-# cohorts
-  # inherited = yellow
-  # sCJD = khaki
-  # control = grey
-cohcols <- c('#fcb505', '#78ac63', '#B3BDC4')
-
-
-
-
-# import ------------------------------------------------------------------
-
-# all the reads
-ied <- read.csv(here('needleSam', 'allOPRreads.csv'))
-
-# sample metadata
-meta <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='samples')
-
-# format meta as in needleSam.R
-  # - >> NA
-meta$SV[which(meta$SV=='-')] <- NA
-
-  # special case: PDG 46345 is 5 OPRI/1 OPRD
-    # make a new column SV2 for it (just for the 1 OPRD)
-meta <- meta %>%
-  mutate(SV2=NA, .after=SV)
-meta[which(meta$sample==46345), 'SV2'] <- unlist(strsplit(meta[which(meta$sample==46345), 'SV'], '/'))[2]
-meta[which(meta$sample==46345), 'SV'] <- unlist(strsplit(meta[which(meta$sample==46345), 'SV'], '/'))[1]
-
-  # 2 OPRD >> OPRD2, etc.
-    # first lapply takes what is after the space = OPRD or OPRI
-    # second lapply takes what is before the space = 2, 1, ...
-meta$SV <- paste0(lapply(strsplit(meta$SV, ' '), function(i) {i[2]}),
-                  lapply(strsplit(meta$SV, ' '), function(i) {i[1]}))
-    # the NA above become 'NANA', put them back to NA
-meta$SV[which(meta$SV=='NANA')] <- NA
-
-  # same for SV2
-meta$SV2 <- paste0(lapply(strsplit(meta$SV2, ' '), function(i) {i[2]}),
-                   lapply(strsplit(meta$SV2, ' '), function(i) {i[1]}))
-meta$SV2[which(meta$SV2=='NANA')] <- NA
-
-  # sample: 52331 >> pdg52331, etc.
-meta$sample <- sprintf('pdg%s', meta$sample)
-
-
-# haplotype phasing metadata
-  # eg. how many SNVs available for haplotagging
-hpmeta <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='haplotypephasing')
-    # sample: 52331 >> pdg52331, etc.
-hpmeta$sample <- sprintf('pdg%s', hpmeta$sample)
-
-  # from this:
-    # pdgs of samples that we could haplotag
-pdgs_tag <- as.character(unlist(subset(hpmeta, haplotagged, sample)))
-    # pdgs of samples we could not haplotag
-pdgs_notag <- as.character(unlist(subset(hpmeta, !haplotagged, sample)))
-
-# sequencing results summary
-seqmeta <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='sequencing_summary')
-# sample: 52331 >> pdg52331, etc.
-seqmeta$sample <- sprintf('pdg%s', seqmeta$sample)
-
-# we will also need all pdgs
-pdgs <- meta$sample
-# do not take the ones we do not have loaded (only applies if running script on subset of samples)
-pdgs <- intersect(unique(ied$sample), pdgs)
-
-
-
-
-# add sample info to ied for simplicity -----------------------------------
-
-ied <- left_join(ied, meta, by='sample')
-
-ied <- hpmeta %>%
-  select('sample', 'haplotagged', 'genebody_heterozygousSNVs') %>%
-  left_join(ied, ., by='sample')
-
-ied <- seqmeta %>%
-  select('sample', 'genebody_coverage', 'promoter_coverage') %>%
-  left_join(ied, ., by='sample')
-
-
-
-
-# Possible somatic mutations: approach 1 ----------------------------------
-
-# there is no read that has both an OPRD and a OPRI that pass mismatch threshold, which makes things easier
-subset(ied,
-       (opri_pass & oprd_pass))
-
-# interesting reads are:
-  # a- there is a mutation (not reference)
-    # ! except for PDG 46345 (compound homozygous 1 OPRD / 5 OPRI), for this sample 'reference' may be a somatic mutation
-  # b- Sanger SV is NA (eg. sCJD cases) or is different than the mutation in the read (not interesting if matches the Sanger genotype)
-  # c- One of OPRI or OPRD is below mismatch threshold
-
-# in practice:
-  # 1- all reads that match what we know from Sanger (so uninteresting here) have a true_nummis number, so can take only true_nummis = NA
-  # (true_nummis = number of mismatches vs reference, when it matches what is expected)
-    # by doing so, no need to worry about PDG 46345 (compound homozygous 1 OPRD / 5 OPRI), true_nummis is present for both OPRD1 or OPRI5 reads
-      # (and is NA if read says reference, as it should)
-  # 2- ! true_nummis = NA also includes unassigned reads, so need to exclude those
-    # (! remember to explicitly include othercategory = NA otherwise != excludes NA)
-  # 3- make use of mismatch threshold, opri_pass or oprd_pass
-
-iedSom1 <- subset(ied,
-                 is.na(true_nummis) & # 1- 
-                   (is.na(othercategory) | othercategory!='unassigned') & # 2-
-                   (opri_pass | oprd_pass | ref_pass)) # 3-, one of opri_pass, oprd_pass, ref_pass is TRUE
-
-
-# Possible somatic mutations: approach 2 ----------------------------------
-# Another approach...
-  # allows to confirm the above if we reach the same conclusion with two different logics
-
-# tag the 'expected' reads, i.e. those that do not look like a somatic mutation
-ied$expected <- NA # preallocate all as NA
-
-  # all reads whose OPRI matches their sample's SV are 'expected'
-ied[which( ied$opri == ied$SV ), 'expected'] <- TRUE
-
-  # all reads whose OPRI matches their sample's SV2 are 'expected' (only applies to PDG 46345)
-ied[which( ied$opri == ied$SV2 ), 'expected'] <- TRUE
-
-  # all reads whose OPRD matches their sample's SV are 'expected'
-ied[which( ied$oprd == ied$SV ), 'expected'] <- TRUE
-
-  # all reads whose OPRD matches their sample's SV2 are 'expected' (only applies to PDG 46345)
-ied[which( ied$oprd == ied$SV2 ), 'expected'] <- TRUE
-
-  # all reads that are reference are 'expected' (! except for PDG 46345, read above) 
-ied[which( ied$sample != 'pdg46345' & ied$othercategory == 'reference' ), 'expected'] <- TRUE
-
-# >>> at this stage, expected should tag the same rows as trueread
-identical(subset(ied, trueread, read), subset(ied, expected, read))
-
-  # lots of reads are unassigned to an OPR category
-    # >> will tag them as expected for simplicity; this will be the difference vs trueread
-ied[which( ied$othercategory == 'unassigned' ), 'expected'] <- TRUE
-
-# great majority of reads should be 'expected'
-cat('\t \t \t \t >>> ', sum(ied$expected, na.rm=TRUE), 'out of', nrow(ied), 'considered expected \n')
-cat('\t \t \t \t \t i.e.', round((sum(ied$expected, na.rm=TRUE) / nrow(ied))*100, 1), '% \n')
-
-  # tag the rest as 'not expected' (FALSE), i.e. potentially somatic mutation
-ied[which(is.na(ied$expected)), 'expected'] <- FALSE
-
-  # check there is no NA left
-sum(is.na(ied$expected))
-
-# possible somatic = 'not expected', and pass mismatch threshold
-iedSom <- subset(ied,
-                 !expected &
-                   (opri_pass | oprd_pass | ref_pass)) # one of opri_pass, oprd_pass, ref_pass is TRUE
-
-
-# >>> does it give the same reads as first logic?
-identical(iedSom$read , iedSom1$read)
-# yes, same result
-# will only keep iedSom
-rm(iedSom1)
-
-# will be useful for later;
-  # add column 'isSomatic' (TRUE or FALSE) to ied, i.e. this read is a somatic mutation call
-ied$isSomatic <- FALSE # preallocate as FALSE (most of them)
-ied[match(iedSom$read, ied$read), 'isSomatic'] <- TRUE # for the reads that are in iedSom, replace by TRUE
-# put isSomatic in order I want
-ied$isSomatic <- factor(ied$isSomatic, levels=c('TRUE', 'FALSE'))
-
-
-
-
-
-# Illustrate mismatch threshold -------------------------------------------
-
-# in read length vs number of mismatches;
-  # plot all 'potentially somatic mutations' reads and colour the ones that pass the mismatch threshold
-    # the reads that have both OPRI1 and OPRD1 make this annoyingly complicated
-    # for those, I think will simply plot the minimum mismatch of OPRI/OPRD (read length does not change)
-    # (to be honest it does not really matter as all of these reads are above the mismatch threshold anyways)
-
-# all somatic mutations candidates
-  # (i.e. iedSom from above + all not expected but which does not pass the mismatch threshold)
-iedsomall <- subset(ied,
-                    !expected)
-
-iedsomall$conmis <- NA # number of mismatches vs consensus
-
-# for reference reads, fill in conmis with reference_nummis
-iedsomall[which(!is.na(iedsomall$reference_nummis)) , 'conmis'] <- iedsomall[which(!is.na(iedsomall$reference_nummis)) , 'reference_nummis']
-
-# for OPRI reads, fill in conmis with opri_nummis
-iedsomall[which(!is.na(iedsomall$opri_nummis)) , 'conmis'] <- iedsomall[which(!is.na(iedsomall$opri_nummis)) , 'opri_nummis']
-
-# for OPRD reads, fill in conmis with oprd_nummis
-iedsomall[which(!is.na(iedsomall$oprd_nummis)) , 'conmis'] <- iedsomall[which(!is.na(iedsomall$oprd_nummis)) , 'oprd_nummis']
-
-# check no more NA
-sum(is.na(iedsomall$conmis))
-
-# ! now, any reads that has both OPRD & OPRI got the number of mismatches vs OPRD consensus (as OPRD was last)
-# for these reads, go back and take the minimum number of mismatches between OPRD and OPRI (the best match)
-rows2edit <- which( !is.na(iedsomall$opri) & !is.na(iedsomall$oprd) )
-for (r in rows2edit) {
-  iedsomall[r,'conmis'] <- min(c(iedsomall[r,'opri_nummis'] , iedsomall[r,'oprd_nummis']))
-}
-# >>> we have number of mismatches vs consensus for all candidate somatic mutations 
-
-# mismatch threshold
-  # mean of % mismatch of all true reads (those that match what we know is there) + sd
-thrMis <- mean(ied$true_misbp, na.rm=TRUE) + sd(ied$true_misbp, na.rm=TRUE) # as calculated in needleSam.R
-
-# more intuitively, the threshold excludes the
-1 - (nrow(subset(ied, trueread & true_misbp < thrMis)) / nrow(subset(ied, trueread))) # least accurate reads
-
-
-# on plot, will add markers for reference OPR lengths (eg. reference 123 bp, 1 OPRI = 123 + 24 bp, etc.)
-ref <- 9 * 1 * 3 + 8 * 4 * 3 # reference = 1 nonapeptide + 4 octapeptide
-# OPR I / D are simply +/- full Rs, so +/- multiples of 8 * 3 = 24 bp
-oct <- 8 * 3
-
-oprlengths <- c(ref-4*oct,
-                ref-3*oct,
-                ref-2*oct,
-                ref-1*oct,
-                #ref,
-                ref+1*oct,
-                ref+2*oct,
-                ref+3*oct,
-                ref+4*oct,
-                ref+5*oct,
-                ref+6*oct,
-                ref+7*oct,
-                ref+8*oct,
-                ref+9*oct,
-                ref+10*oct,
-                ref+11*oct,
-                ref+12*oct,
-                ref+13*oct)
-
-somMisThr <- ggplot(iedsomall, aes(x=read_length, y=conmis, colour=isSomatic)) +
-  geom_vline(xintercept=ref, linetype=2, colour='#b0b0b0', size=0.8) +
-  geom_vline(xintercept=oprlengths, linetype=2, colour='#ebebeb', size=0.8) +
-  geom_segment(x=0, xend=450,
-               y=0, yend=450*thrMis,
-               size=1.0, linetype=2, colour='#595E60') +
-  geom_point(size=0.8) +
-  scale_colour_manual(values=somcols) +
-  coord_cartesian(xlim=c(30,377), ylim=c(0, 100.1)) + # when cropping to 450 bp (see below), no read above 150 mismatches
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.major.x=element_blank(),
-    panel.grid.minor.x=element_blank(),
-    axis.title.x=element_text(size=9, margin = margin(t=0, r=0, b=0, l=0)),
-    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-    axis.title.y=element_text(size=9, margin = margin(t=0, r=-2, b=0, l=0)),
-    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-  ) +
-  xlab('read length') + ylab('mismatches vs template')
-somMisThr
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'somMisThr.pdf'), somMisThr, width=77, height=72, units='mm')
-}
-
-# Note; the few reads above 450 bp are crazy repetitions eg. GCGCGCGCGCGC, I think fair to crop the plot at 450 bp
-# (and not somatic calls as they have terrible % mismatch)
-
-# Note; now lowered max X to 377, it is excluding 1 read extra
-
-
-
-
-
-# Do somatic calls have worse % mismatch? ---------------------------------
-# or; are somatic reads less accurate than 'true' reads?
-
-# calculate mismatch per bp for all somatic candidates
-iedsomall$som_misbp <- iedsomall$conmis / iedsomall$read_length
-
-# mismatch per bp in iedSom currently in three columns (ref / opri / oprd), need to pool them in one
-iedSom$som_misbp <- as.numeric(unlist(poolColumns(iedSom, c('ref_misbp', 'opri_misbp', 'oprd_misbp'))))
-
-# check reads that reads which are both in iedsomall and iedSom have the same misbp
-round(iedsomall[match(iedSom$read, iedsomall$read) , 'som_misbp'], 2) == round(iedSom$som_misbp, 2)
-# (need to round or it says FALSE for numbers which are clearly equal, numbers are just stored with different levels of precision)
-
-
-# binning for mismatch (will use for histograms below)
-misbins <- seq(0, 1, by=0.01) # bins of 1% mismatch
-
-###
-###
-# first do all somatic reads (i.e. before mismatch threshold) vs trueread
-
-# all somatic reads before mismatch filtering are in iedsomall
-# counts / frequencies in bins
-som <- freqInBins(df=iedsomall, datcol='som_misbp', grpcol=NA, bins=misbins)
-# add column source
-som$source <- 'somatic'
-
-# same for all truereads
-tru <- freqInBins(df=subset(ied, trueread), datcol='true_misbp', grpcol=NA, bins=misbins)
-# add column source
-tru$source <- 'trueread'
-
-# stick the two together
-somtru <- rbind(som, tru)
-
-# plot
-misTruevsSomAll <- ggplot(somtru, aes(x=bin, y=freq, fill=source)) +
-  geom_col(position='identity', alpha=0.5) +
-  scale_fill_manual(values=somcols) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-  ) +
-  coord_cartesian(xlim=c(0, 1)) +
-  xlab('mismatch per bp') + ylab('frequency')
-misTruevsSomAll
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'misTruevsSomAll.pdf'), misTruevsSomAll, width=120, height=100, units='mm')
-}
-
-###
-###
-# now only selected somatic reads (i.e. after mismatch threshold) vs trueread
-
-# ! need to also crop trueread by the same mismatch threshold if we want to compare frequencies
-trure <- subset(ied, trueread & true_misbp < thrMis) # true reads below mismatch threshold
-tru <- freqInBins(df=trure, datcol='true_misbp', grpcol=NA, bins=misbins)
-# add column source
-tru$source <- 'trueread'
-
-# for somatic, use iedSom (i.e. iedsomall + mismatch threshold)
-som <- freqInBins(df=iedSom, datcol='som_misbp', grpcol=NA, bins=misbins)
-# add column source
-som$source <- 'somatic'
-
-# stick the two together
-somtru <- rbind(som, tru)
-
-# plot
-misTruevsSomThr <- ggplot(somtru, aes(x=bin, y=freq, fill=source)) +
-  geom_col(position='identity', alpha=0.5) +
-  scale_fill_manual(values=somcols) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-  ) +
-  coord_cartesian(xlim=c(0, thrMis+0.3*thrMis)) +
-  xlab('mismatch per bp') + ylab('frequency')
-misTruevsSomThr
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'misTruevsSomThr.pdf'), misTruevsSomThr, width=120, height=100, units='mm')
-}
-
-
-###
-# in this comparison, do somatic reads have significantly worse mismatch per bp?
-# compare with a t-test
-
-# gather the reads we plotted above
-trure <- select(trure, true_misbp)
-colnames(trure) <- 'misbp'
-trure$source <- 'trueread'
-
-somre <- select(iedSom, som_misbp)
-colnames(somre) <- 'misbp'
-somre$source <- 'somatic'
-
-misre <- rbind(trure, somre) # mismatches for reads to compare
-
-wilcox.test(misre$misbp ~ misre$source)
-t.test(misre$misbp ~ misre$source)
-
-###
-###
-
-# within truereads, are % mismatch different reference vs mutated?
-  # if yes -- above is not a fair comparison as somatic reads are almost all mutated
-  # (only PDG 46345 can have somatic calls that are reference)
-
-# take all the 'true' reads + their 'othercategory' column
-  # within truereads, in 'othercategory' column, should only be reference or NA
-  # can assume all NA are OPRI or OPRD in the other columns
-
-trumut <- subset(ied, trueread & is.na(othercategory)) # true reads mutated
-truref <- subset(ied, trueread & othercategory=='reference') # true reads reference
-
-trumutb <- freqInBins(df=trumut, datcol='true_misbp', grpcol=NA, bins=misbins)
-# add column source
-trumutb$source <- 'trueread_mutated'
-
-trurefb <- freqInBins(df=truref, datcol='true_misbp', grpcol=NA, bins=misbins)
-# add column source
-trurefb$source <- 'trueread_reference'
-
-# stick the two together
-somtru <- rbind(trurefb, trumutb)
-
-# plot
-misTrueRefvsTrueMut <- ggplot(somtru, aes(x=bin, y=freq, fill=source)) +
-  geom_col(position='identity', alpha=0.5) +
-  scale_fill_manual(values=somcols) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-  ) +
-  coord_cartesian(xlim=c(0, 0.5)) +
-  xlab('mismatch per bp') + ylab('frequency')
-misTrueRefvsTrueMut
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'misTrueRefvsTrueMut.pdf'), misTrueRefvsTrueMut, width=120, height=100, units='mm')
-}
-
-
-# compare statistically
-
-# gather the reads we plotted
-trumut <- select(trumut, true_misbp)
-trumut$source <- 'trueread_mutated'
-
-truref <- select(truref, true_misbp)
-truref$source <- 'trueread_reference'
-
-trumis <- rbind(trumut, truref)
-
-wilcox.test(trumis$true_misbp ~ trumis$source)
-t.test(trumis$true_misbp ~ trumis$source)
-# yes - it is different
-
-###
-###
-# consequence: comparing somatic reads vs all true reads is unfair
-  # as most true reads are reference, but most somatic reads are mutated
-# to tell if somatic reads really have more mismatches, need to compare with mutated truereads
-
-# take the mutated truereads as above + apply mismatch threshold
-trumut <- subset(ied, trueread & is.na(othercategory) & true_misbp < thrMis) # true reads mutated
-trumutb <- freqInBins(df=trumut, datcol='true_misbp', grpcol=NA, bins=misbins)
-# add column source
-trumutb$source <- 'trueread_mutated'
-
-# take the somatic reads as above
-somb <- freqInBins(df=iedSom, datcol='som_misbp', grpcol=NA, bins=misbins)
-# add column source
-somb$source <- 'somatic'
-
-# stick the two together
-somtru <- rbind(trumutb, somb)
-
-# plot
-misTrueMutvsSomThr <- ggplot(somtru, aes(x=bin, y=freq, fill=source)) +
-  geom_col(position='identity', alpha=0.5) +
-  scale_fill_manual(values=somcols) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-  ) +
-  coord_cartesian(xlim=c(0, thrMis+0.3*thrMis)) +
-  xlab('mismatch per bp') + ylab('frequency')
-misTrueMutvsSomThr
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'misTrueMutvsSomThr.pdf'), misTrueMutvsSomThr, width=120, height=100, units='mm')
-}
-
-
-# compare statistically
-
-# gather the reads we plotted
-trumut <- select(trumut, true_misbp)
-colnames(trumut) <- 'misbp'
-trumut$source <- 'trueread_mutated'
-
-som <- select(iedSom, som_misbp)
-colnames(som) <- 'misbp'
-som$source <- 'somatic'
-
-mis <- rbind(trumut, som)
-
-wilcox.test(mis$misbp ~ mis$source)
-t.test(mis$misbp ~ mis$source)
-# yes - it is different
-
-# it did reduce the difference but excess around 5% mismatch still present
-# difference in means now is 0.5%
-
-
-###
-###
-
-# Can it be a coincidence?
-# as we have thousands of reference reads but 'only' ~ 100--150 somatic reads
-
-# draw from mutated truereads the same number of somatic reads multiple times and test with Mann-Whitney U Test
-ndraws <- 1000
-mwu <- c() # Mann-Whitney U Test p-values
-
-for (i in 1:ndraws) {
-  
-  trudraw <- trumut[sample(nrow(trumut), nrow(iedSom)),]
-  mis <- rbind(trudraw, som)
-  
-  pval <- wilcox.test(mis$misbp ~ mis$source)$p.value
-  mwu <- c(mwu, pval)
-  
-}
-
-length(which(mwu>0.05)) / ndraws
-# it is not significant ~ 18% of the time
-
-###
-###
-# In summary; do somatic calls have worse % mismatch?
-# in our dataset, the answer is yes; precisely there seems to be a surplus of reads in the 5% mismatch range
-# however, it is possible that we were simply 'unlucky' with the batch of somatic reads we got
-# indeed, comparing our batch of somatic reads against subsets of mutated truereads only give a significant difference 82% of the time
-
-# we could be more stringent with the threshold mismatch (lower it to mean for example) and this difference would probably go away
-# however, it is important to note 5% mismatch is not surprising for reads that are correct ('trueread'), so would likely exclude some reads that are ok
-# and the consequence would be to have low number of somatic calls, which would make all the comparisons between samples below difficult/impossible
-# there is also the chance that it represents something real that is not allowed by the OPR consensus sequences
-# I will keep the mismatch threshold as is
-
-# Update: I think I found where the excess of ~ 5% mismatch reads come from
-# it might be mostly reads where the R1 is affected
-# to my knowledge this has never been reported in patients, but I do not know why strand slippage would not affect R1
-# (I guess it could only be template strand slippage?)
-# from http://www.mad-cow.org/prion_repeat_insertions.html#References; should now allow extra R1 or R4
-# this holds in the reads I have inspected closely
-# in contrast, I found at least one clean R1 deletion
-
-
-
-
-# Do somatic calls have worse haplotype scores? ---------------------------
-
-# haplotype scores is my rough attempt at putting a score for confidence of assignment to a haplotype
-# see needleSam.R for comments
-
-# if somatic calls all have bad haplotype assignment scores, would be worrying
-
-# bin the hap scores for somatic calls
-bns <- seq(0, 1, by=0.1)
-hscns <- hist(as.numeric(unlist(subset(ied, isSomatic==TRUE, hapscore))), # haplotype score counts
-              breaks=bns, include.lowest=TRUE, plot=FALSE)$counts
-somhs <- data.frame(source='somatic', binend=bns[-1], counts=hscns, freq=hscns/sum(hscns)) # d for dataframe 
-
-# bin the hap scores for trueread
-hscns <- hist(as.numeric(unlist(subset(ied, trueread, hapscore))), # haplotype score counts
-              breaks=bns, include.lowest=TRUE, plot=FALSE)$counts
-truhs <- data.frame(source='trueread', binend=bns[-1], counts=hscns, freq=hscns/sum(hscns)) # d for dataframe 
-
-# stick them together
-hsf <- rbind(somhs, truhs) # haplotype score frequencies
-
-
-# plot haplotype quality score vs somatic call or not
-hapScoreSomatic <- ggplot(hsf, aes(x=binend, y=freq, fill=source)) +
-  geom_col(position='identity', alpha=0.4) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    axis.text.y=element_blank()
-  ) +
-  scale_fill_manual(values=somcols) +
-  xlab('haplotype assignment score') + ylab('frequency')
-hapScoreSomatic
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'hapScoreSomatic.pdf'), plot=hapScoreSomatic, width=100, height=100, units='mm')
-}
-
-
-# for test; select the reads we are plotting
-iedhs <- ied %>% # ied to compare haplotype scores
-  subset(isSomatic==TRUE | trueread)
-t.test(iedhs$hapscore ~ iedhs$isSomatic)
-# somatic calls do not have worse haplotype assignment scores
-
-
-
-
-# Any evidence of strand bias? --------------------------------------------
-
-# make sure it is in same order in ied & iedSom
-ied$strand <- factor(ied$strand, levels=c('+', '-'))
-iedSom$strand <- factor(iedSom$strand, levels=c('+', '-'))
-
-# number of calls by strand
-cbstr <- iedSom %>%
-  group_by(strand) %>%
-  tally(name='countStr')
-
-# add total number of Forward/Reverse reads
-cbstr <- ied %>%
-  group_by(strand) %>%
-  tally(name='totalStr') %>%
-  left_join(cbstr, ., by='strand')
-
-# ratio of Forward/Reverse of all reads serves as baseline
-cbstr <- cbstr %>%
-  mutate(ratioTot = totalStr / sum(totalStr))
-
-# add as percentage of Forward/Reverse reads
-cbstr$somPer <- (cbstr$countStr / cbstr$totalStr) * 100 # somatic percentage
-
-# ratio of Forward/Reverse reads * total number of somatic calls will give expected counts
-cbstr$expCount <- cbstr$ratioTot * sum(cbstr$countStr)
-
-# add a dummy category for x axis below
-cbstr$dummy <- factor(1)
-
-# annoyingly, need to reverse the factor levels
-cbstr$strand <- factor(cbstr$strand, levels=c('-', '+'))
-# ! reverse as well the colours
-strcols2 <- rev(strcols)
-
-# plot as horizontal bar with ratios
-  # essentially horizontal stacked barplot
-strandPro <- ggplot(cbstr, aes(x=dummy, y=countStr, fill=strand)) +
-  geom_col(position='stack') +
-  coord_flip() +
-  scale_fill_manual(values=strcols2) +
-  geom_hline(yintercept=cbstr$expCount[2], linetype=2, colour='#ebebeb', size=0.5) +
-  theme_minimal() +
-  theme(
-    axis.title.y=element_blank(),
-    axis.text.y=element_blank(),
-    axis.title.x=element_blank(),
-    axis.text.x=element_blank(),
-    panel.grid=element_blank(),
-    legend.position='none'
-  )
-strandPro
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'strandProportion.pdf'), strandPro, width=60, height=15, units='mm')
-}
-
-
-# plot number of calls by strand
-
-# put back order
-cbstr$strand <- factor(cbstr$strand, levels=c('+', '-'))
-
-callsByStrand <- ggplot(cbstr, aes(x=strand, y=countStr, fill=strand)) +
-  geom_col() +
-  scale_fill_manual(values=strcols) +
-  geom_segment(aes(x=0.5, xend=1.5, y=as.numeric(cbstr[1, 'expCount']), yend=as.numeric(cbstr[1, 'expCount'])), # add expected counts markers
-               linetype=2, colour='#ebebeb', size=2) +
-  geom_segment(aes(x=1.5, xend=2.5, y=as.numeric(cbstr[2, 'expCount']), yend=as.numeric(cbstr[2, 'expCount'])),
-               linetype=2, colour='#ebebeb', size=2) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.y=element_blank()
-  ) +
-  ylab('somatic mutation calls')
-callsByStrand
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsByStrand.pdf'), callsByStrand, width=75, height=100, units='mm')
-}
-
-
-# anything unexpected (would be sign of artefact) or within what we expect?
-# statistical approach: Chi2 test
-# build contingency table
-cont <- table(ied$isSomatic, ied$strand)
-  # should not do Chi2 test if any expected count below 5 (from http://www.sthda.com/english/wiki/chi-square-test-of-independence-in-r)
-  # in all the reads, there is some slight surplus of Reverse reads, but we pretty much expect 50/50 Forward/Reverse in the somatic calls
-  # we have 139 calls, so around 70/70 Forward/Reverse, so no expected count will be below 5
-# now Chi2 test on contingency table
-chi <- chisq.test(cont, correct=FALSE)
-chi
-chi$expected
-chi$observed
-
-
-# another approach I like to visualise this:
-  # draw reads at random & see how many Forward/Reverse we get
-    # ! better to do the draws properly sample by sample (if any sample has some strand bias)
-    # eg. PDG 58398 has 24 calls >> draw at random 24 reads from all of its reads
-
-# how many somatic calls per sample?
-cps <- iedSom %>% # calls per sample
-  group_by(sample) %>%
-  tally(name='nSom')
-
-# function to run the random draws and output the data ready to plot
-strandDraws <- function(nsim) {
-  
-  # preallocate dataframe
-  simres <- as.data.frame(matrix(nrow=nsim, ncol=3)) # simulation results;
-    # rows = simulations, columns = simulation # / number of forward draws / number of reverse draws
-  colnames(simres) <- c('sim', 'plus', 'minus')
-  simres$sim <- 1:nrow(simres)
-  
-  # for each simulation;
-  # for each sample, how many reads do we need to draw? (i.e. number of somatic calls for this sample)
-  # draw that many reads (! from that sample)
-  # pool all the draws together
-  # count how many Forward, Reverse we got
-  # add the results to the simres
-  
-  for (i in 1:nsim) {
-    
-    cat ('\t \t \t \t Simulation', i, 'out of', nsim, '\n')
-    
-    stradra <- c() # strand drawn (will pool all the draws there)
-    
-    for (p in 1:length(cps$sample)) { # for each sample (or row of cps)
-      spl <- as.character(cps[p, 'sample']) # sample we are talking about
-      ndraw <- as.integer(cps[p, 'nSom']) # how many reads do we need to draw?
-      drawfrom <- as.character(unlist(subset(ied, sample==spl, strand))) # ! only draw from that sample's reads
-      stradra <- c(stradra, sample ( drawfrom , ndraw)) # draw that many reads from that sample's reads & add them to hapdra
-    }
-    
-    # length of stradra now should be = total number of somatic calls
-    if (length(stradra) != nrow(iedSom)) stop ('\t \t \t \t >>> Error: something wrong with the draws \n')
-    
-    # count how many of each strand we got, and add that to our simulation results
-    simres[i, 'plus'] <- sum(stradra == '+') # how many forward strand?
-    simres[i, 'minus'] <- sum(stradra == '-') # how many reverse strand?
-  }
-  
-  return(simres)
-}
-
-nsim <- 100 # ideally 1000, but takes a couple of minutes
-stradraws <- strandDraws(nsim=nsim)
-
-# real count of somatic calls from Forward strand
-nplus <- nrow(subset(iedSom, strand=='+'))
-
-# real count of somatic calls from Reverse strand
-nminus <- nrow(subset(iedSom, strand=='-'))
-
-# plot for each strand
-plusDraws <- ggplot(stradraws, aes(plus)) +
-  geom_histogram(binwidth=1) +
-  geom_vline(xintercept=nplus, linetype=2) +
-  theme_minimal() +
-  theme(
-    axis.text.y=element_blank()
-  ) +
-  coord_cartesian(xlim=c(25, 100), ylim=c(0, 80)) +
-  ylab('frequency') + xlab('somatic calls from forward strand')
-plusDraws
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'plusDraws.pdf'), plusDraws, width=100, height=100, units='mm')
-}
-
-
-# plot for each strand
-minusDraws <- ggplot(stradraws, aes(minus)) +
-  geom_histogram(binwidth=1) +
-  geom_vline(xintercept=nplus, linetype=2) +
-  theme_minimal() +
-  theme(
-    axis.text.y=element_blank()
-  ) +
-  coord_cartesian(xlim=c(25, 100), ylim=c(0, 80)) +
-  ylab('frequency') + xlab('somatic calls from reverse strand')
-minusDraws
-ggsave(filename=here('needleSam', 'minusDraws.pdf'), plusDraws, width=100, height=100, units='mm')
-
-
-
-
-# Somatic mutations calls -- from which cohort? ---------------------------
-
-# make sure it is in this order in ied / iedSom
-ied$prion_disease <- factor(ied$prion_disease, levels=c('inherited', 'sCJD', 'control'))
-iedSom$prion_disease <- factor(iedSom$prion_disease, levels=c('inherited', 'sCJD', 'control'))
-
-# v1: removing calls from brain samples here in order to keep one sample per individual
-# but does not match number in text (129 calls) if do this
-
-# calls by cohort
-cbyco <- iedSom %>%
-  group_by(prion_disease) %>%
-  tally(name='nSom')
-
-# first observation: majority from Inherited cases
-nrow(subset(iedSom, prion_disease=='inherited')) / nrow(iedSom) # all samples
-
-cat('\t \t \t \t >>> ', nrow(subset(iedSom, prion_disease=='inherited')), '/', nrow(iedSom), 'calls = ',
-    round(nrow(subset(iedSom, prion_disease=='inherited')) / nrow(iedSom) * 100, 2),
-    '% from Inherited \n')
-
-# add total number of reads from each cohort
-cbyco <- ied %>%
-  group_by(prion_disease) %>%
-  tally(name='totalCoh') %>%
-  left_join(cbyco, ., by='prion_disease')
-
-# somatic calls as percentage of reads from that cohort
-cbyco$somPer <- (cbyco$nSom / cbyco$totalCoh) * 100 # somatic percentage
-
-# add ratio each cohort of total number of reads
-cbyco <- cbyco %>%
-  mutate(proCoh=totalCoh/sum(totalCoh))
-
-# ratio each cohort * total number of somatic calls will give expected counts
-cbyco$expCount <- cbyco$proCoh * sum(cbyco$nSom)
-
-# can also add expected percentage
-# Note, this is same for all cohorts (as expected counts basically assumes always same percentage)
-cbyco$expPer <- (cbyco$expCount / cbyco$totalCoh) * 100
-
-# plot histogram of counts
-callsbyCoh <- ggplot(cbyco, aes(x=prion_disease, y=nSom, fill=prion_disease)) +
-  geom_col(width=0.7) +
-  # add expected counts markers
-  # geom_segment(aes(x=0.5, xend=1.5, y=as.numeric(cbyco[1, 'expCount']), yend=as.numeric(cbyco[1, 'expCount'])), linetype=2, colour='#ebebeb', size=2) +
-  # geom_segment(aes(x=1.5, xend=2.5, y=as.numeric(cbyco[2, 'expCount']), yend=as.numeric(cbyco[2, 'expCount'])), linetype=2, colour='#ebebeb', size=2) +
-  # geom_segment(aes(x=2.5, xend=3.5, y=as.numeric(cbyco[3, 'expCount']), yend=as.numeric(cbyco[3, 'expCount'])), linetype=2, colour='#ebebeb', size=2) +
-  scale_fill_manual(values=cohcols) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    axis.title.x=element_blank(), 
-    axis.text.x=element_text(size=9, margin = margin(t=0, r=0, b=0, l=0), angle=90),
-    axis.title.y=element_text(size=9, margin = margin(t=0, r=0, b=0, l=0)),
-    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-  ) +
-  scale_x_discrete(labels=c('inherited', 'sporadic', 'control')) +
-  ylab('somatic mutation calls')
-callsbyCoh
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsbyCohort.pdf'), callsbyCoh, width=40, height=75, units='mm')
-}
-
-
-
-# same but percentages
-callsbyCohPer <- ggplot(cbyco, aes(x=prion_disease, y=somPer, fill=prion_disease)) +
-  geom_col() +
-  # add expected percentage
-  geom_hline(yintercept=mean(cbyco$expPer), linetype=2, size=2, colour='#ebebeb') +
-  scale_fill_manual(values=cohcols) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    axis.title.x=element_blank(),
-  ) +
-  scale_x_discrete(labels=c('inherited', 'sporadic', 'control')) +
-  ylab('somatic mutation calls (% of reads)')
-callsbyCohPer
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsbyCohortPercentage.pdf'), callsbyCohPer, width=100, height=100, units='mm')
-}
-
-
-# Is there really a surplus of Inherited calls?
-# Chi2 test
-  # build contingency table
-# v1: removing brain samples from ied
-# v2: keeping all samples
-cont <- table(ied$prion_disease, ied$isSomatic)
-chi <- chisq.test(cont, correct=FALSE)
-chi
-# ! check no expected count below 5
-chi$expected
-chi$observed
-
-# alternative approach to visualise this
-  # by drawing reads at random and counting how many we have from each cohort
-
-# function to run the random draws and output the data ready to plot
-cohortDraws <- function(nsim) {
-  
-  # preallocate dataframe
-  simres <- as.data.frame(matrix(nrow=nsim, ncol=4)) # simulation results;
-  # rows = simulations, columns = simulation # / number of inherited draws / number of sCJD draws / number of control draws
-  colnames(simres) <- c('sim', 'inherited', 'sCJD', 'control')
-  simres$sim <- 1:nrow(simres)
-  
-  # for each simulation;
-  # for each sample, how many reads do we need to draw? (i.e. number of somatic calls for this sample)
-  # draw that many reads (! from that sample)
-  # pool all the draws together
-  # count how many inherited, sCJD, control
-  # add the results to the simres
-  
-  for (i in 1:nsim) {
-    
-    cat ('\t \t \t \t Simulation', i, 'out of', nsim, '\n')
-    cohdra <- sample(iedo$prion_disease, nrow(iedSomo)) # cohorts drawn
-    
-    # count how many of each cohort we got, and add that to our simulation results
-    simres[i, 'inherited'] <- sum(cohdra == 'inherited') # how many inherited?
-    simres[i, 'sCJD'] <- sum(cohdra == 'sCJD') # how many sCJD?
-    simres[i, 'control'] <- sum(cohdra == 'control') # how many sCJD?
-  }
-  return(simres)
-}
-
-nsim <- 1000
-cohdraws <- cohortDraws(nsim=nsim)
-
-# real count of somatic calls from inherited
-ninh <- nrow(subset(iedSom, prion_disease=='inherited'))
-
-# real count of somatic calls from sCJD
-nspo <- nrow(subset(iedSom, prion_disease=='sCJD'))
-
-# real count of somatic calls from control
-ncon <- nrow(subset(iedSom, prion_disease=='control'))
-
-# plot for inherited
-inhDraws <- ggplot(cohdraws, aes(inherited)) +
-  geom_histogram(binwidth=1) +
-  geom_vline(xintercept=ninh, linetype=2) +
-  theme_minimal() +
-  theme(
-    axis.text.y=element_blank()
-  ) +
-  coord_cartesian(xlim=c(0, 150), ylim=c(0, 100)) +
-  ylab('frequency') + xlab('somatic calls from inherited samples')
-inhDraws
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'inheritedDraws.pdf'), inhDraws, width=100, height=100, units='mm')
-}
-
-
-# plot for sporadic
-spoDraws <- ggplot(cohdraws, aes(sCJD)) +
-  geom_histogram(binwidth=1) +
-  geom_vline(xintercept=nspo, linetype=2) +
-  theme_minimal() +
-  theme(
-    axis.text.y=element_blank()
-  ) +
-  coord_cartesian(xlim=c(0, 150), ylim=c(0, 100)) +
-  ylab('frequency') + xlab('somatic calls from sporadic samples')
-spoDraws
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'sporadicDraws.pdf'), spoDraws, width=100, height=100, units='mm')
-}
-
-
-# plot for control
-conDraws <- ggplot(cohdraws, aes(control)) +
-  geom_histogram(binwidth=1) +
-  geom_vline(xintercept=ncon, linetype=2) +
-  theme_minimal() +
-  theme(
-    axis.text.y=element_blank()
-  ) +
-  coord_cartesian(xlim=c(0, 150), ylim=c(0, 100)) +
-  ylab('frequency') + xlab('somatic calls from control samples')
-conDraws
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'controlDraws.pdf'), conDraws, width=100, height=100, units='mm')
-}
-
-
-
-
-
-# Somatic mutations calls -- from which samples? --------------------------
-
-# will prepare a dataframe with lots of info for each sample
-  # first compute coverage (will use below)
-cabs <- ied %>% # number of CAlls By Sample
-  group_by(sample) %>%
-  tally(name='iedCov') # coverage = simply number of rows per sample in ied
-
-# add all the metadata info
-cabs <- left_join(cabs, meta, by='sample')
-
-# check one row per sample
-nrow(cabs) == length(unique(ied$sample))
-
-nsomps <- iedSom %>% # number of somatic mutation calls per sample
-  group_by(sample) %>%
-  tally(name='nSom')
-
-# join this to cabs
-cabs <- left_join(x=cabs, y=nsomps, by='sample')
-
-# if sample not in iedSom it means it had 0 calls
-  # it should be NA for column nSom in cabs, but check to be safe
-nocp <- unique(ied$sample) [ !unique(ied$sample) %in% unique(iedSom$sample) ] # no-call PDGs
-
-as.character(unlist(subset(cabs, is.na(nSom), sample))) %in% nocp
-nocp %in% as.character(unlist(subset(cabs, is.na(nSom), sample)))
-# ok
-
-# replace cabs column nSom NA in cabs by 0
-cabs[which(is.na(cabs$nSom)), 'nSom'] <- 0
-# check no more NA in cabs nSom
-sum(is.na(cabs$nSom))
-
-# compute % of reads somatic
-  # i.e. number of somatic calls / coverage
-cabs$somPro <- cabs$nSom / cabs$iedCov
-cabs$somPer <- cabs$somPro * 100
-
-# make sure levels of prion_disease are in order of cohcols above
-cabs$prion_disease <- factor(cabs$prion_disease, levels=c('inherited', 'sCJD', 'control'))
-
-# order from sample with largest number of somatic calls to lowest
-cabs <- cabs[order(-cabs$nSom),]
-cabs$sample <- factor(cabs$sample, levels=cabs$sample)
-
-# plot histogram sample vs number of somatic calls
-
-# temporarily replace 0 by a low number (0.1) so it plots the colour of the sample in plot
-cabs2 <- cabs
-cabs2$nSom <- as.numeric(cabs2$nSom)
-cabs2[which(cabs2$nSom==0) , 'nSom'] <- 0.1
-
-cabsple <- ggplot(cabs2, aes(x=sample, y=nSom, fill=prion_disease)) +
-  geom_col() +
-  scale_fill_manual(values=cohcols, labels=c('inherited', 'sporadic', 'control')) +
-  theme_minimal() +
-  theme(
-    axis.title.x=element_blank(),
-    axis.text.x=element_text(angle=90),
-    legend.title=element_blank()
-  ) +
-  ylab('somatic mutation calls')
-cabsple
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsbySample.pdf'), cabsple, width=200, height=150, units='mm')
-}
-
-
-
-# plot with percentages
-
-# order from sample with largest percentage of somatic calls to lowest
-cabs <- cabs[order(-cabs$somPer),]
-cabs$sample <- factor(cabs$sample, levels=cabs$sample)
-
-# as above, raise a little bit the samples with 0 somatic calls so can see them
-cabs2 <- cabs
-cabs2$nSom <- as.numeric(cabs2$nSom)
-cabs2[which(cabs2$somPer==0) , 'somPer'] <- 0.002
-
-spllabels <- paste0('#', substr(cabs2$sample, 4, 99))
-
-cabsple2 <- ggplot(cabs2, aes(x=sample, y=somPer, fill=prion_disease, alpha=gDNA_tissue)) +
-  geom_col(width=0.8) +
-  scale_alpha_discrete(range=c(1.0, 0.5)) +
-  scale_fill_manual(values=cohcols, labels=c('inherited', 'sporadic', 'control')) +
-  theme_minimal() +
-  theme(
-    axis.title.x=element_blank(),
-    legend.title=element_blank(),
-    legend.position='none',
-    axis.text.x=element_text(size=7, angle=90, margin = margin(t=0, r=0, b=0, l=0), hjust=0, vjust=0.5),
-    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
-    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-  ) +
-  ylab('somatic mutation calls (% of reads)') +
-  scale_x_discrete(labels=spllabels)
-cabsple2
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsbySamplePer.pdf'), cabsple2, width=80, height=70, units='mm')
-}
-
-
-# version with each sample's colour
-
-# samples are ranked by % somatic, so cannot change that
-# need to match the colour to each sample
-splcols <- meta$colour[match(cabs$sample, meta$sample)]
-
-cabsple3 <- ggplot(cabs2, aes(x=sample, y=somPer, fill=sample)) +
-  geom_col() +
-  scale_fill_manual(values=splcols) +
-  theme_minimal() +
-  theme(
-    axis.title.x=element_blank(),
-    axis.text.x=element_text(angle=90),
-    legend.title=element_blank(),
-    legend.position='none'
-  ) +
-  ylab('somatic mutation calls (% of reads)')
-cabsple3
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsbySamplePer_col.pdf'), cabsple3, width=200, height=150, units='mm')
-}
-
-
-# Number of calls vs coverage ---------------------------------------------
-# does number of calls for each sample correlate with coverage?
-  # would not make the calls wrong (it would be expected we pick up more stuff with higher coverage)
-  # but I think would mean we are not getting a full picture
-callsvscoverage <- ggplot(cabs, aes(x=iedCov, y=nSom, colour=prion_disease)) +
-  geom_point(size=1.5) +
-  scale_colour_manual(values=cohcols) +
-  theme_minimal() +
-  theme(
-    legend.title=element_blank()
-  ) +
-  theme(
-    legend.position='none',
-    #panel.grid.major.x=element_blank(),
-    #panel.grid.minor.x=element_blank(),
-    axis.title.x=element_text(size=9, margin = margin(t=0, r=0, b=0, l=0)),
-    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-    axis.title.y=element_text(size=9, margin = margin(t=0, r=0, b=0, l=0)),
-    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-  ) +
-  xlab('coverage') + ylab('somatic mutations calls') +
-  scale_x_continuous(breaks=c(0, 10000, 20000, 30000), labels=c('0', '10,000', '20,000', '30,000')) +
-  coord_cartesian(xlim=c(0, 30000))
-callsvscoverage
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvscoverage.pdf'), callsvscoverage, width=60, height=60, units='mm')
-}
-
-
-cor(x=cabs$iedCov, y=cabs$nSom, method='pearson')
-cor.test(x=cabs$iedCov, y=cabs$nSom, method='pearson')
-
-cor(x=cabs$iedCov, y=cabs$nSom, method='spearman')
-cor.test(x=cabs$iedCov, y=cabs$nSom, method='spearman')
-
-
-# useful to have a version with labels
-callsvscoverage2 <- ggplot(cabs, aes(x=iedCov, y=nSom, colour=prion_disease, label=sample)) +
-  geom_point(size=2) +
-  geom_text(size=1.8, hjust=1.0, vjust=2.0) +
-  scale_colour_manual(values=cohcols) +
-  theme_minimal() +
-  theme(
-    legend.title=element_blank()
-  ) +
-  xlab('coverage') + ylab('somatic mutations calls')
-callsvscoverage2
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvscoverage_labels.pdf'), callsvscoverage2, width=120, height=100, units='mm')
-}
-
-
-
-
-# Number of calls vs flowcell ---------------------------------------------
-  # anything weird about flowcell?
-callsvsflowcell <- ggplot(cabs, aes(x=flowcell, y=nSom, colour=prion_disease)) +
-  geom_quasirandom(width=0.1) +
-  scale_colour_manual(values=cohcols) +
-  scale_x_continuous(breaks=1:3) +
-  theme_minimal() +
-  theme(
-    legend.title=element_blank()
-  ) +
-  ylab('somatic mutation calls')
-callsvsflowcell
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsflowcell.pdf'), callsvsflowcell, width=80, height=100, units='mm')
-}
-
-
-# with percentages
-callsvsflowcellPer <- ggplot(cabs, aes(x=flowcell, y=somPer, colour=prion_disease)) +
-  geom_quasirandom(width=0.1) +
-  scale_colour_manual(values=cohcols) +
-  scale_x_continuous(breaks=1:3) +
-  theme_minimal() +
-  theme(
-    legend.title=element_blank()
-  ) +
-  ylab('somatic mutation calls')
-callsvsflowcellPer
-ggsave(filename=here('needleSam', 'callsvsflowcellPercentage.pdf'), callsvsflowcellPer, width=80, height=100, units='mm')
-
-
-
-# Number of calls vs tissue -----------------------------------------------
-# first version with all the samples
-
-callsvstissue <- ggplot(cabs, aes(x=gDNA_tissue, y=nSom, label=sample, colour=prion_disease)) +
-  geom_quasirandom(width=0.2) +
-  scale_colour_manual(values=cohcols, labels=c('inherited', 'sporadic', 'control')) +
-  theme_minimal() +
-  theme(
-    legend.title=element_blank(),
-    panel.grid.minor.x=element_blank(),
-  ) +
-  coord_cartesian(ylim=c(0,18)) +
-  xlab('') + ylab('somatic calls')
-callsvstissue
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvstissue.pdf'), callsvstissue, width=90, height=100, units='mm')
-}
-
-
-# with percentages
-callsvstissuePer <- ggplot(cabs, aes(x=gDNA_tissue, y=somPer, label=sample, colour=prion_disease)) +
-  geom_quasirandom(width=0.2) +
-  scale_colour_manual(values=cohcols, labels=c('inherited', 'sporadic', 'control')) +
-  theme_minimal() +
-  theme(
-    legend.title=element_blank(),
-    panel.grid.minor.x=element_blank(),
-  ) +
-  xlab('') + ylab('somatic calls')
-callsvstissuePer
-ggsave(filename=here('needleSam', 'callsvstissuePercentage.pdf'), callsvstissuePer, width=90, height=100, units='mm')
-
-# Chi2 test: is there a surplus/deficit of calls in brain or blood
-cont <- table(ied$isSomatic, ied$gDNA_tissue)
-chi <- chisq.test(cont, correct=FALSE)
-chi
-chi$observed
-chi$expected # none below 5
-# there are *more* calls from brain samples than we would have expected
-# or less calls from blood samples than we would have expected
-
-# this does not mean much
-# ! we will see below OPR length predicts well number of somatic calls
-# and brain samples are all long OPRs, so expect more somatic calls
-
-# to demonstrate this, removing samples which have OPRD or reference OPR before Chi2 would be predicted to cancel out the effect
-# = keep only OPRI samples
-iedI <- subset(ied, substr(SV, 1, 4) == 'OPRI')
-cont <- table(iedI$isSomatic, iedI$gDNA_tissue)
-chi <- chisq.test(cont, correct=FALSE)
-chi
-chi$observed
-chi$expected # none below 5
-# correct, p-val is not significant anymore
-
-# plot matched blood v brain samples as slope plot
-# prepare data so one row for each individual
-cabsma <- subset(cabs, cohort=='inherited_tissuecomparison') # cabs matched samples
-
-# order them by individual then by tissue
-cabsma <- cabsma[order(cabsma$individual, cabsma$gDNA_tissue),]
-
-# and use that as factor level
-cabsma$sample <- factor(cabsma$sample, levels=cabsma$sample)
-# also fix factor level of gDNA tissue
-cabsma$gDNA_tissue <- factor(cabsma$gDNA_tissue, levels=unique(cabsma$gDNA_tissue))
-
-# just for the purpose of the plot, need to slightly move apart two dots to avoid overplotting but keep the connecting lines correct
-tissueMatch <- ggplot(cabsma, aes(x=gDNA_tissue, y=nSom, fill=sample)) +
-  geom_line(aes(group=individual), size=1.0, colour='#a7a7a7') +
-  scale_fill_manual(values=cabsma2$colour) +
-  geom_quasirandom(pch=21, size=5.0, colour='white', stroke=1.5, width=0.8, groupOnX=FALSE) +
-  theme_minimal() +
-  theme(
-    axis.title.x=element_blank(),
-    legend.position='none'
-  ) +
-  coord_cartesian(ylim=c(0, 20)) +
-  ylab('somatic mutations calls')
-tissueMatch
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'tissueMatch.pdf'), tissueMatch, width=90, height=100, units='mm')
-}
-
-# with percentages
-tissueMatchPer <- ggplot(cabsma, aes(x=gDNA_tissue, y=somPer, fill=sample)) +
-  geom_line(aes(group=individual), size=1.0, colour='#a7a7a7') +
-  scale_fill_manual(values=cabsma$colour) +
-  coord_cartesian(ylim=c(0,0.3)) +
-  geom_quasirandom(pch=21, size=5.0, colour='white', stroke=1.5, width=0.8, groupOnX=FALSE) +
-  theme_minimal() +
-  theme(
-    axis.title.x=element_blank(),
-    legend.position='none',
-    axis.text.x=element_text(size=9, margin = margin(t=0, r=0, b=0, l=0)),
-    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
-    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-  ) +
-  ylab('somatic mutations calls (% of reads)')
-tissueMatchPer
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'tissueMatchPercentage.pdf'), tissueMatchPer, width=60, height=68, units='mm')
-}
-
-# one-sample t-test
-
-# make a simpler dataframe for this
-tisco <- cabsma %>% # tissue comparison
-  select(individual, gDNA_tissue, somPer) %>%
-  pivot_wider(names_from=gDNA_tissue,
-             values_from=somPer) %>%
-  mutate(tissuediff=blood-brain) # compute delta blood - brain
-
-# one-sample t-test: I am testing if the % decrease is significant
-  # so null hypothesis = no decrease = 0% (hence one-sample)
-  # I am specifically testing decrease, so one-sided
-  # why specifically decrease: strand slippage model relies on replication,
-    # hence should occur more often in tissues that divides a lot vs post-mitotic tissues
-t.test(x=tisco$tissuediff, mu=0, alternative='greater')
-
-# mean +- sd of difference
-mean(tisco$tissuediff)
-sd(tisco$tissuediff)
-
-
-
-
-# Number of calls vs MRC intersect/slope ----------------------------------
-
-# vs MRC_t0
-# i.e. score when patient first started getting followed
-callsvsMRCt0 <- ggplot(cabs, aes(x=MRC_t0, y=nSom, colour=prion_disease, alpha=gDNA_tissue)) +
-  geom_point() +
-  scale_colour_manual(values=cohcols) +
-  scale_alpha_discrete(range=c(1.0, 0.5)) +
-  theme_minimal() +
-  theme(
-    legend.position='none'
-  ) +
-  coord_cartesian(xlim=c(20, 0)) +
-  ylab('somatic mutation calls') + xlab('MRC scale at first assessment')
-callsvsMRCt0
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsMRCt0.pdf'), callsvsMRCt0, width=120, height=100, units='mm')
-}
-
-
-# with percentages
-callsvsMRCt0Per <- ggplot(cabs, aes(x=MRC_t0, y=somPer, colour=prion_disease, alpha=gDNA_tissue)) +
-  geom_point() +
-  scale_colour_manual(values=cohcols) +
-  scale_alpha_discrete(range=c(1.0, 0.5)) +
-  theme_minimal() +
-  theme(
-    legend.position='none'
-  ) +
-  coord_cartesian(xlim=c(20, 0)) +
-  ylab('somatic mutation calls (% of reads)') + xlab('MRC scale at first assessment')
-callsvsMRCt0Per
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsMRCt0Percentage.pdf'), callsvsMRCt0, width=120, height=100, units='mm')
-}
-
-
-# vs MRC_intersect
-  # just having a look, but I do not think there is a clear interpretation for MRC intersect
-  # also it is very close to MRC_t0
-callsvsMRCint <- ggplot(cabs, aes(x=MRC_intersect, y=nSom, colour=prion_disease, alpha=gDNA_tissue)) +
-  geom_point() +
-  scale_colour_manual(values=cohcols) +
-  scale_alpha_discrete(range=c(1.0, 0.5)) +
-  theme_minimal() +
-  theme(
-    legend.position='none'
-  ) +
-  coord_cartesian(xlim=c(20, 0)) +
-  ylab('somatic mutation calls') + xlab('MRC scale at first assessment')
-callsvsMRCint
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsMRCint.pdf'), callsvsMRCint, width=120, height=100, units='mm')
-}
-
-  
-# with percentages
-callsvsMRCintPer <- ggplot(cabs, aes(x=MRC_intersect, y=somPer, colour=prion_disease, alpha=gDNA_tissue)) +
-  geom_point() +
-  scale_colour_manual(values=cohcols) +
-  scale_alpha_discrete(range=c(1.0, 0.5)) +
-  theme_minimal() +
-  theme(
-    legend.position='none'
-  ) +
-  coord_cartesian(xlim=c(20, 0)) +
-  ylab('somatic mutation calls (% of reads)') + xlab('MRC scale at first assessment')
-callsvsMRCintPer
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsMRCintPercentage.pdf'), callsvsMRCint, width=120, height=100, units='mm')
-}
-
-# vs MRC_slope
-  # goal is to quantify speed of decline
-callsvsMRCslope <- ggplot(cabs, aes(x=MRC_slope, y=nSom, colour=prion_disease, alpha=gDNA_tissue)) +
-  geom_point() +
-  scale_colour_manual(values=cohcols) +
-  scale_alpha_discrete(range=c(1.0, 0.5)) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank(), 
-    axis.title.x=element_text(size=9, margin = margin(t=2, r=0, b=0, l=0)),
-    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
-    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0))
-  ) +
-  coord_cartesian(xlim=c(0, -0.7)) +
-  ylab('somatic mutation calls') + xlab('MRC scale slope')
-callsvsMRCslope
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsMRCslope.pdf'), callsvsMRCslope, width=75, height=75, units='mm')
-}
-
-
-# with percentages
-callsvsMRCslopePer <- ggplot(cabs, aes(x=MRC_slope, y=somPer, colour=prion_disease, alpha=gDNA_tissue)) +
-  geom_point() +
-  scale_colour_manual(values=cohcols) +
-  scale_alpha_discrete(range=c(1.0, 0.5)) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank(), 
-    axis.title.x=element_text(size=9, margin = margin(t=2, r=0, b=0, l=0)),
-    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
-    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0))
-  ) +
-  coord_cartesian(xlim=c(0, -0.7)) +
-  ylab('somatic mutation calls (% of reads)') + xlab('MRC scale slope')
-callsvsMRCslopePer
-cor(cabs$MRC_slope, cabs$somPer, use='pairwise.complete.obs')
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsMRCslopePercentage.pdf'), callsvsMRCslopePer, width=75, height=75, units='mm')
-}
-
-
-
-# Number of calls vs age --------------------------------------------------
-
-# number of somatic mutation calls vs age of patient when sample taken
-  # blood samples: when blood drawn
-  # brain samples: age at death
-callsvsage <- ggplot(cabs, aes(x=age_atsample, y=nSom, colour=prion_disease, alpha=gDNA_tissue)) +
-  geom_point() +
-  scale_colour_manual(values=cohcols) +
-  scale_alpha_discrete(range=c(1.0, 0.5)) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank()
-  ) +
-  xlab('age of patient') + ylab('somatic mutation calls')
-callsvsage
-cor(cabs$nSom, cabs$age_atsample, method='pearson', use='pairwise.complete.obs')
-cor(cabs$nSom, cabs$age_atsample, method='spearman', use='pairwise.complete.obs')
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsage.pdf'), callsvsage, width=120, height=100, units='mm')
-}
-
-# with percentages
-callsvsagePer <- ggplot(cabs, aes(x=age_atsample, y=somPer, colour=prion_disease, alpha=gDNA_tissue)) +
-  geom_point() +
-  scale_colour_manual(values=cohcols) +
-  scale_alpha_discrete(range=c(1.0, 0.5)) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank(),
-    axis.title.x=element_text(size=9, margin = margin(t=2, r=0, b=0, l=0)),
-    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
-    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0))) +
-  xlab('age of individual') + ylab('somatic mutation calls (% of reads)')
-callsvsagePer
-
-cor(cabs$somPer, cabs$age_atsample, method='pearson', use='pairwise.complete.obs')
-cor.test(cabs$somPer, cabs$age_atsample, method='pearson', use='pairwise.complete.obs')
-cor(cabs$somPer, cabs$age_atsample, method='spearman', use='pairwise.complete.obs')
-cor.test(cabs$somPer, cabs$age_atsample, method='spearman', use='pairwise.complete.obs')
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsagePercentage.pdf'), callsvsagePer, width=75, height=75, units='mm')
-}
-
-
-
-# Number of calls vs gender -----------------------------------------------
-
-callsvsgender <- ggplot(cabs, aes(x=gender, y=nSom, colour=prion_disease)) +
-  geom_quasirandom(width=0.2) +
-  scale_colour_manual(values=cohcols) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank(),
-    axis.title.x=element_blank()
-  ) +
-  scale_x_discrete(labels=c('female', 'male')) +
-  ylab('somatic mutation calls')
-callsvsgender
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsgender.pdf'), callsvsgender, width=90, height=100, units='mm')
-}
-
-# with percentages
-callsvsgenderPer <- ggplot(cabs, aes(x=gender, y=somPer, colour=prion_disease)) +
-  geom_quasirandom(width=0.2) +
-  scale_colour_manual(values=cohcols) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank(),
-    axis.title.x=element_blank()
-  ) +
-  scale_x_discrete(labels=c('female', 'male')) +
-  ylab('somatic mutation calls (% of reads)')
-callsvsgenderPer
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsgenderPercentage.pdf'), callsvsgenderPer, width=90, height=100, units='mm')
-}
-
-
-
-# Number of calls vs Sanger OPR -------------------------------------------
-
-# by strand slippage model, would predict
-  # more repeats, more opportunities for slippage, so more calls
-  # less repeats, less opportunities for slippage, so fewer calls
-
-# add a 'OPR difference' column
-  # eg. OPRI1 = +1 / OPRD1 = -1
-cabs$sanOPRdiff <- sapply(cabs$SV, function(o){ # Sanger OPR difference (eg. 1 OPRD = -1)
-  
-  if (is.na(o)) {
-    return(0) # is NA, that means no SV, so OPRdiff = 0
-  } else {
-    diff <- as.numeric(substr(o, 5, 999)) # get the number; eg. OPRI8 >> 8
-    
-    if (substr(o, 4, 4)=='D') return(-diff) # if OPRD >> put a negative sign
-    if (substr(o, 4, 4)=='I') return(diff) # if OPRI >> leave positive
-  }
-})
-# Note; for PDG 46345 writing +5 for the 5 OPRI
-# Note; could also do total OPR length (i.e. Sanger OPR difference + 5)
-  # in a way it fits better the strand slippage model as what is determinant would be the total number of repeats
-  # but I find the difference more intuitive as can still read the sample's genotype
-
-# number of somatic calls vs Sanger OPR difference
-callsvsSanOPRdiff <- ggplot(cabs, aes(x=sanOPRdiff, y=nSom, colour=prion_disease)) +
-  geom_quasirandom(width=0.3) +
-  scale_colour_manual(values=cohcols) +
-  scale_x_continuous(breaks=-2:8) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank()
-  ) +
-  xlab('OPR change vs reference') + ylab('somatic mutation calls')
-callsvsSanOPRdiff
-cor(x=cabs$sanOPRdiff, y=cabs$nSom, method='pearson')
-cor.test(x=cabs$sanOPRdiff, y=cabs$nSom, method='pearson')
-cor(x=cabs$sanOPRdiff, y=cabs$nSom, method='spearman')
-cor.test(x=cabs$sanOPRdiff, y=cabs$nSom, method='spearman')
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsSangerOPRdiff.pdf'), callsvsSanOPRdiff, width=120, height=100, units='mm')
-}
-
-# add linear regression
-# ! lm (y ~ x)
-# x = predictor // y = to be predicted
-# here: x = OPR length // y = number of somatic calls
-somfit <- lm(nSom ~ sanOPRdiff, data=cabs)
-# formula is y = intercept + slope * x
-# here: number of somatic calls = 3.1803 + 0.8938 * OPR difference
-
-intercept <- somfit[[1]][1]
-slope <- somfit[[1]][2]
-
-# X start
-# is shortest OPR
-xstart <- min(cabs$sanOPRdiff)
-
-# X stop
-# is longest OPR
-xstop <- max(cabs$sanOPRdiff)
-
-# Y start
-# >> y = intercept + slope * xstart
-ystart = as.numeric(intercept + slope * xstart)
-
-# Y stop
-# >> y = intercept + slope * xstop
-ystop = as.numeric(intercept + slope * xstop)
-
-# Notes;
-summary(somfit)
-
-# version plot above with fitted line
-callsvsSanOPRdiffFit <- ggplot(cabs, aes(x=sanOPRdiff, y=nSom, colour=prion_disease)) +
-  geom_segment(aes(x=xstart, xend=xstop,
-                   y=ystart, yend=ystop),
-               linetype=2, colour='#ebebeb', size=1.0) +
-  geom_quasirandom(width=0.3) +
-  scale_colour_manual(values=cohcols) +
-  scale_x_continuous(breaks=-2:8) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank()
-  ) +
-  xlab('OPR change vs reference') + ylab('somatic mutation calls')
-callsvsSanOPRdiffFit
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsSangerOPRdiffFit.pdf'), callsvsSanOPRdiffFit, width=120, height=100, units='mm')
-}
-
-###
-
-# same but with percentages
-callsvsSanOPRdiffPer <- ggplot(cabs, aes(x=sanOPRdiff, y=somPer, colour=prion_disease)) +
-  geom_quasirandom(width=0.3) +
-  scale_colour_manual(values=cohcols) +
-  scale_x_continuous(breaks=-2:8) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank()
-  ) +
-  xlab('OPR length vs reference') + ylab('somatic mutation calls (% of reads)')
-callsvsSanOPRdiffPer
-cor(x=cabs$sanOPRdiff, y=cabs$somPer, method='pearson')
-cor.test(x=cabs$sanOPRdiff, y=cabs$somPer, method='pearson')
-cor(x=cabs$sanOPRdiff, y=cabs$somPer, method='spearman')
-cor.test(x=cabs$sanOPRdiff, y=cabs$somPer, method='spearman')
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsSangerOPRdiffPercentage.pdf'), callsvsSanOPRdiffPer, width=120, height=100, units='mm')
-}
-  
-# add linear regression
-# ! lm (y ~ x)
-# x = predictor // y = to be predicted
-# here: x = OPR length // y = number of somatic calls
-somfit <- lm(somPer ~ sanOPRdiff, data=cabs)
-# formula is y = intercept + slope * x
-# here: number of somatic calls = 3.1803 + 0.8938 * OPR difference
-
-intercept <- somfit[[1]][1]
-slope <- somfit[[1]][2]
-
-# X start
-# is shortest OPR
-xstart <- min(cabs$sanOPRdiff)
-
-# X stop
-# is longest OPR
-xstop <- max(cabs$sanOPRdiff)
-
-# Y start
-# >> y = intercept + slope * xstart
-ystart = as.numeric(intercept + slope * xstart)
-
-# Y stop
-# >> y = intercept + slope * xstop
-ystop = as.numeric(intercept + slope * xstop)
-
-# Notes;
-summary(somfit)
-
-# version plot above with fitted line
-callsvsSanOPRdiffPerFit <- ggplot(cabs, aes(x=sanOPRdiff, y=somPer, colour=prion_disease, alpha=gDNA_tissue)) +
-  geom_segment(aes(x=xstart, xend=xstop,
-                   y=ystart, yend=ystop),
-               linetype=2, colour='#ebebeb', size=1.0) +
-  geom_quasirandom(width=0.3, size=1.2) +
-  scale_colour_manual(values=cohcols) +
-  scale_alpha_discrete(range=c(1.0, 0.5)) +
-  scale_x_continuous(breaks=-2:8) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank(),
-    axis.title.x=element_text(size=9, margin = margin(t=2, r=0, b=0, l=0)),
-    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
-    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-  ) +
-  xlab('OPR length vs reference') + ylab('somatic mutation calls (% of reads)')
-callsvsSanOPRdiffPerFit
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsSangerOPRdiffPercentageFit.pdf'), callsvsSanOPRdiffPerFit, width=80, height=70, units='mm')
-}
-
-# version with age labels
-  # exploring more a possible relationship below
-callsvsSanOPRdiffPerFitAgelabels <- ggplot(cabs, aes(x=sanOPRdiff, y=somPer, colour=prion_disease, label=age_atsample)) +
-  geom_segment(aes(x=xstart, xend=xstop,
-                   y=ystart, yend=ystop),
-               linetype=2, colour='#ebebeb', size=1.0) +
-  geom_quasirandom(width=0.3) +
-  geom_text() +
-  scale_colour_manual(values=cohcols) +
-  scale_x_continuous(breaks=-2:8) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank()
-  ) +
-  xlab('OPR change vs reference') + ylab('somatic mutation calls (% of reads)')
-callsvsSanOPRdiffPerFitAgelabels
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsSanOPRdiffAgelabelsPercentageFit.pdf'), callsvsSanOPRdiffPerFitAgelabels, width=120, height=100, units='mm')
-}
-
-
-
-# Number of calls vs Sanger OPR -- haplotype resolution -------------------
-
-# same idea as above, but treating haplotypes as separate samples
-# re-create cabs but including haplotype info
-cabsh <- ied %>% # number of CAlls By Sample by Haplotype
-  group_by(sample, haplotype) %>%
-  tally(name='iedCov') %>% # coverage = simply number of rows per sample in ied
-  mutate(sam_hap=paste(sample, haplotype, sep='_'), .before='sample') # add sample_haplotype ID column
-
-# add all the metadata info
-  # ! naturally, will be repeated for each haplotype
-cabsh <- left_join(cabsh, meta, by='sample')
-
-nsomps <- iedSom %>% # number of somatic mutation calls per sample per haplotype
-  group_by(sample, haplotype) %>%
-  tally(name='nSom') %>%
-  mutate(sam_hap=paste(sample, haplotype, sep='_'), .before='sample') %>% # add sample_haplotype ID column
-  ungroup() %>%
-  select(-sample, -haplotype)
-
-# join this to cabsh by matching sam_hap
-cabsh <- left_join(x=cabsh, y=nsomps, by='sam_hap')
-
-# nSom = NA actually means 0 somatic calls
-cabsh[which(is.na(cabsh$nSom)) , 'nSom'] <- 0
-
-# compute % of reads somatic
-  # there are 2 options
-  # 1- % of reads from that haplotype
-  # 2- % of reads from that sample
-# I think best to do 1- so haplotype proportions do not contribute
-# i.e. number of somatic calls / coverage
-cabsh$somPro <- cabsh$nSom / cabsh$iedCov
-cabsh$somPer <- cabsh$somPro * 100
-
-# make sure levels of prion_disease are in order of cohcols above
-cabsh$prion_disease <- factor(cabsh$prion_disease, levels=c('inherited', 'sCJD', 'control'))
-
-# order from sam_hap with largest number of somatic calls to lowest
-cabsh <- cabsh[order(-cabsh$nSom),]
-cabsh$sam_hap <- factor(cabsh$sam_hap, levels=cabsh$sam_hap)
-
-# add sanger OPR difference column
-cabsh$sanOPRdiff <- sapply(cabsh$SV, function(o){ # Sanger OPR difference (eg. 1 OPRD = -1)
-  
-  if (is.na(o)) {
-    return(0) # is NA, that means no SV, so OPRdiff = 0
-  } else {
-    diff <- as.numeric(substr(o, 5, 999)) # get the number; eg. OPRI8 >> 8
-    
-    if (substr(o, 4, 4)=='D') return(-diff) # if OPRD >> put a negative sign
-    if (substr(o, 4, 4)=='I') return(diff) # if OPRI >> leave positive
-  }
-})
-# ! remember compound homozygous sample will show up as +5 (not -1)
-
-# now do plot Sanger OPR vs number of calls again, but essentially treating separate haplotypes as separate samples
-# number of somatic calls per haplotype vs Sanger OPR
-cabsh$haplotype <- factor(cabsh$haplotype, levels=c(1, 2, 0))
-
-callsvsSanOPRdiffHap <- ggplot(cabsh, aes(x=sanOPRdiff, y=nSom, colour=haplotype)) +
-  geom_quasirandom(width=0.2) +
-  scale_colour_manual(values=hapcols) +
-  scale_x_continuous(breaks=-2:8) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank()
-  ) +
-  xlab('OPR change vs difference') + ylab('somatic mutation calls')
-callsvsSanOPRdiffHap
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsSanOPRdiffHap.pdf'), callsvsSanOPRdiffHap, width=120, height=100, units='mm')
-}
-
-# with percentages
-callsvsSanOPRdiffHapPer <- ggplot(cabsh, aes(x=sanOPRdiff, y=somPer, colour=haplotype)) +
-  geom_quasirandom(width=0.2) +
-  scale_colour_manual(values=hapcols) +
-  scale_x_continuous(breaks=-2:8) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank()
-  ) +
-  xlab('OPR change vs difference') + ylab('somatic mutation calls')
-callsvsSanOPRdiffHapPer
-ggsave(filename=here('needleSam', 'callsvsSanOPRdiffHapPercentage.pdf'), callsvsSanOPRdiffHapPer, width=120, height=100, units='mm')
-
-# Any striking if plotting sample per sample?
-  # could be for example an OPRI sample where the reference haplotype is affected
-# set OPRdiff column as factor for this, ggplot understands better what I am doing
-cabsh$sanOPRdiff <- factor(cabsh$sanOPRdiff)
-
-# list of plots, one per sample
-hapod <- vector(mode='list', length=length(pdgs)) # one plot per sample, each will be histogram number of calls per haplotype
-
-for (p in 1:length(pdgs)) {
-  
-  # get the Sanger genotype for that sample
-  osan <- as.character(unlist(subset(meta, sample==pdgs[p], SV))) # OPR Sanger
-  
-  # if osan is NA >> it means reference
-  if (is.na(osan)) {
-    osan <- 'reference'
-  }
-  
-  haplot <- ggplot( subset(cabsh, sample==pdgs[p]), aes(x=haplotype, y=nSom, fill=haplotype) ) +
-    geom_col(position='dodge') +
-    scale_fill_manual(values=hapcols) +
-    theme_minimal() +
-    ggtitle(osan) +
-    coord_cartesian(ylim=c(0, 12)) +
-    scale_y_continuous(breaks=seq(0, 12, 2)) +
-    ylab('somatic mutation calls')
-  
-  # place the plot in the list
-  hapod[[p]] <- haplot
-  
-}
-
-hapodGrid <- ggarrange(plotlist=hapod, ncol=1, nrow=length(pdgs))
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsVsHap_bysample.pdf'), plot=hapodGrid, height=1000, width=100, units='mm')
-}
-
-
-
-# Does age affect number of somatic calls irrespective of OPR? ------------
-
-# first; is there a correlation OPR vs age in our dataset
-  # we are assuming here age at sample is a proxy for age at onset
-  # from samples for which we have both age at onset + age at sample + age at death, can see that it correlates well
-  # however there are 4 individuals for whom this is evidently incorrect
-  # 2 controls + one 1 OPRI + one 6 OPRI
-  # (I imagine the OPRI ones are patients screened then followed over time but were not sick at time of recording)
-  # can tell these samples as they have an MRC slope of 0 (as not sick)
-  # it is possible that there is a couple of other samples like this
-    # especially when the individual is dead but we do not have information about disease progression
-    # it may be that they were being followed because they had an OPR mutation but died from something else entirely
-  # however, I can still see for some of those that sample was taken on the year of death or the year before
-  # which strongly suggests this was because they were sick
-
-# exclude individuals which are not sick (so age at sample does not represent onset)
-onidv <- as.numeric(unlist(cabs[which(cabs$MRC_slope==0), 'individual'])) # onidv for onset individuals
-cabson <- subset(cabs, ! individual %in% onidv)
-
-# fit, lm(y ~ x)
-agefit <- lm(cabson$age_atsample ~ cabson$sanOPRdiff)
-agefit
-# age onset = 70 - 1.46 * sanOPRdiff
-# i.e. if reference OPR: age at onset is 70
-# less repeats increase age at onset (i.e. delay onset)
-# more repeats decrease age at onset (i.e. advance onset)
-# for each extra repeat, onset is 1.5 years earlier
-# (need to highlight this is a poor fit though; but replicates literature, see below)
-summary(agefit)
-
-intercept <- agefit[[1]][1]
-slope <- agefit[[1]][2]
-
-# X start
-# is shortest OPR
-xstart <- min(cabson$sanOPRdiff)
-
-# X stop
-# is longest OPR
-xstop <- max(cabson$sanOPRdiff)
-
-# Y start
-# >> y = intercept + slope * xstart
-ystart = as.numeric(intercept + slope * xstart)
-
-# Y stop
-# >> y = intercept + slope * xstop
-ystop = as.numeric(intercept + slope * xstop)
-
-sanOPRdiffvsage <- ggplot(cabson, aes(x=sanOPRdiff, y=age_atsample, colour=prion_disease)) +
-  geom_segment(aes(x=xstart, xend=xstop,
-                   y=ystart, yend=ystop),
-               linetype=2, colour='#ebebeb', size=1.0) +
-  geom_quasirandom(width=0.3) +
-  scale_colour_manual(values=cohcols) +
-  scale_x_continuous(breaks=-2:8) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    panel.grid.minor.x=element_blank()
-  ) +
-  xlab('OPR change vs reference') + ylab('age of patient')
-sanOPRdiffvsage
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'sanOPRdiffvsage.pdf'), sanOPRdiffvsage, width=120, height=100, units='mm')
-}
-  
-cor(x=cabson$sanOPRdiff, y=cabson$age_atsample, method='pearson')
-cor.test(x=cabson$sanOPRdiff, y=cabson$age_atsample, method='pearson')
-
-cor(x=cabson$sanOPRdiff, y=cabson$age_atsample, method='spearman')
-cor.test(x=cabson$sanOPRdiff, y=cabson$age_atsample, method='spearman')
-# there is a NON-significant negative correlation
-  # negative is what we would expect; more repeats give lower age at onset
-
-# this looks fairly similar as Figure 1: https://jnnp.bmj.com/content/75/8/1166
-# which is reassuring
-
-# get the residuals from fit above
-  # = portion of onset age not explained by OPR
-  # e.g. for individual 3, onset is higher (delayed) vs what we would expect from their OPR (positive residual)
-cabson$sanOPRdiff_vs_age_residuals <- resid(agefit)
-
-# add the column to cabs by matching samples
-cabs <- cabson %>%
-  select(sample, sanOPRdiff_vs_age_residuals) %>%
-  left_join(x=cabs, y=., by='sample')
-
-# similarly, we have fit OPR vs % somatic calls from before
-# get the residuals
-  # = portion of % somatic that is not explained by OPR
-  # e.g. for this 8 OPRI individual, we would have expected many calls but it does not have a lot (negative residual)
-cabs$sanOPRdiff_vs_somPer_residuals <- resid(somfit) # we can directly add those to cabs
-
-
-# Approach 1
-# Is there a correlation somatic % residuals vs age?
-  # I think this is assuming there is no relationship age vs OPR which is probably untrue
-# positive correlation:
-  # less mutations than expected from OPR match younger age &&
-  # more mutations ... match older age
-    # >>> so more mutations as age increases, irrespective of OPR
-  # and vice-versa; negative correlation >>> less mutations as age increases, irrespective of OPR
-ageVsMutationResidual <- ggplot(cabs, aes(x=age_atsample, y=sanOPRdiff_vs_somPer_residuals, colour=prion_disease)) +
-  geom_point() +
-  scale_colour_manual(values=cohcols) +
-  theme_minimal() +
-  theme(
-    legend.position='none'
-  ) +
-  xlab('age of patient') + ylab('% somatic calls unexplained by OPR')
-ageVsMutationResidual
-
-cor(x=cabs$age_atsample, y=cabs$sanOPRdiff_vs_somPer_residuals, method='pearson')
-cor.test(x=cabs$age_atsample, y=cabs$sanOPRdiff_vs_somPer_residuals, method='pearson')
-
-cor(x=cabs$age_atsample, y=cabs$sanOPRdiff_vs_somPer_residuals, method='spearman')
-cor.test(x=cabs$age_atsample, y=cabs$sanOPRdiff_vs_somPer_residuals, method='spearman')
-# NON-significant negative correlation (or none with Spearman)
-# would (vaguely) suggest less mutations as age increases
-
-# Approach 2
-# residuals vs residuals
-# i.e. x = portion of age unexplained by OPR vs y = portion of % somatic unexplained by OPR
-  # positive correlation:
-    # younger onset than expected by OPR match less mutations than expected by OPR && 
-    # vice-versa; older onset ... more mutations
-    # >>> interpretation could be mutations are protective; i.e. they delay onset
-  # negative correlation:
-    # older onset ... more mutations &&
-    # younger onset ... less mutations
-    # # >>> interpretation could be mutations are detrimental; i.e. they accelerate onset
-
-ageResidualsVsMutationResiduals <- ggplot(cabs, aes(x=sanOPRdiff_vs_age_residuals, y=sanOPRdiff_vs_somPer_residuals, colour=prion_disease)) +
-  geom_point() +
-  scale_colour_manual(values=cohcols) +
-  theme_minimal() +
-  theme(
-    legend.position='none'
-  ) +
-  xlab('difference in age unexplained by OPR') + ylab('% mutation unexplained by OPR')
-ageResidualsVsMutationResiduals
-
-cor(x=cabs$sanOPRdiff_vs_age_residuals, y=cabs$sanOPRdiff_vs_somPer_residuals, use='complete.obs', method='spearman')
-cor.test(x=cabs$sanOPRdiff_vs_age_residuals, y=cabs$sanOPRdiff_vs_somPer_residuals, use='complete.obs', method='spearman')
-
-cor(x=cabs$sanOPRdiff_vs_age_residuals, y=cabs$sanOPRdiff_vs_somPer_residuals, use='complete.obs', method='pearson')
-cor.test(x=cabs$sanOPRdiff_vs_age_residuals, y=cabs$sanOPRdiff_vs_somPer_residuals, use='complete.obs', method='pearson')
-# not convincing at all -- but if anything; negative correlation
-# which would suggest somatic mutations accelerate onset irrespective of OPR
-
-# from labels + 2 approaches here, I think nothing particularly convincing
-
-
-
-# Is OPR mutated haplotype preferentially affected? -----------------------
-
-# from needleSam.R: OPR mutated haplotype (if any) is set to 1
-# Note; PDG 46345, 5 OPRI haplotype is considered as mutated
-# ! here, should exclude samples where haplotype 1/2 distinction is not meaningful
-  # i.e. the samples we could not haplotag (0 SNV) + the samples with reference OPR
-pdgop <- as.character(unlist(subset(meta, !is.na(SV), sample))) # samples with a mutated OPR
-pdgoptag <- intersect(pdgop, pdgs_tag) # samples with a mutated OPR + haplotagged
-
-cabh <- iedSom %>% # calls by haplotype
-  subset(sample %in% pdgoptag) %>% # only somatic calls from samples with mutated OPR + haplotagged
-  group_by(haplotype) %>%
-  tally(name='nSom')
-
-cabh$haplotype <- factor(cabh$haplotype, levels=c(1, 2, 0))
-
-# is there really a surplus of haplotype1 calls?
-
-  # subset of ied, keeping only reads from samples pdgoptag
-iedoptag <- ied %>%
-  subset(sample %in% pdgoptag)
-
-  # build contingency table
-cont <- table(iedoptag$isSomatic, iedoptag$haplotype)
-
-  # Chi2 test
-chi <- chisq.test(cont, correct=FALSE)
-
-chi$expected
-chi$observed
-# the expected count for haplotype0 is low so throws a Warning
-# in this case it is usually recommended to use Fisher's exact test
-fisher.test(cont)
-# it gives very similar p-value (slightly lower)
-
-# I do not actually care about the haplotype0 calls, so perhaps better solution to exclude haplotype0 calls and do Chi2 again
-iedoptag2 <- iedoptag %>%
-  subset(haplotype!=0)
-iedoptag2$haplotype <- factor(iedoptag2$haplotype, levels=c(1,2))
-cont <- table(iedoptag2$isSomatic, iedoptag2$haplotype)
-chi <- chisq.test(cont, correct=FALSE) # p-value ~ 10x lower
-chi
-chi$expected
-chi$observed
-
-# all 3 approaches essentially give the same answer
-# I think will use last (test only on haplotype1/haplotype2)
-
-# get expected counts to add to plot
-hap1exp <- chi$expected[ which(row.names(chi$expected)=='TRUE'), which(colnames(chi$expected)=='1') ]
-hap2exp <- chi$expected[ which(row.names(chi$expected)=='TRUE'), which(colnames(chi$expected)=='2') ]
-
-callsbyHaplotype <- ggplot(cabh, aes(x=haplotype, y=nSom, fill=haplotype)) +
-  geom_col() +
-  #geom_segment(aes(x=0.5, xend=1.5, y=hap1exp, yend=hap1exp), linetype=2, colour='#ebebeb', size=1) +
-  #geom_segment(aes(x=1.5, xend=2.5, y=hap2exp, yend=hap2exp),linetype=2, colour='#ebebeb', size=1) +
-  scale_fill_manual(values=hapcols) +
-  scale_x_discrete(labels=c('OPR mutated', 'OPR reference', 'unassigned')) +
-  theme_minimal() +
-  theme(
-    legend.position='none',
-    #panel.grid.major.x=element_blank(),
-    #panel.grid.minor.x=element_blank(),
-    axis.title.x=element_blank(),
-    # axis.text.x=element_text(angle=90, size=7, margin = margin(t=0, r=0, b=0, l=0)),
-    axis.text.x=element_blank(),
-    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
-    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-  ) +
-  ylab('somatic mutation calls')
-callsbyHaplotype
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsbyHaplotype.pdf'), callsbyHaplotype, width=40, height=60, units='mm')
-}
-
-# with percentages
-  # (should look fairly similar as haplotype1/2 are roughly 50/50)
-  # cannot use haplotype0 calls, very low count but of low total number of reads, so give higher percentage
-  # definitely misleading, will simply exclude for the purpose of this plot
-
-# add total count of each haplotype
-iedoptag2$haplotype <- factor(iedoptag2$haplotype, levels=c(1, 2))
-
-cabh2 <- subset(cabh, haplotype!=0)
-
-cabh2 <- iedoptag2 %>%
-  group_by(haplotype) %>%
-  tally(name='hapReads') %>%
-  left_join(cabh2, by='haplotype')
-
-# add proportion of each haplotype of whole dataset
-cabh2$hapPro <- cabh2$hapReads / nrow(iedoptag)
-
-# compute somatic % of haplotype reads
-cabh2$somPro <- (cabh2$nSom/cabh2$hapReads)
-cabh2$somPer <- (cabh2$nSom/cabh2$hapReads) * 100
-
-# compute expected counts & expected percentages
-# expected count = proportion of haplotype * total number of somatic calls
-cabh2 <- cabh2 %>%
-  mutate(expCount=hapPro * sum(cabh2$nSom)) %>%
-  mutate(expPer=(expCount/hapReads)*100)
-
-cabh2$haplotype <- factor(cabh2$haplotype, levels=c(1, 2))
-
-# plot as above
-callsbyHaplotypePer <- ggplot(cabh2, aes(x=haplotype, y=somPer, fill=haplotype)) +
-  geom_col() +
-  geom_hline(yintercept=cabh2$expPer[1], linetype=2, colour='#ebebeb', size=2) +
-  scale_fill_manual(values=hapcols) +
-  scale_x_discrete(labels=c('OPR mutated', 'OPR reference', 'unassigned')) +
-  theme_minimal() +
-  theme(
-    legend.position='none'
-  ) +
-  ylab('somatic mutation calls (% of reads)')
-callsbyHaplotypePer
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsbyHaplotypePercentage.pdf'), callsbyHaplotypePer, width=100, height=100, units='mm')
-}
-
-
-
-# What are the somatic calls? ---------------------------------------------
-# plot calls vs counts
-# calls as in eg. OPRD1, OPRI2, etc.
-
-# somatic OPR calls are separated in 3 columns mostly to allow cases where a read is both OPRD & OPRI
-
-# but in the calls after mismatch threshold...
-subset(iedSom, !is.na(opri) & !is.na(oprd)) # no row is both OPRI/OPRD
-# and by definition any read that is reference in $othercategory has to be NA in both OPRI & OPRD
-  # >>> so we can merge all three columns in one column
-iedSom$somOPR <- as.character(poolColumns(iedSom, c('othercategory', 'opri', 'oprd')))
-
-# check factor order as I want it
-iedSom$somOPR <- factor(iedSom$somOPR, levels=c(sprintf('OPRD%i', 4:1), 'reference', sprintf('OPRI%i', 1:24)))
-
-# now count number of calls per new OPR
-sowp <- iedSom %>% # number of somatic calls by new OPR 
-  group_by(somOPR) %>%
-  tally(name='nSom')
-
-callsvsnewOPR <- ggplot(sowp, aes(x=somOPR, y=nSom)) +
-  geom_col(fill=NA, colour='#202222') +
-  theme_minimal() +
-  theme(
-    axis.text.x=element_text(angle=90)
-  ) +
-  xlab('') + ylab('somatic mutation calls')
-callsvsnewOPR
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsnewOPR.pdf'), callsvsnewOPR, width=150, height=100, units='mm')
-}
-
-# I do not think I can do in percentages here
-# I do not see percentage of what I would do...
-# actually I think it is not the most useful plot, as we know it is highly dependent on what the sample was originally
-
-# can fill with haplotype as stacked barplot
-sowph <- iedSom %>%
-  group_by(somOPR, haplotype) %>%
-  tally(name='nSom')
-sowph$haplotype <- factor(sowph$haplotype, levels=c(1, 2, 0))
-
-callsvsnewOPRHap <- ggplot(sowph, aes(x=somOPR, y=nSom, fill=haplotype)) +
-  geom_col(position='stack') +
-  theme_minimal() +
-  theme(
-    axis.text.x=element_text(angle=90),
-    axis.title.x=element_blank(),
-    legend.position='none'
-  ) +
-  scale_fill_manual(values=hapcols) +
-  ylab('somatic mutation calls')
-callsvsnewOPRHap
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvsnewOPRHap.pdf'), callsvsnewOPRHap, width=180, height=100, units='mm')
-}
-
-
-
-
-# Number of calls vs somatic OPR change -----------------------------------
-# more interesting than above as includes 'what the sample was'
-# eg. OPRI8 sample, somatic read is haplotype1 and says OPRI6 >>> -2
-# essentially 'how far does it go from the genotype'
-
-# first prepare column somOPRdiff
-# this is somatic OPR difference vs reference (whatever the sample)
-# eg. somatic calls is 8 OPRI >>> +8
-iedSom$somOPRdiff <- as.integer(unlist(sapply(iedSom$somOPR, function(o) {
-  if(substr(o, 1, 4) == 'OPRI') { # if somatic call is an insertion
-    return(as.integer(substr(o, 5, 99))) # return the number of OPR
-  }
-  
-  else if(substr(o, 1, 4) == 'OPRD') { # if somatic call is a deletion
-    return( - as.integer(substr(o, 5, 99))) # return the number of OPR with a minus sign
-  }
-  
-  else if (o == 'reference') { # if somatic call is reference (only possible for compound homozygous sample)
-    return(0)
-  }
-  
-})))
-
-# second prepare exphapOPR column
-# = Sanger OPR diff we would expect for this haplotype
-  # eg. sample is 8 OPRI
-  # haplotype1 read >>> expected +8
-  # haplotype2 read >>> expected 0
-tmp <- unlist(sapply(1:nrow(iedSom), function(r) { # for each somatic read
-  
-  # get that read's sample
-  spl <- iedSom[r, 'sample']
-  # get the SV of that read's sample
-  sv <- iedSom[r, 'SV']
-  # get that read's haplotype
-  hap <- iedSom[r, 'haplotype']
-  
-  # if SV is NA, always return 0 (reference)
-  # Note, it is also ok to return 0 for samples that we could not haplotag but do not have SV
-  if (is.na(sv)) {
-    return(0)
-  } 
-  
-  # if haplotype is 0 and there is an SV, return NA (as we do not know what to expect)
-  if (hap==0 & !is.na(sv)) {
-    return(NA)
-  }
-  
-  else if (spl=='pdg46345' & hap==1) { # if it is the compound homozygous sample and we are looking at haplotype1 (OPRI5)
-    return(as.integer(substr(sv, 5, 99))) # return the number of OPR (+5)
-  }
-  
-  else if (spl=='pdg46345' & hap==2) { # if it is the compound homozygous sample and we are looking at haplotype2 (OPRD1)
-    return(-1) # return -1
-  }
-  
-  else if (substr(sv, 1, 4) == 'OPRI' & hap==1) { # if SV is an insertion and we are looking at haplotype1 (mutated)
-    return(as.integer(substr(sv, 5, 99))) # return the number of OPR
-  }
-  
-  else if (substr(sv, 1, 4) == 'OPRI' & hap==2) { # if SV is an insertion but we are looking at haplotype2 (reference)
-    return(0)
-  }
-  
-  else if(substr(sv, 1, 4) == 'OPRD' & hap==1) { # if SV is a deletion and we are looking at haplotype1 (mutated)
-    return( - as.integer(substr(sv, 5, 99))) # return minus the number of OPR
-  }
-  
-  else if(substr(sv, 1, 4) == 'OPRD' & hap==2) { # if SV is a deletion but we are looking at haplotype2 (reference)
-    return(0)
-  }
-  
-  else{
-    cat('\t \t \t \t >>> Something unexpected row', r, '\n')
-  }
-  
-}))
-
-length(tmp) == nrow(iedSom) # ok can add to iedSom
-iedSom$exphapOPR <- tmp
-
-# most NA should be from samples which have a mutated OPR but were not haplotagged (PDG 53747, 59060, 58648)
-# as we do not know what to 'expect' from that read (was it from mutated OPR haplotype or not?)
-
-# others NA are from unassigned reads (haplotype0) of samples which have a mutated OPR
-
-# Note; samples which do not have a mutated OPR, we know both haplotypes are reference so what we expect the reads to be is reference = 0 OPRdiff
-
-# third prepare column somOPRchange
-  # = 'how far did the somatic call go from what is expected from the haplotype'
-  # eg. 3 OPRI somatic call in a 4 OPRI sample >> -1
-  # first get absolute difference between somatic OPR diff / Sanger OPR diff
-  # then add the appropriate sign
-tmp <- sapply(1:nrow(iedSom), function(re) { # for each read
-  san <- iedSom[re, 'exphapOPR'] # Sanger OPR count for that haplotype
-  som <- iedSom[re, 'somOPRdiff'] # somatic OPR count for that read
-  
-  # if exphapOPR is NA, return NA
-  # (somOPRdiff is never NA)
-  if (is.na(san)) {
-    return(NA)
-  }
-  
-  if (san < som) {
-    # eg. Sanger = -2 / somatic = -1 >> should return +1
-    # eg. Sanger = 0 / somatic = 4 >> should return +4
-    return(abs(san - som))
-  }
-  
-  else if (san > som) {
-    # eg. Sanger = -1 / somatic = -2 >> should return -1
-    # eg. Sanger = 4 / somatic = 0 >> should return -4
-    return( - abs(san - som))
-  }
-})
-
-iedSom$somOPRchange <- tmp # add the column
-
-
-
-# prepare counts for plot
-scoc <- iedSom %>% # somatic counts by somatic OPR change (eg. how many -1)
-  group_by(somOPRchange) %>%
-  tally(name='nSom')
-
-# plot somatic OPR change vs number of calls
-callsvssomaticOPRchange <- ggplot(scoc, aes(x=somOPRchange, y=nSom)) +
-  geom_col(fill=NA, colour='#4D4D4D', width=0.8) +
-  theme_minimal() +
-  theme(
-    panel.grid.minor.x=element_blank(),
-    axis.title.x=element_text(size=9, margin = margin(t=2, r=0, b=0, l=0)),
-    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
-    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
-  ) +
-  scale_x_continuous(breaks=min(scoc$somOPRchange, na.rm=TRUE):max(scoc$somOPRchange, na.rm=TRUE)) +
-  xlab('OPR change vs haplotype') + ylab('somatic mutation calls')
-callsvssomaticOPRchange
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvssomaticOPRchange.pdf'), callsvssomaticOPRchange, width=74, height=70, units='mm')
-}
-  
-# prepare counts, similar as above but adding haplotype resolution
-scoch <- iedSom %>% # somatic counts by somatic OPR change (eg. how many -1) by haplotype
-  group_by(somOPRchange, haplotype) %>%
-  tally(name='nSom')
-
-# make sure haplotype is in order I want
-scoch$haplotype <- factor(scoch$haplotype, levels=c(1, 2, 0))
-
-# would adding haplotype fill informative?
-callsvssomaticOPRchangeHap <- ggplot(scoch, aes(x=somOPRchange, y=nSom , fill=haplotype)) +
-  geom_col() +
-  scale_fill_manual(values=hapcols) +
-  theme_minimal() +
-  theme(
-    legend.position='none'
-  ) +
-  scale_x_continuous(breaks=min(scoc$somOPRchange, na.rm=TRUE):max(scoc$somOPRchange, na.rm=TRUE)) +
-  xlab('OPR change vs haplotype') + ylab('somatic mutation calls')
-callsvssomaticOPRchangeHap
-# there are a few I am suspicious about
-  # eg. -5 calls are haplotype1, reference call from PDG 46345 (5 OPRI/1 OPRD)
-  # it may be more likely to be a +1 from 1 OPRD haplotype
-  # but I do not see any objective reason to exclude those
-
-
-# plot separately inherited (mostly mutated OPR) and sporadic/control (reference OPR)
-scocc <- iedSom %>% # somatic counts by somatic OPR change (eg. how many -1) by cohort
-  group_by(somOPRchange, prion_disease) %>%
-  tally(name='nSom')
-
-# add column inherited or not
-scocc$inhTF <- scocc$prion_disease == 'inherited' # inherited TRUE or FALSE
-
-# make sure inhTF is in order I want
-scocc$inhTF <- factor(scocc$inhTF, levels=c(TRUE, FALSE))
-
-# need to 'fill in' the dataframe so both inherited TRUE & FALSE have all OPRs
-# otherwise, eg. -5 is all inherited, so makes a large bar which is inconsistent with the others
-# or in other words, add the 0 counts (-5 non-inherited >> should be 0 nSom)
-
-# make the template dataframe and left_join it to scocc
-oprcgs <- min(scocc$somOPRchange, na.rm=TRUE) : max(scocc$somOPRchange, na.rm=TRUE) # OPR changes present in the data
-tmpl <- as.data.frame(matrix(nrow=length(oprcgs)*2, ncol=2)) # nrow = all OPR changes for both inherited TRUE and FALSE, so x 2
-colnames(tmpl) <- c('inhTF', 'somOPRchange')
-tmpl$inhTF <- c(rep(TRUE, length(oprcgs)) , rep(FALSE, length(oprcgs)))
-tmpl$inhTF <- factor(tmpl$inhTF, levels=c(TRUE, FALSE))
-tmpl$somOPRchange <- c(oprcgs, oprcgs)
-
-scocc <- left_join(tmpl, scocc, by=c('inhTF', 'somOPRchange'))
-
-# replace nSom NA we added by 0
-scocc[which(is.na(scocc$nSom)) , 'nSom'] <- 0
-
-# add total number of reads from inherited / other
-scocc <- ied %>%
-  mutate(inhTF = factor((ied$prion_disease == 'inherited'))) %>%
-  group_by(inhTF) %>%
-  tally(name='totalInhTF') %>%
-  left_join(scocc, by='inhTF')
-# above changes the factor levels
-scocc$inhTF <- factor(scocc$inhTF, levels=c(TRUE, FALSE))
-
-# compute percentage of cohort (inherited / other)
-scocc$somPer <- (scocc$nSom / scocc$totalInhTF) * 100
-
-# colours for inherited TRUE / FALSE
-inhTFcols <- c('#697a87', '#595E60')
-
-# counts
-callsvssomaticOPRchangeCoh <- ggplot(scocc, aes(x=somOPRchange, y=nSom , fill=inhTF)) +
-  geom_col(position='dodge') +
-  scale_fill_manual(values=inhTFcols) +
-  theme_minimal() +
-  theme(
-    legend.position='none'
-  ) +
-  scale_x_continuous(breaks=min(scoc$somOPRchange, na.rm=TRUE):max(scoc$somOPRchange, na.rm=TRUE)) +
-  xlab('OPR change vs haplotype') + ylab('somatic mutation calls')
-callsvssomaticOPRchangeCoh
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvssomaticOPRchangeCoh.pdf'), callsvssomaticOPRchangeCoh, width=120, height=100, units='mm')
-}
-  
-# percentages
-callsvssomaticOPRchangeCohPer <- ggplot(scocc, aes(x=somOPRchange, y=somPer , fill=inhTF)) +
-  geom_col(position='dodge') +
-  scale_fill_manual(values=inhTFcols) +
-  theme_minimal() +
-  theme(
-    legend.position='none'
-  ) +
-  scale_x_continuous(breaks=min(scoc$somOPRchange, na.rm=TRUE):max(scoc$somOPRchange, na.rm=TRUE)) +
-  xlab('OPR change vs haplotype') + ylab('somatic mutation calls (% of reads)')
-callsvssomaticOPRchangeCohPer
-
-if (exportOrNo) {
-  ggsave(filename=here('needleSam', 'callsvssomaticOPRchangeCohPercentage.pdf'), callsvssomaticOPRchangeCohPer, width=120, height=100, units='mm')
-}
-
-
-
-
-# number of somatic insertion/deletion by tissue --------------------------
-# in https://pubmed.ncbi.nlm.nih.gov/16713248/
-# there is an argument why expansions (insertions) may be more frequent in non-dividing cells
-
-# add a column to iedSom whether OPR change vs expected from haplotype is a deletion or an insertion
-  # i.e. whether iedSom$somOPRchange is positive or negative
-iedSom$somOPRchange_type <- NA
-iedSom$somOPRchange_type[which(iedSom$somOPRchange > 0)] <- 'insertion'
-iedSom$somOPRchange_type[which(iedSom$somOPRchange < 0)] <- 'deletion'
-
-# compare within tissue comparison cohort
-idtc <- iedSom %>% # insertion/deletion tissue comparison
-  subset(cohort=='inherited_tissuecomparison') %>%
-  group_by(sample, individual, gDNA_tissue, somOPRchange_type) %>%
-  tally(name='nSom') %>%
-  pivot_wider(names_from=somOPRchange_type,
-              values_from=nSom)
-# one pair of samples could not be haplotagged, so cannot know if insertion or deletion
-# can delete these samples
-inds2del <- match(pdgs_notag, idtc$sample)
-inds2del <- inds2del[!is.na(inds2del)]
-idtc <- idtc[-inds2del,]
-
-# >>> can see it will not be interesting. There are only 2 insertions, both in blood tissues
-
-
-# Can chimeric reads/index hopping be an issue? ---------------------------
-# stringent with filtering criteria in Guppy and before needleSam.R so I do not think likely
-# but here let's take a few examples to see if likely or not
-
-# somatic calls by new OPR by flowcell
-oprflo <- iedSom %>%
-  group_by(somOPR, flowcell) %>%
-  tally(name='nSom')
-
-# count total number of reads from each flowcell
-# and the counts to oprflo
-oprflo <- ied %>%
-  group_by(flowcell) %>%
-  tally(name='totreads') %>%
-  left_join(x=oprflo, y=., by='flowcell')
-
-# compute somatic calls in % of flowcell
-oprflo$somPer <- (oprflo$nSom / oprflo$totreads) * 100
-
-# convert flowcell to factor
-oprflo$flowcell <- factor(oprflo$flowcell, levels=c(1, 2, 3))
-
-# Example1/ flowcell3 has two samples OPRI8; others have 0
-# >> are there more OPRI8 somatic calls from flowcell3 than flowcell 1 & 2?
-subset(oprflo, somOPR=='OPRI8') # no 8 OPRI call from flowcell3
-
-# Example2/ flowcell3 has four samples OPRI4; other have 0
-nrow(subset(ied, flowcell==3 & opri=='OPRI4')) / nrow(subset(ied, flowcell==3)) # 20+ % of reads on flowcell3 are OPRI4
-# >> are there more OPRI4 somatic calls from flowcell3 than flowcell 1 & 2?
-subset(oprflo, somOPR=='OPRI4')
-# no 4 OPRI call from flowcell3
-
-# and how many somatic mutation from flowcell3 (only samples without a 4 OPRI)
-nrow(subset(iedSom, sample %in% as.character(unlist(subset(meta, flowcell==3 & SV != 'OPRI4', sample)))))
-# >>> I do not believe that chimeric read/index hopping can explain any somatic call
-
-
-
-
-
-# write the somatic calls -------------------------------------------------
-
-# export iedSom
-write.xlsx(iedSom, file=here('needleSam', 'somaticCalls.xlsx'))
-
-# export cabs
-write.xlsx(cabs, file=here('needleSam', 'callsBySample.xlsx'))
-
-
-
-
-
-
-# prepare a shortlist -----------------------------------------------------
-
-# some of the criteria below cannot be applied 'equitably' to all samples
-# only interested here in preparing a shortlist the best somatic mutation calls
-
-# will do by read IDs, safer than row indices
-
-# shortlist somatic calls
-cat('\t \t \t \t >>>', nrow(iedSom), 'calls before shortlisting \n')
-
-callsns <- c() # calls not shortlisted (read IDs)
-
-# 1-- delete any haplotype0 calls from samples we could haplotag
-# (the read is probably messy if Whatshap could not haplotag it)
-tmp <- iedSom[which(iedSom$sample %in% pdgs_tag & iedSom$haplotype==0), 'read']
-cat('\t \t \t \t >>>', lengths(tmp), 'calls are haplotype0 calls even though from a haplotagged sample \n')
-callsns <- c(callsns, tmp)
-
-
-# 2-- decrease allowed mismatches
-
-  # original threshold: mean + sd of mismatch by bp of true reads
-    # makes us keep the ~ 86% most accurate reads
-length(which(ied$true_misbp < thrMis)) / length(which(!is.na(ied$true_misbp)))
-
-# we saw somatic calls had a surplus of reads in the 5% mismatch range even after thresholding
-  # >> more stringent threshold: just mean
-    # makes us keep ~ 55% most accurate reads
-thrMis2 <- mean(ied$true_misbp, na.rm=TRUE)
-length(which(ied$true_misbp < thrMis2)) / length(which(!is.na(ied$true_misbp)))
-
-tmp <- iedSom[which(iedSom$som_misbp > thrMis2) , 'read']
-
-cat('\t \t \t \t >>>', length(tmp), 'calls are above stringent mismatch threshold \n')
-callsns <- c(callsns, tmp)
-
-
-# 3-- exclude any calls with low haplotype assignment score
-tmp <- iedSom[which(iedSom$hapscore < 1.0), 'read'] # setting threshold at 0.5, basically 50% of the expected SNVs are not present
-cat('\t \t \t \t >>>', length(tmp), 'calls have a low haplotype assignment score \n')
-callsns <- c(callsns, tmp)
-
-
-# remove these calls
-callsns <- unique(callsns)
-cat('\t \t \t \t >>> Excluding', length(callsns), 'calls \n')
-
-iedSomsl <- iedSom[ - match(callsns, iedSom$read) ,] # sl for shortlist
-
-cat('\t \t \t \t >>> Shortlisted to', nrow(iedSomsl), 'calls \n')
-
-# in shortlist; any replicates?
-  # i.e. more than 1 read same sample/haplotype/somOPR
-iedSomsl %>%
-  group_by(sample, haplotype, exphapOPR, somOPR, somOPRchange) %>%
-  tally(name='nSom') %>%
-  subset(nSom>1)
-
-
-
-
-# reads for figure --------------------------------------------------------
-
-# alignments in figure
-# from top to bottom
-figreads <- c('8f9756e2-1fb9-41d7-a6bf-a11463a55990',
-              'a5b5b295-3e6d-4f48-8839-24aa3fd50718',
-              'e1c59ac3-a547-4616-a120-4dd7d3463e24',
-              '2f8a89cb-73d7-4591-af8b-f814ac373a24',
-              'e67c028a-35e2-44b0-80b8-45ccaa100392',
-              '9d387703-c39e-47df-bd96-49b0a1d62b4d',
-              '86f5c651-7ce0-4778-a55f-bc01cacec79b',
-              'e43b902e-cbaf-4feb-9170-51caecb1cf81',
-              'aa058a24-7e53-4d55-ad91-f5dcdc54602a',
-              '5a5e7247-0550-4f16-aaf2-e51ad2b5c9a5')
-
-length(figreads)
-
-figReds <- ied[match(figreads, ied$read),]
-
-if (exportOrNo) {
-  write.xlsx(figReds, file=here('needleSam', 'figureSomaticReads.xlsx'))
-}
-
-
-
-# write shortlist ---------------------------------------------------------
-
-# write the shortlist
-
-if (exportOrNo) {
-  write.xlsx(iedSomsl, file=here('needleSam', 'somaticCallsShortlist.xlsx'))
-}