diff --git a/GCplot/GCplotter.R b/GCplot/GCplotter.R
new file mode 100644
index 0000000..8f1568a
--- /dev/null
+++ b/GCplot/GCplotter.R
@@ -0,0 +1,111 @@
+# takes a sequence, plots GC%
+
+library(here)
+library(data.table)
+library(ggplot2)
+
+# GC percent --------------------------------------------------------------
+# rolling sum in window
+
+GCpercent <- function(sequence, window) {
+  
+  seqsplit <- strsplit(sequence, split='')[[1]]
+  
+  gc1 <- as.numeric(sapply(seqsplit, function(nu){
+    if(nu=='A' | nu=='T') return(0)
+    if(nu=='C' | nu=='G') return(1)
+  }))
+  
+  gcroll <- frollsum(gc1, window,
+                     fill=NA, align='right')
+  
+  gcpe <- gcroll/window
+  
+  return(gcpe)
+  
+}
+
+# GC plot -----------------------------------------------------------------
+
+GCplot <-  function(GCpercent) {
+  
+  gcdf <- cbind(1:length(GCpercent), as.data.frame(GCpercent))
+  colnames(gcdf) <- c('pos', 'gcp')
+  
+  gcgg <- ggplot(gcdf, aes(x=pos, y=gcp)) +
+    geom_line(colour='#5d5e5d', size=0.5) +
+    coord_cartesian(ylim=c(0,1)) +
+    theme_minimal() +
+    theme(
+      axis.title.x=element_blank(),
+      axis.title.y=element_blank(),
+      axis.text.x=element_blank(),
+      axis.text.y=element_blank(),
+      panel.grid.minor.x=element_blank(),
+      panel.grid.minor.y=element_blank(),
+      panel.grid.major.x=element_blank()
+    )
+    
+  
+  return(gcgg)
+  
+}
+
+
+# GC plot – barplot -------------------------------------------------------
+
+GCplotbar <-  function(GCpercent) {
+  
+  gcdf <- cbind(1:length(GCpercent), as.data.frame(GCpercent))
+  colnames(gcdf) <- c('pos', 'gcp')
+  
+  gcgg <- ggplot(gcdf, aes(x=pos, y=gcp)) +
+    geom_col(colour='#5d5e5d', size=0.5) +
+    coord_cartesian(ylim=c(0,1)) +
+    theme_minimal() +
+    theme(
+      axis.title.x=element_blank(),
+      axis.title.y=element_blank(),
+      axis.text.x=element_blank(),
+      axis.text.y=element_blank(),
+      panel.grid.minor.x=element_blank(),
+      panel.grid.minor.y=element_blank(),
+      panel.grid.major.x=element_blank()
+    )
+  
+  
+  return(gcgg)
+  
+}
+
+
+# to use ------------------------------------------------------------------
+
+# 1- import sequence in fasta
+fasta <- here('GCplot', 'prnp_window1.fa')
+seq <- as.character(read.table(fasta, skip=1))
+
+# 2- run GCpercent & GCplot, with desired window
+gcp1 <- GCplotbar(GCpercent(seq, 100))
+
+ggsave((here('GCplot', 'gc_window1.pdf')), plot=gcp1, width=65, height=18, units='mm')
+
+# second half
+# 1- import sequence in fasta
+fasta <- here('GCplot', 'prnp_window2.fa')
+seq <- as.character(read.table(fasta, skip=1))
+
+# 2- run GCpercent & GCplot, with desired window
+gcp2 <- GCplotbar(GCpercent(seq, 100))
+
+ggsave((here('GCplot', 'gc_window2.pdf')), plot=gcp2, width=52, height=18, units='mm')
+
+# window3
+# 1- import sequence in fasta
+fasta <- here('GCplot', 'prnp_window3.fa')
+seq <- as.character(read.table(fasta, skip=1))
+
+# 2- run GCpercent & GCplot, with desired window
+gcp3 <- GCplotbar(GCpercent(seq, 100))
+
+ggsave(here('GCplot', 'gc_window3.pdf'), plot=gcp3, width=32, height=18, units='mm')
diff --git a/GCplot/gc_window1.pdf b/GCplot/gc_window1.pdf
new file mode 100644
index 0000000..e601b0c
Binary files /dev/null and b/GCplot/gc_window1.pdf differ
diff --git a/GCplot/gc_window2.pdf b/GCplot/gc_window2.pdf
new file mode 100644
index 0000000..f379276
Binary files /dev/null and b/GCplot/gc_window2.pdf differ
diff --git a/GCplot/gc_window3.pdf b/GCplot/gc_window3.pdf
new file mode 100644
index 0000000..d97b953
Binary files /dev/null and b/GCplot/gc_window3.pdf differ
diff --git a/GCplot/prnp_window1.fa b/GCplot/prnp_window1.fa
new file mode 100644
index 0000000..a5fad21
--- /dev/null
+++ b/GCplot/prnp_window1.fa
@@ -0,0 +1,2 @@
+> prnp_window1
+TTCTACGCCTCAAAATTTAAGAGTTTATGTGAAAATTCATAAATATTAATCTCAATCCAGGTTAAGCAAAATTTTTTGCTCTCCTCTTTAGAAATTTCTGGTTGCCAAAGTTCCAGAAATTGCTTCCTCATTCCTGAGCCTTTCATTTTCTCGATTTCTCCATTATGTAACGGGGAGCTGGAGCTTTGGGCCGAATTTCCAATTAAAGATGATTTTTACAGTCAATGAGCCACGTCAGGGAGCGATGGCACCCGCAGGCGGTATCAACTGATGCAAGTGTTCAAGCGAATCTCAACTCGTTTTTTCCGGTGACTCATTCCCGGCCCTGCTTGGCAGCGCTGCACCCTTTAACTTAAACCTCGGCCGGCCGCCCGCCGGGGGCACAGAGTGTGCGCCGGGCCGCGCGGCAATTGGTCCCCGCGCCGACCTCCGCCCGCGAGCGCCGCCGCTTCCCTTCCCCGCCCCGCGTCCCTCCCCCTCGGCCCCGCGCGTCGCCTGTCCTCCGAGCCAGTCGCTGACAGCCGCGGCGCCGCGAGCTTCTCCTCTCCTCACGACCGAGGCAGGTAAACGCCCGGGGTGGGAGGAACGCGGGCGGGGGCAGGGGAGCCGCGGGGGCCGAGTGAGGACCCCGGGCCTCGGGTCCCAGGCGCAAGGGTGCCCGGCCGGGCGGGGTCGGGACCCCAGTGAGGAGGGGCCGGGGGCTGCCCCGCGGGCGCGTGACGCGTCTCGGGCCTGCCCGGCTGCGCTGGTCTCCGCTCGGGTGAGGCGGCTTGGCTTCGCTTTTCAGGTTAGGAAAGCTCCCTTTACTGCGCGTTGGGGGGCTGGGGGAGCTGGCGGAGCCCCGTTAGGGAGGTCGGTGGCGCCGGGGTGTCTCAGCGCCCCCTGCACCCCGCGCGGGTCCGGCCCAGCGGGCGATCGCTGGCGCCCAGGGAACTCCGGGAGGGCCGCCAGCGGGCTCCGCAGGGCGCGGGGCGGGGAGGGGCGCCTGGGGGCCGCGGGGCTCGCGCTCCCCGCCCGTTGGCCGCCCCTCGGAGGCCGAGATCGGGGCCCAGAACGCCCCTTGGCAAGGCCTGGCGCTTCCGCGATGCCCAGAGGGTGCTTGGGGGGATGGAGAGAGGGGCGCCCGCCGGGGGAGTTCCGGGAGCCTCGGTGCCTCCCGCCGCAGCTGCAGCGTTCCTCCCGGGAGGCGGCCCAGCCCTTCATCCTCGCCGCCTGAGCTTCTCCGAGGGGGGCTGCAGCCTTGCGGCCGTTGCCACCGCCTGGAGAAGCGGCCCACGCGGACTGACGGGCGGGGGCGGGGCCTCGGGCCTCGGCGGGGGCGGGGTCCGGGGAGGCCCCACCCTCTGTTCTCCAGGGGCGGGGAGAGAGGAGCTGCAGGTCTGCGGCCTGGCCCCAGGTGCGATGGCGGACCCCAGCTTGGCCAGTCACATTCCTCCCAGTCCCCCTGGAGGGAGAACGCTGGCCATGGGGGGCTCCAAGGAACAACCAGCCTCGGATGACGACCCTTGGGTCACCGGTCTCCCCACCTGTGCGGCAGGCGCCTTCACGTTTCATTATTAAACAATGGGGAGAAATCCATGTTTACTGTCCTTTTTAAGGAATTTTTTGCTCTTCTCTTTGAGGTGGCTGTAGGAAATAGATTTTTTTTTTAACCTCGCAATTCCACCACGGTCACATCCATCCTCGCCATCGCAGAGCCACAGCTCTCCGTTTTTGTTTCCTAGCCTCCAGATTCTCACACAACACAGTGCAGTTTCACTGCTGTAATGATGAGGATCTTCATGGCCGCGTTATTTTCTTGTTCTGAGAGCATCACGGTTTAATTAGCAGTTCCCCATATGATTTGAAGTGTTTCCCGTTTCCTTAGGGAAAACTCCTGGTAGAATAGGATTAAGGATTTTTACAAATATAATTATCAAAAACATAGGAACAGGGAATTGGATAAATATGTTAAACTTCTGGAAAAATCAACAACGCTCTTAGATTTGTAGAAGAAAGGAAAAAATCACCAGTGGAAAGGAGCAATTTTACTTACACAAACACAGAGAAGGTCTTACAGTGAAAAAAAGCTAACCAGTAAGGGGAAAAGCAGGCAGAGGGGTAGGATGTGATTTGTATGTTATTTATATCTAACACAAGTCTTCCAC
diff --git a/GCplot/prnp_window2.fa b/GCplot/prnp_window2.fa
new file mode 100644
index 0000000..4b947f3
--- /dev/null
+++ b/GCplot/prnp_window2.fa
@@ -0,0 +1,2 @@
+> prnp_window2
+GTTAAATAATGAACTAAAAGTCATTCATCAAGTCCATAACTTAGGGTCACATTTGTCCTTGGAGCAGGAGAAAGAGTTGTGTTCACCCTTTTCTTACTTTTGCTTTTGTCCTAAGTGCTTCAGAGAAGTACAGGGTGGCAACAGTGTTTCTACTGAGCAGCTGATACCATTGCTATGCACTCATTCATTATGCAGGAAACATTTAGTAATTTCAACATAAATATGGGACTCTGACGTTCTCCTCTTCATTTTGCAGAGCAGTCATTATGGCGAACCTTGGCTGCTGGATGCTGGTTCTCTTTGTGGCCACATGGAGTGACCTGGGCCTCTGCAAGAAGCGCCCGAAGCCTGGAGGATGGAACACTGGGGGCAGCCGATACCCGGGGCAGGGCAGCCCTGGAGGCAACCGCTACCCACCTCAGGGCGGTGGTGGCTGGGGGCAGCCTCATGGTGGTGGCTGGGGGCAGCCTCATGGTGGTGGCTGGGGGCAGCCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAAGGAGGTGGCACCCACAGTCAGTGGAACAAGCCGAGTAAGCCAAAAACCAACATGAAGCACATGGCTGGTGCTGCAGCAGCTGGGGCAGTGGTGGGGGGCCTTGGCGGCTACATGCTGGGAAGTGCCATGAGCAGGCCCATCATACATTTCGGCAGTGACTATGAGGACCGTTACTATCGTGAAAACATGCACCGTTACCCCAACCAAGTGTACTACAGGCCCATGGATGAGTACAGCAACCAGAACAACTTTGTGCACGACTGCGTCAATATCACAATCAAGCAGCACACGGTCACCACAACCACCAAGGGGGAGAACTTCACCGAGACCGACGTTAAGATGATGGAGCGCGTGGTTGAGCAGATGTGTATCACCCAGTACGAGAGGGAATCTCAGGCCTATTACCAGAGAGGATCGAGCATGGTCCTCTTCTCCTCTCCACCTGTGATCCTCCTGATCTCTTTCCTCATCTTCCTGATAGTGGGATGAGGAAGGTCTTCCTGTTTTCACCATCTTTCTAATCTTTTTCCAGCTTGAGGGAGGCGGTATCCACCTGCAGCCCTTTTAGTGGTGGTGTCTCACTCTTTCTTCTCTCTTTGTCCCGGATAGGCTAATCAATACCCTTGGCACTGATGGGCACTGGAAAACATAGAGTAGACCTGAGATGCTGGTCAAGCCCCCTTTGATTGAGTTCATCATGAGCCGTTGCTAATGCCAGGCCAGTAAAAGTATAACAGCAAATAA
diff --git a/GCplot/prnp_window3.fa b/GCplot/prnp_window3.fa
new file mode 100644
index 0000000..5c36f9a
--- /dev/null
+++ b/GCplot/prnp_window3.fa
@@ -0,0 +1,2 @@
+> prnp_window3
+CTAGACACTGAAGGCAAATCTCCTTTGTCCATTTACCTGGAAACCAGAATGATTTTGACATACAGGAGAGCTGCAGTTGTGAAAGCACCATCATCATAGAGGATGATGTAATTAAAAAATGGTCAGTGTGCAAAGAAAAGAACTGCTTGCATTTCTTTATTTCTGTCTCATAATTGTCAAAAACCAGAATTAGGTCAAGTTCATAGTTTCTGTAATTGGCTTTTGAATCAAAGAATAGGGAGACAATCTAAAAAATATCTTAGGTTGGAGATGACAGAAATATGATTGATTTGAAGTGGAAAAAGAAATTCTGTTAATGTTAATTAAAGTAAAATTATTCCCTGAATTGTTTGATATTGTCACCTAGCAGATATGTATTACTTTTCTGCAATGTTATTATTGGCTTGCACTTTGTGAGTATTCTATGTAAAAATATATATGTATATAAAATATATATTGCATAGGACAGACTTAGGAGTTTTGTTTAGAGCAGTTAACATCTGAAGTGTCTAATGCATTAACTTTTGTAAGGTACTGAATACTTAATATGTGGGAAACCCTTTTGCGTGGTCCTTAGGCTTACAATGTGCACTGAATCGTTTCATGTAAGAATCCAAAGTGGACACCATTAACAGGTCTTTGAAATATGCATGTACTTTATATTTTCTATATTTGTAACTTTGCATGTTCTTGTTTTGTTATATAAAAAAATTGTAAATGTTTAATATCTGACTGAAATTAAACGAGCGAAGATGAGCACCACC
diff --git a/OPR_litsurvey/OPRLitCatalog.xlsx b/OPR_litsurvey/OPRLitCatalog.xlsx
new file mode 100644
index 0000000..475bc5a
Binary files /dev/null and b/OPR_litsurvey/OPRLitCatalog.xlsx differ
diff --git a/OPR_litsurvey/OPRLitCatalogSeq.xlsx b/OPR_litsurvey/OPRLitCatalogSeq.xlsx
new file mode 100644
index 0000000..9a85d2d
Binary files /dev/null and b/OPR_litsurvey/OPRLitCatalogSeq.xlsx differ
diff --git a/OPR_litsurvey/eachOPR.xlsx b/OPR_litsurvey/eachOPR.xlsx
new file mode 100644
index 0000000..70bcdd9
Binary files /dev/null and b/OPR_litsurvey/eachOPR.xlsx differ
diff --git a/OPR_litsurvey/seqFromOPRpattern.R b/OPR_litsurvey/seqFromOPRpattern.R
new file mode 100644
index 0000000..bc71862
--- /dev/null
+++ b/OPR_litsurvey/seqFromOPRpattern.R
@@ -0,0 +1,63 @@
+# generate sequence from OPR pattern
+
+library(openxlsx)
+library(here)
+library(dplyr)
+library(tidyr)
+
+
+# import OPR patterns -----------------------------------------------------
+
+oprs <- read.xlsx(here('OPR_litsurvey', 'OPRLitCatalog.xlsx'))
+
+# any duplicates in pattern?
+sum(duplicated(oprs$pattern)) # No, OK
+
+# any duplicates in allele?
+sum(duplicated(oprs$allele)) # No, OK
+
+
+# import reference for each OPR -------------------------------------------
+
+oref <- read.xlsx(here('OPR_litsurvey', 'eachOPR.xlsx'))
+
+
+
+# function to generate full sequence from a pattern -----------------------
+
+pattern2Seq <- function(pattern) {
+  
+  oseqs <- sapply(strsplit(pattern, '/')[[1]],
+                  function(r){
+                    as.character(subset(oref, Rname==r, sequence))
+                  })
+  
+  fullseq <- paste(oseqs, collapse='')
+  
+  return(fullseq)
+  
+}
+
+
+# apply the function ------------------------------------------------------
+
+oprs$fullsequence <- sapply(oprs$pattern,
+                            function(pa){
+                              pattern2Seq(pa)
+                              })
+
+
+# add OPRD1_Lee2016 sequence manually -------------------------------------
+# allele OPRD1_Lee2016 is a deletion that overlaps two OPRs, add sequence manually
+
+oprs[which(oprs$allele=='OPRD1_Lee2016'), 'fullsequence'] <-
+  'CCTCAGGGCGGTGGTGGCTGGGGGCAGCCTCATGGTGGTGGCTGGGGGCAGCCTCATGGTGGTGGCTGGGGGCAGCCCCATGGTGGTGGCTGGGGTCAA'
+
+# check that all the lengths make sense
+which((nchar(oprs$fullsequence) == oprs$OPR_length)==FALSE) # all correct
+
+
+# write the file ----------------------------------------------------------
+
+# will need to copy the $pattern column from original file to keep the colouring
+write.xlsx(oprs, here('OPR_litsurvey', 'OPRLitCatalogSeq.xlsx'))
diff --git a/README.md b/README.md
index 094012a..9c328e7 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,285 @@
-# prnp_nanopore
\ No newline at end of file
+# prnp_nanopore
+
+Data and code for
+
+François Kroll\*\, Athanasios Dimitriadis\*\, Tracy Campbell, Lee Darwent, John Collinge, Simon Mead, Emmanuelle Viré. 2022.
+
+**Prion protein gene mutation detection using long-read Nanopore sequencing.**
+
+_co-first authors*_
+
+https://www.medrxiv.org/content/10.1101/2022.03.06.22271294v1
+
+
+Below explains how to recreate the analyses.  
+
+Please cite if you use some of the data or code.  
+
+___
+
+## about .command bash scripts
+
+I use macOS.  
+.command bash scripts are included in /utilities/.  
+
+Add /utilities/ to PATH so scripts (and other scripts they depend on) are found.  
+
+You probably need to add permissions for each .command script in /utilities/ with
+
+    chmod u+x ~/.../utilities/XXX.command
+
+___
+
+Get in touch for questions
+
+  * [![alt text][1.2]][1] [@francois_kroll](https://twitter.com/francois_kroll)
+
+  * :email: francois@kroll.be
+
+<!-- icons with padding -->
+[1.1]: http://i.imgur.com/tXSoThF.png (twitter icon with padding)
+
+<!-- icons without padding -->
+[1.2]: http://i.imgur.com/wWzX9uB.png (twitter icon without padding)
+
+<!-- links to your social media accounts -->
+[1]: https://twitter.com/francois_kroll
+
+___
+
+## GC% plot
+
+Refers to GC% plot in Figure 1A.  
+
+In directory /GCplot/, find `GCplotter.R`
+
+Input =
+* prnp_window1.fa
+* prnp_window2.fa
+* prnp_window3.fa
+
+___
+
+## Filtering the gene-body bam alignment files
+
+Directory /genebodyBams/ contains .bam & .bam.bai files for each sample, gene-body amplicon.  
+Please find /genebodyBams/ in Zenodo archive at XXX.  
+
+In directory /utilities/, find `prnp_filterBam.command`.
+
+Script loops through bam files in /genebodyBams/ and filters each with command  
+
+    filterBam.command -i $bam -f 10000 -c 15000 -s 0.05 -p yes -o $out  
+
+Read about filtering parameters in comments in `prnp_filterBam.command`.
+
+`filterBam.command` is included in /utilities/.  
+
+About `filterBam.command`:
+* it calls a R script `readsToThrow.R`, included in /utilities/. Path is hard-coded.
+* Note another hard-coded path to `picard.jar`  
+
+In summary, to filter the gene-body bam files:
+
+    cd ~/.../prnp_nanopore/
+    prnp_filterBam.command
+
+Which creates filtered version of each bam file in folder /bamfilt/.  
+You can directly find this folder at Zenodo archive XXX.  
+
+___
+
+## Filtering the promoter bam alignment files
+
+Same logic as for the gene-body amplicon above.  
+
+Directory /promoterBams/ contains .bam & .bam.bai files for each sample, promoter amplicon.  
+Please find /promoterBams/ in Zenodo archive at XXX.  
+
+In directory /utilities/, find `promoter_filterBam.command`.
+
+Script loops through bam files in /promoterBams/ and filters each with command  
+
+    filterBam.command -i $bam -f 1000 -c 3500 -s 0.15 -p yes -o $out
+
+Read about filtering parameters in comments in `promoter_filterBam.command`.
+
+`filterBam.command` is included in /utilities/. It is the same script as used for the gene-body bam files above.  
+
+In summary, to filter the promoter bam files:
+
+    cd ~/.../prnp_nanopore/
+    promoter_filterBam.command
+
+Which creates filtered version of each bam file in folder /promoterbamfilt/.  
+You can directly find this folder at Zenodo archive XXX.  
+
+___
+
+## Haplotype phasing
+
+Haplotype phasing is done with `whatshap v1.1`.  
+
+The first step is to phase the SNVs in each sample's VCF file.  
+
+The SNVs were called by `nanopolish v0.13.2`. Find this step in athanadd's repo https://github.com/athanadd/prnp-nanopore-seq.
+
+Typically, haplotype phasing would be performed directly on the VCF from the variant calling algorithm, here from nanopolish. Here, we re-wrote a VCF file for each sample containing only SNVs after strand bias filtering, using _AdditionalFile1.xlsx_ sheet _SNVs_filtered_ as input.   
+
+In directory /utilities/ find `prepareVCFforWhatshap.R`.  
+Input is _AdditionalFile1.xlsx_, especially sheet _SNVs_filtered_.
+
+`prepareVCFforWhatshap.R` will write one VCF file for each sample, except for five samples which do not carry any SNV:
+* #58648
+* #54890
+* #55050
+* #59060
+* #53747
+
+As these samples do not carry any SNV, they cannot be haplotype-phased, hence there is no use creating a VCF file for them.  
+
+Output folder for the new VCF files is /haplotypephasing/vcf_unphased/. Each VCF is called _SAMPLEID_wha.vcf_, wha is for whatshap.  
+
+Note, whatshap cannot phase a single SNV, but a single heterozygous SNV is sufficient to haplotag the reads. Therefore, if the sample only carries a single SNV, `prepareVCFforWhatshap.R` will write the VCF 'as if' it already went through `whatshap`, i.e. it is written directly in /haplotypephasing/vcf_phased/ and is called _SAMPLEID_whap.vcf_, whap is for whatshap phased.  
+
+Once the VCFs are ready, the actual phasing is done by script `prnp_haplotypePhasing.command`. Find it in /utilities/.  
+
+The key steps `prnp_haplotypePhasing.command` are:  
+* phase the SNVs in each VCF using `whatshap phase`. For each sample, it writes phased VCF in /haplotypephasing/vcf_phased/ called _SAMPLEID_whap.vcf_, whap is for whatshap phased.  
+* using the phased VCF, haplotag the reads with `whatshap haplotag`. For each sample, it finds its filtered BAM in /bamfilt/.  
+
+The above is called possibility 2 in `prnp_haplotypePhasing.command`. Possibility 1 is for the samples which carry a single SNV. For these, `prepareVCFforWhatshap.R` directly created a phased VCF, so we skip `whatshap phase` and directly haplotag the reads (the filtered BAM file) using `whatshap haplotag`.  
+
+Note `prnp_haplotypePhasing.command` has hardcoded calls to the human reference genome hg38.fa. Cmd + F "hg38.fa" to find and modify them.  
+
+For each sample which can be haplotype phased/haplotagged, the main output from `prnp_haplotypePhasing.command` should be a phased VCF file in /haplotypephasing/vcf_phased/ and a haplotagged BAM in /haplotypephasing/bam_haplotagged/ called _SAMPLEID_hp.bam_ (hp for haplotype).  
+
+Five samples cannot be haplotype-phased as they do not carry any SNV. We want to manually move the BAM for these samples in a new folder /haplotypephasing/bam_notag/.  
+
+    mkdir haplotypephasing/bam_notag/
+
+    cp bamfilt/58648_f.bam haplotypephasing/bam_notag/58648_f.bam
+    cp bamfilt/58648_f.bam.bai haplotypephasing/bam_notag/58648_f.bam.bai
+
+    cp bamfilt/54890_f.bam haplotypephasing/bam_notag/54890_f.bam
+    cp bamfilt/54890_f.bam.bai haplotypephasing/bam_notag/54890_f.bam.bai
+
+    cp bamfilt/55050_f.bam haplotypephasing/bam_notag/55050_f.bam
+    cp bamfilt/55050_f.bam.bai haplotypephasing/bam_notag/55050_f.bam.bai
+
+    cp bamfilt/59060_f.bam haplotypephasing/bam_notag/59060_f.bam
+    cp bamfilt/59060_f.bam.bai haplotypephasing/bam_notag/59060_f.bam.bai
+
+    cp bamfilt/53747_f.bam haplotypephasing/bam_notag/53747_f.bam
+    cp bamfilt/53747_f.bam.bai haplotypephasing/bam_notag/53747_f.bam.bai
+
+You can find directly folders    
+* /haplotypephasing/vcf_unphased/
+* /haplotypephasing/vcf_phased/
+* /haplotypephasing/bam_haplotagged/
+* /haplotypephasing/bam_notag/
+
+at the Zenodo archive XXX.  
+
+___
+
+## Trim the reads to keep only the octapeptide repeat region (OPR)
+
+This is performed by `prnp_OPR.command`, found in /utilities/. It is ran once on all the BAM in /haplotypephasing/bam_haplotagged/ and once on all the /haplotypephasing/bam_notag/ (see below).  
+
+For each sample (BAM file), the key steps are:  
+* trim the reads in the BAM to keep only the OPR using `samtools ampliconclip`. Note, this step uses _OPRpos.bed_ in /needleSam/ folder, which are the positions to trim to keep only the OPR, i.e. the positions to clip/exclude. Beware, the path to _OPRpos.bed_ is hardcoded. The main output is the BAM containing only OPR reads, written in new folder /OPRtrim/ and named _SAMPLEID_opr.bam_. The step also creates a log file in new folder /clipLogs/, which we do not use further.  
+* filter the trimmed BAM (only OPR reads) using `filterBam.command`. Read the comments for explanations about the filtering parameters. This step will partly repeat the first filtering performed on the BAM files.  
+* count read lengths. This step will write a TXT file for each sample in new folder /readlengths/ named _SAMPLEID_lengths.txt_. The TXT file has two column: 1– number of reads of that length / 2– read length.  
+* convert the trimmed BAM to SAM format, which we will use in `needleSam.R` (see below).    
+
+First we run `prnp_OPR.command` on the haplotagged BAM files:  
+
+    cd ~/Dropbox/nanopore/haplotypephasing/bam_haplotagged/
+    prnp_OPR.command
+
+Second we run `prnp_OPR.command` on the BAM files which could not be haplotagged:  
+
+    cd ~/Dropbox/nanopore/haplotypephasing/bam_notag/
+    prnp_OPR.command
+
+Accordingly, folders
+* /clipLogs/
+* /OPRtrim/
+* /OPRtrim_sam/
+* /readlengths/
+
+are found in the folder /bam_haplotagged/ or in the folder /bam_notag/ depending on the sample.  
+
+As in the previous step, you can find directly folders    
+* /haplotypephasing/vcf_unphased/
+* /haplotypephasing/vcf_phased/
+* /haplotypephasing/bam_haplotagged/
+* /haplotypephasing/bam_notag/
+
+at the Zenodo archive XXX.  
+
+___
+
+## Generate catalog of OPR templates
+
+Finding candidate somatic mutations of the OPR will involve aligning OPR reads to template OPR sequences.  
+
+Please look at _AdditionalFile1.xlsx_, sheet _OPRconsensus_ at this stage. The OPR templates are built from a consensus sequence of one R (R being a single repeat unit) which makes use of IPUPAC flexible nucleotide codes (see https://www.bioinformatics.org/sms/iupac.html). By consensus sequence, we mean that all of R2, R3, R4 are included in the consensus sequence. For example, the first codon of the repeat unit is always CCT or CCC. Accordingly, it is written as CCY in the consensus sequence, where Y can be either C or T. R1 has an extra codon compared to R2, R3, R4 so it can be written with exactly the same consensus sequence but it is also written with flexible nucleotides to allow for the same changes at Wobble positions.  
+
+The catalog of OPR templates is generated by `generateOPRCatalog.R`, found in /needleSam/. Finding a somatic mutation of the OPR is like finding a needle in a SAM alignment file, hence 'needleSam'.  
+
+`generateOPRCatalog.R` writes _OPRConsensusCatalog.csv_ in folder /needleSam/. It contains 29 OPR templates, from 4 OPRD (deletion of all R repeats except R1) up to 24 OPRI (insertion of 24 extra R repeats). The table was copied to _AdditionalFile1.xlsx_, sheet _OPRtemplates_.  
+
+
+___
+
+## Somatic mutation search
+
+This is performed by `needleSam.R`, found in /needleSam/.  
+
+Inputs to `needleSam.R` are some sheets from _AdditionalFile1.xlsx_ and, most importantly, the OPR trimmed SAM files from above, found in folders /haplotypephasing/bam_haplotagged//OPRtrim_sam/ and /haplotypephasing/bam_notag/OPRtrim_sam/.  
+
+Follow the code/comments in `needleSam.R`. The key steps are:  
+* Import the SAM files  
+* Calculate the total insertion/deletion of each read by parsing the CIGAR  
+* Assign each read to a most likely OPR genotype based on its total insertion/deletion  
+* Align each read to its most likely OPR template sequence  
+* Calculate the maximum number of mismatches allowed based on the set of 'true reads', i.e. the reads which confirm the Sanger genotype of their sample.  
+* Identify candidate somatic OPR reads. A read is a candidate somatic OPR read if it is unexpected from its sample (typically it is different than reference or the known mutated OPR) and passes the mismatch threshold.  
+
+Note, the last step is actually performed by `needleSam_exploration.R`, see below.  
+
+The main output of `needleSam.R` is _allOPRreads.csv_, found in /needleSam/. It stores all the OPR reads from the various SAM files, with various information about each read, e.g. which is its most likely OPR and how many mismatches did it have with that OPR template.
+
+This the main analysis which I hope could be useful to someone else, so do not hesitate to get in touch for questions!
+
+francois@kroll.be or twitter @francois_kroll.
+
+___
+
+## Exploring candidate somatic mutations
+
+This is performed by `needleSam_exploration.R`, found in /needleSam/.  
+
+The main input is the database _allOPRreads.csv_ created by `needleSam.R`.  
+
+`needleSam_exploration.R` first identifies the candidate somatic mutations (see above), then does various analyses and plots.  It also writes _somaticCalls.xlsx_, which was copied in _AdditionalFile1.xlsx_, sheet _somaticmutationcalls_, minus some unnecessary columns. Read about the columns in _AdditionalFile1.xlsx_, sheet _TableofContents_.  
+
+Note, some personal information including precise ages were removed from _AdditionalFile1.xlsx_, sheet _samples_ to protect anonymity. Accordingly, some sub-analyses (not included in the publication) of `needleSam_exploration.R` will throw errors.  
+
+___
+
+## Control PCR
+
+___
+
+## Literature survey of OPR genotypes  
+
+Before having the idea of the OPR consensus sequence, we created a survey of all mutated OPRs reported in literature. The approach using the OPR consensus sequence is better as it generalises to more possible cases, but we are including this survey if it is any useful. It may not be exhaustive, but it should be close.  
+
+Find _OPRLitCatalog.xlsx_ in /OPR_litsurvey/.
+
+Publications typically give the R1/R2/... pattern and not the full nucleotide sequence. Script `seqFromOPRpattern.R` generates the full OPR sequence from the R1/R2/... pattern. It uses `eachOPR.xlsx` for the sequence of each repeat.  
+
+___
diff --git a/needleSam/OPRConsensusCatalog.csv b/needleSam/OPRConsensusCatalog.csv
new file mode 100644
index 0000000..02f9891
--- /dev/null
+++ b/needleSam/OPRConsensusCatalog.csv
@@ -0,0 +1,30 @@
+"total_R","n_Rx","genotype","OPR_length","pattern","consensusseq"
+1,0,"OPRD4",27,"R1/","CCYCAKGGYGGYGGYGGYTGGGRDCAR"
+2,1,"OPRD3",51,"R1/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+3,2,"OPRD2",75,"R1/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+4,3,"OPRD1",99,"R1/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+5,4,"reference",123,"R1/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+6,5,"OPRI1",147,"R1/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+7,6,"OPRI2",171,"R1/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+8,7,"OPRI3",195,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+9,8,"OPRI4",219,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+10,9,"OPRI5",243,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+11,10,"OPRI6",267,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+12,11,"OPRI7",291,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+13,12,"OPRI8",315,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+14,13,"OPRI9",339,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+15,14,"OPRI10",363,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+16,15,"OPRI11",387,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+17,16,"OPRI12",411,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+18,17,"OPRI13",435,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+19,18,"OPRI14",459,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+20,19,"OPRI15",483,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+21,20,"OPRI16",507,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+22,21,"OPRI17",531,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+23,22,"OPRI18",555,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+24,23,"OPRI19",579,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+25,24,"OPRI20",603,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+26,25,"OPRI21",627,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+27,26,"OPRI22",651,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+28,27,"OPRI23",675,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
+29,28,"OPRI24",699,"R1/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/Rx/","CCYCAKGGYGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCARCCYCAKGGYGGYGGYTGGGRDCAR"
diff --git a/needleSam/OPRpos.bed b/needleSam/OPRpos.bed
new file mode 100644
index 0000000..79e6ff6
--- /dev/null
+++ b/needleSam/OPRpos.bed
@@ -0,0 +1,2 @@
+chr20 4685060 4699370
+chr20 4699493 4701756
diff --git a/needleSam/generateOPRCatalog.R b/needleSam/generateOPRCatalog.R
new file mode 100644
index 0000000..768d56c
--- /dev/null
+++ b/needleSam/generateOPRCatalog.R
@@ -0,0 +1,63 @@
+# generate list of possible OPRs with R consensus sequence
+
+  # we have a common R consensus making use of flexible nucleotide (eg. Y can be C or T)
+  # here: generate possible OPRDs, OPRIs sequences making use of this consensus pattern
+  # then these sequences will be used to align candidate somatic mutation reads in needleSam.R
+
+
+
+# packages ----------------------------------------------------------------
+
+library(ggplot2)
+library(here)
+
+
+
+# prepare the OPR catalog -------------------------------------------------
+
+# 'Rx' consensus which works for R2, R3, R4, R2a, R2c, R3g and more that are likely possible but have not been observed (to my knowledge)
+rxcon <- 'CCYCAKGGYGGYGGYTGGGRDCAR'
+nchar(rxcon)
+
+# consensus for R1 (and leaves 'same' nucleotides flexible; although R1 seems to never or rarely be altered)
+r1con <- 'CCYCAKGGYGGYGGYGGYTGGGRDCAR'
+nchar(r1con)
+
+# essentially always start with R1, then gradually add more Rx
+
+# genotypes will be
+ges <- c(sprintf('OPRD%i', 4:1), # OPRD4 to ORD1
+         'reference',
+         sprintf('OPRI%i', 1:24)) # OPRI1 to OPRI24
+
+# OPR catalog table
+colnms_cato <- c('total_R', 'n_Rx', 'genotype', 'OPR_length', 'pattern', 'consensusseq')
+cato <- as.data.frame(matrix(nrow=length(ges), ncol=length(colnms_cato))) # catalog
+colnames(cato) <- colnms_cato
+
+# fill in OPR catalog
+cato$total_R <- 1:29 # total number of R (reference = 5 R)
+cato$n_Rx <- cato$total_R - 1 # total number of Rx (= anything except R1) (reference = 4 Rx)
+cato$genotype <- ges # genotypes: OPRD4 ...>>>... reference ...>>>... OPRI24
+
+cato$pattern <- sapply(cato$n_Rx, function(nr) {
+  paste0('R1/', paste(rep('Rx/', nr), sep='', collapse=''))
+  })
+
+cato$consensusseq <- sapply(cato$n_Rx, function(nr) {
+  paste0(r1con, paste(rep(rxcon, nr), sep='', collapse=''))
+  })
+
+cato$OPR_length <- nchar(cato$consensusseq)
+
+# as a sanity check
+ggplot(cato, aes(x=n_Rx, y=OPR_length)) +
+  geom_point() +
+  geom_hline(yintercept=seq(27, 27+(4+24)*24, 24))
+# ok
+
+
+
+# write the OPR catalog ---------------------------------------------------
+
+write.csv(cato, here('needleSam', 'OPRConsensusCatalog.csv'), row.names=FALSE)
\ No newline at end of file
diff --git a/needleSam/needleSam.R b/needleSam/needleSam.R
new file mode 100644
index 0000000..4dba69c
--- /dev/null
+++ b/needleSam/needleSam.R
@@ -0,0 +1,2242 @@
+# needle in a SAM
+
+# finding a possible somatic mutation in the reads
+# like a needle in a haystack!
+# needle = somatic mutation / haystack = SAM files
+
+oprlength <- 123
+# threshold used in readSam() to exclude reads that start or terminate in the OPR by CIGAR lengths
+
+
+
+# some common things we may need to change often --------------------------
+hapcol <- 21 # nth field in the SAM file contains the HP tags (looks like HP:i:1; it was 20th then 21th in a newer version of minimap2)
+hapcols <- c('#f1876b', '#4D4D4D', '#B3BDC4') # colours for the haplotypes, it goes 1, 2, 0
+
+
+
+# packages and functions --------------------------------------------------
+
+
+library(here)
+library(stringr)
+library(data.table)
+library(openxlsx)
+library(msa)
+library(ggplot2)
+library(tictoc)
+library(dplyr)
+library(tidyr)
+library(ggpubr)
+library(ggbeeswarm)
+
+# substrEnding = last n characters of a string
+# x = string (or vector of strings); n = last n characters
+substrEnding <- function(x, n){
+  substr(x, nchar(x)-n+1, nchar(x))
+}
+
+# beforeFirstCharacter = given a string, takes everything before first instance of a character
+# eg. beforeFirstCharacter('59060_file.bam', '_') >> '59060'
+beforeFirstCharacter <- function(stri, cha) {
+  as.character(sapply(stri, function(st){
+    substr(st,
+           1,
+           which(strsplit(st, '')[[1]] == cha) [length(which(strsplit(st, '')[[1]] == cha))]-1)
+  }))
+}
+
+###
+
+# getReadlengthCigar() is a small helper function for readSam() below
+  # the goal is to exlude reads which start or terminate in the OPR
+  # these get categorised as reference because their CIGAR are for example 56M
+# However, these reads do not span the whole OPR, so they should be excluded
+  # but excluding by readlength does not work, for example a 103-bp read could be a OPRD1 or a reference read that terminated early in the OPR
+# one solution is to compute the alignment length as told by the CIGAR (only Match/Insertion/Deletion)
+  # this essentially corresponds to the window of hg38 that is covered by the read (as it will include deletions)
+
+# get read length as told by CIGAR
+getReadlengthCigar <- function(read) { # works for one row of a sam file
+  cigar <- read[6]
+  tmp <- str_extract_all(cigar, '[0-9]+[MDI]')[[1]] # all the Match/Deletion/Insertion components of the CIGAR
+  rlci <- sum(as.numeric(unlist(strsplit(tmp, '[A-Z]')))) # rlci = readlength cigar
+  return(rlci)
+}
+
+###
+
+# readSam = reads SAM properly into R
+readSam <- function(sampath) {
+  
+  # check the path actually looks like it leads to a SAM file
+  if(substrEnding(sampath, 4) != '.sam') stop('\t \t \t \t >>> Error: that does not look like the path to a SAM file... \n')
+  
+  # if ok -- import the SAM file
+  # numcols <- max(count.fields(sampath, sep='\t'), na.rm=TRUE) 
+    # above is to count maximum number of columns to be used below as sprintf('col%i', numcols)
+    # works well, but when importing more than one SAM, can have variable number of columns
+    # alternative: set to a large number. It will create NA columns at the end, but at least we know all the SAMs look the same
+    # (can delete the NA columns after rbind all the SAMs, if that is what we want to do)
+  sam <- read.table(file=sampath, fill=TRUE, comment.char='@', sep='\t', na.strings=c('', 'NA'), 
+                    col.names=sprintf('col%i', 1:30))
+  
+  # check what should be the haplotype tag column, can be messy for different reasons
+  hpcol <- 21
+  
+  taghp <- substr(sam[,hpcol], 1, 2) # will take mostly NA or HP and we do not want anything else, hence below
+  cat('\t \t \t \t \t >>> Found', length(which(taghp=='HP')), 'haplotags in the expected column \n')
+  
+  if (length(which(taghp!='HP')) > 0) {
+    cat('\t \t \t \t \t >>> Removed', length(which(taghp!='HP')), 'unexpected tags from the HP column \n')
+    sam[which(taghp!='HP'), hpcol] <- NA # the rows that are not NA or HP, replace them with NA
+    # ! for these rows, it is possible that the HP tag was pushed to the next column
+      # no need to look only at these rows, just look in the next column for any HP
+    taghpnxt <- substr(sam[,hpcol+1], 1, 2)
+    
+    if(length(which(taghpnxt=='HP')) > 0) { # if found any HP tag to salvage from the next column
+      cat('\t \t \t \t \t \t >>> for', length(which(taghpnxt=='HP')), 'of these reads, salvaged the HP tag found in the next column \n')
+      sam[which(taghpnxt=='HP'), hpcol+1] <- sam[which(taghpnxt=='HP'), hpcol+1] # copy it to the HP column
+      
+    # ! Note this is far from an ideal solution, will copy the HP tag and it also means the columns are shifted for these rows
+      # but at the moment I am not using any other tag than HP
+    } 
+  }
+  
+  # check there is no HP tag in other columns before or after HP column
+  for (co in ((hpcol-4):(hpcol+4))[-5]) { # check all 4 columns before HP column, all 4 columns after HP column, except HP column itself
+    tagco <- substr(sam[,co], 1, 2)
+    if(length(which(tagco=='HP')) > 0) {
+      cat('\t \t \t \t \t >>> found ', length(which(tagco=='HP')), ' HP tags in column', co ,'\n')
+    }
+  }
+  
+  # check there is no duplicate read ID
+    # as I removed secondary alignments in filterBam.command it should be good
+  if (sum(duplicated(sam[1])) != 0) {
+    stop('\t \t \t \t \t >>> Error: found', sum(duplicated(sam[1])), 'duplicated read IDs \n')
+  } else {
+    cat('\t \t \t \t \t >>> All read IDs are unique \n')
+  }
+  
+  # exclude reads that start or terminate in the OPR (see explanations for getReadlengthCigar() above)
+  reads2del <- which(apply(sam, 1, getReadlengthCigar) < oprlength)
+  
+  if (length(reads2del) == 0) {
+    cat('\t \t \t \t \t >>> 0 read that start or terminate in the OPR \n')
+  } else {
+    cat('\t \t \t \t \t >>> Deleting', length(reads2del),'reads that start or terminate in the OPR \n')
+    sam <- sam[ - reads2del ,]
+  }
+  
+  # exclude reads that include letters that are not ACGT
+    # I think removing the reads that start/terminate reads above include this, but just to be safe
+  totnotACGT <- sapply(1:nrow(sam), function(ri) { # for each read, count the total of letters that are not ACGT
+    sum( ! strsplit(sam[ri,10], '')[[1]] %in% c('A', 'C', 'G', 'T'))
+  })
+  
+  reads2del <- which(totnotACGT!=0)
+  
+  if (length(reads2del) != 0) {
+    cat('\t \t \t \t \t >>> Deleting', length(reads2del),'reads with non-ACGT nucleotide \n')
+    sam <- sam[ - reads2del ,]
+  }
+  
+  # say final number of reads we are importing
+  cat('\t \t \t \t \t >>> IMPORTING', nrow(sam),'READS \n')
+  
+  return(sam)
+  
+}
+
+
+###
+
+# from CIGAR of one read; get total insertion
+getTotalInsertion <- function(read) {
+  
+  cigar <- read[6]
+  
+  if(length(grep(pattern='I', cigar))!=0) {
+    tmp <- str_extract_all(cigar, '[0-9]+I')[[1]] # extracts the I components; eg. 59I129M32I >> 59I 32I
+    totin <- sum(as.numeric(unlist(strsplit(tmp, '[A-Z]'))))
+    # split at any letter (here will be only I), so example above: 59 32
+    # convert to numeric & sum
+  } else {
+    totin <- 0
+    return(totin)
+  }
+}
+
+###
+
+# from CIGAR of one read; get total deletion
+getTotalDeletion <- function(read) {
+  
+  cigar <- read[6]
+  
+  if(length(grep(pattern='D', cigar))!=0) {
+    tmp <- str_extract_all(cigar, '[0-9]+D')[[1]] # extracts the I components; eg. 59I129M32I >> 59I 32I
+    totdel <- -sum(as.numeric(unlist(strsplit(tmp, '[A-Z]'))))
+    # split at any letter (here will be only I), so example above: 59 32
+    # convert to numeric & sum & turn to negative for deletion
+    return(totdel)
+  } else {
+    totdel <- 0
+    return(totdel)
+  }
+}
+
+###
+
+# counts & frequencies within bins of a continuous variable (each bin = one range of values) per group
+  # e.g. data is number of mismatches for each read
+  # and 2 groups: reference / mutated
+  # for each group; how many reads in bin 0--10 mismatches; how many reads in bin 11-20 etc.
+  # both in counts and frequencies (% of total within each group)
+# Note; bins represent the right boundary, e.g. a case where we have proportions, 0.1 means the bin that contains the data from 0 to 0.1
+  # same in the dataframe it returns
+# example usage/ freqInBins(df = mydata, datcol = 'mismatchpercentage', grpcol = 'treated', bins=seq(0, 1, by=0.1))
+
+freqInBins <- function(df, datcol, grpcol, bins) {
+  
+  ngrps <- length(unique(df[,grpcol])) # number of groups
+  grpnms <- unique(df[,grpcol]) # group names
+  
+  cfpg <- vector(mode='list', length=ngrps) # preallocate list counts + frequencies per group
+  
+  for(i in 1:length(grpnms)) { # loop through groups
+    dat <- as.numeric(df[ which(df[,grpcol] == grpnms[i]) , datcol]) # data for this group (the data we need to bin)
+    cos <- hist(dat, breaks=bins, include.lowest=TRUE, plot=FALSE)$counts # counts
+    fre <- cos / sum(cos) # frequencies
+    
+    gdf <- data.frame(grp=grpnms[i], bin=bins[-1], counts=cos, freq=fre) # small dataframe for this group, two columns counts and frequencies
+    
+    # group column should be called the same as grpcol
+    colnames(gdf)[1] <- grpcol
+    
+    # add it to the list
+    cfpg[[i]] <- gdf
+  }
+  
+  # stick the list together and return it
+  fbing <- as.data.frame(rbindlist(cfpg)) # frequencies in each bin by group
+  return(fbing)
+  
+}
+
+
+
+
+# 1- Import what we need --------------------------------------------------
+
+
+  # 1- Catalog of consensus OPR sequences
+    # OPRConsensusCatalog.csv written by generateOPRCatalog.R
+cato <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='OPRtemplates') # consensus sequence from OPRD4 up to OPRI24
+
+
+  # 2- Meta information of samples
+meta <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='samples')
+    # make the formatting match
+
+      # - >> NA
+meta$SV[which(meta$SV=='-')] <- NA
+
+      # special case: PDG 46345 is 5 OPRI/1 OPRD
+        # make a new column SV2 for it (just for the 1 OPRD)
+meta <- meta %>%
+  mutate(SV2=NA, .after=SV)
+meta[which(meta$sample==46345), 'SV2'] <- unlist(strsplit(meta[which(meta$sample==46345), 'SV'], '/'))[2]
+meta[which(meta$sample==46345), 'SV'] <- unlist(strsplit(meta[which(meta$sample==46345), 'SV'], '/'))[1]
+
+      # 2 OPRD >> OPRD2, etc.
+        # first lapply takes what is after the space = OPRD or OPRI
+        # second lapply takes what is before the space = 2, 1, ...
+meta$SV <- paste0(lapply(strsplit(meta$SV, ' '), function(i) {i[2]}),
+                  lapply(strsplit(meta$SV, ' '), function(i) {i[1]}))
+        # the NA above become 'NANA', put them back to NA
+meta$SV[which(meta$SV=='NANA')] <- NA
+
+        # same for SV2
+meta$SV2 <- paste0(lapply(strsplit(meta$SV2, ' '), function(i) {i[2]}),
+                  lapply(strsplit(meta$SV2, ' '), function(i) {i[1]}))
+meta$SV2[which(meta$SV2=='NANA')] <- NA
+
+      # sample: 52331 >> pdg52331, etc.
+meta$sample <- sprintf('pdg%s', meta$sample)
+
+
+  # 3- Import haplotype meta information
+    # eg. how many SNVs available for haplotagging
+hpmeta <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='haplotypephasing')
+    # sample: 52331 >> pdg52331, etc.
+hpmeta$sample <- sprintf('pdg%s', hpmeta$sample)
+
+  # and sequencing_summary (mainly for coverage)
+seqmeta <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='sequencing_summary')
+# sample: 52331 >> pdg52331, etc.
+seqmeta$sample <- sprintf('pdg%s', seqmeta$sample)
+
+
+  # 4- Import the SAMs
+    # preallocate list, each element = the SAM of one sample
+    # 2 folders where to find the SAMs:
+      # one for SAMs that could not be haplotagged
+      # the other for SAMs that could be haplotagged
+samfoldnotag <- here('haplotypephasing/bam_notag/OPRtrim_sam') # folder of no-tag SAMs
+samfoldtag <- here('haplotypephasing/bam_haplotagged/OPRtrim_sam') # folder of tagged SAMs
+
+samnms <- c(list.files(samfoldnotag), list.files(samfoldtag)) # names of all the SAMs
+    # check expected number
+length(samnms) == nrow(meta)
+
+sams <- vector(mode='list', length=length(samnms)) # preallocate list that will store all the SAMs
+pdgs <- sprintf('pdg%s', beforeFirstCharacter(samnms, '_')) # gives pdg58398 etc.
+names(sams) <- pdgs # put the pdg58398 etc. as names of the elements in the list
+
+    # get also pdgs of without-haplotag / with-haplotag, will be useful later
+pdgs_notag <- sprintf('pdg%s', beforeFirstCharacter(list.files(samfoldnotag), '_'))
+pdgs_tag <- sprintf('pdg%s', beforeFirstCharacter(list.files(samfoldtag), '_'))
+
+    # gets the full paths where to find each SAM
+sampaths <- c(list.files(samfoldnotag, full.names=TRUE),
+              list.files(samfoldtag, full.names=TRUE))
+
+    # fill in the list using function readSam from above
+for (s in 1:length(sams)) {
+  cat('\t \t \t \t >>>', s, '/', length(sams), 'Reading SAM of sample', names(sams)[s], '\n')
+  sams[[s]] <- readSam(sampaths[s])
+}
+
+
+
+
+# 2- Calculate total insertion/deletion of each read ----------------------
+# from the CIGAR; eg. 3I 2D 100M 3I 5D >> insertion = 6 bp // deletion = 5 bp
+
+  # readSam() should have removed anything funky in HP column, but check before running below just to be sure
+hpcol <- 21
+sum(unlist(lapply(sams, function(sa) {
+  which( substr(as.character(sa[,hpcol]), 1, 2) != 'HP') # any elements in all the HP columns that is not NA or does not start with HP
+}))) == 0
+
+  # preallocate list of indel dataframes
+  # same length as sams
+ies <- vector(mode='list', length=length(sams)) # ies for Insertion/dEletion dataframeS
+names(ies) <- pdgs # put the pdg58398 etc. as names of the elements in the list
+
+  # each element in the list will be a dataframe;
+    # column1 = sample
+    # column2 = haplotype (from whatshap haplotag)
+    # column2 = read name
+    # column3 = read sequence
+    # column3 = total insertion (in bp, as per CIGAR)
+    # column4 = total deletion (in bp, as per CIGAR)
+
+for (i in 1:length(ies)) { # for each element in the list (each sample)
+  
+  cat('\t \t \t \t >>> Format & parsing CIGARs for SAM', i, '/', length(ies), '\n')
+  
+  # preallocate the dataframe
+  colnms <- c('sample', 'haplotype', 'hapqual', 'read', 'samflag', 'seq', 'ins', 'del')
+  tmp <- as.data.frame(matrix(nrow=nrow(sams[[i]]), ncol=length(colnms)))
+  colnames(tmp) <- colnms
+  
+  # fill in sample name
+  tmp$sample <- rep(names(ies)[i], nrow(sams[[i]]))
+  
+  # fill in haplotype
+  tmp$haplotype <- as.numeric(substr(sams[[i]][,hpcol], 6, 6)) # simplifies the haplotype tag, from HP:i:1 / HP:i:2 to 1 / 2; will return NA if no present
+  
+  # fill in haplotype quality
+  tmp$hapqual <- as.numeric(substr(sams[[i]][,hpcol+1], 6, 999)) # simplifies the haplotype quality, from eg. PC:i:150 to 150
+  
+  # fill in read names
+  tmp$read <- sams[[i]][,1]
+  
+  # fill in sam flags
+  tmp$samflag <- sams[[i]][,2]
+  
+  # fill in read sequences
+  tmp$seq <- sams[[i]][,10]
+  
+  # fill in total insertions
+  tmp$ins <- apply(sams[[i]], 1, getTotalInsertion)
+  
+  # fill in total deletions
+  tmp$del <- apply(sams[[i]], 1, getTotalDeletion)
+  
+  # place that sample's dataframe in the list
+  ies[[i]] <- tmp
+}
+
+# turn the ies list into one big dataframe
+ied <- as.data.frame(rbindlist(ies)) # for Insertion/dEletion Dataframe
+
+# make sure haplotype is stored as factor
+ied$haplotype <- factor(ied$haplotype, levels=c(1, 2, 0))
+
+# check no duplicated read ID after pooling all samples
+sum(duplicated(ied$read))
+
+
+
+
+# 4- Check haplotype tags -------------------------------------------------
+
+# check a few things about haplotags
+# sample that were not phased should haplotype all NA
+subset(ied,
+       sample %in% pdgs_notag &
+         !is.na(haplotype))
+
+# haplotypes can only be NA, 1, 2
+subset(ied,
+       (!is.na(haplotype) & ! haplotype %in% c(1,2))) # any haplotype not NA and not 1 or 2
+
+# I think will be easier if we replace haplotype NA (i.e. unassigned read) by 0
+ied$haplotype[which(is.na(ied$haplotype))] <- 0
+
+# check no more NA in haplotype
+sum(is.na(ied$haplotype))
+
+# Are roughly 50% of the reads assigned to each haplotype?
+  # for each sample, plot the number of reads / proportion of reads (barplot) from haplotype0 (unassigned), haplotype1, haplotype2
+
+hco <- ied %>% # hco for haplotype counts
+  group_by(sample, haplotype) %>%
+  tally(name='nreads')
+
+# add total number of reads in each sample
+tmp <- ied %>%
+  count(sample, name='totalsamplereads')
+
+hco <- left_join(hco, tmp, by='sample')
+
+# weirdly this seems to be the easiest way to order hco by pdgs
+hco <- left_join(data.frame(sample=pdgs), hco, by='sample')
+
+# normalise the read counts per haplotype
+hco <- hco %>%
+  mutate(haplotype_pro=nreads/totalsamplereads) %>% # haplotype proportion, i.e. proportion of reads assigned to haplotype 1, 2, or unassigned (0)
+  mutate(sam_hap=paste(sample, haplotype, sep='_'), .after=haplotype) # add a unique sample_haplotype column, easier for plotting
+
+# put levels of sample and sam_hap in right order for plotting
+hco$sample <- factor(hco$sample, levels=unique(hco$sample))
+hco$sam_hap <- factor(hco$sam_hap, levels=hco$sam_hap)
+hco$haplotype <- factor(hco$haplotype, levels=c(1, 2, 0))
+
+# haplotype assignments as stacked barplot, actual number of reads (so height = OPR coverage)
+haploReads <- ggplot(hco, aes(x=sample, y=nreads, fill=haplotype)) +
+  geom_bar(stat='identity') +
+  scale_fill_manual(values=hapcols) +
+  theme_minimal() +
+  theme(
+    axis.text.x=element_text(angle=90),
+    axis.title.x=element_blank()
+  ) +
+  ylab('number of reads')
+haploReads
+
+# haplotype assignments as stacked barplot, proportion of coverage (so height is 1 for all samples)
+
+# I prefer samples without haplotag at the top
+
+# add column tag or no tag
+hco$htag <- TRUE
+hco[which(hco$sample %in% pdgs_notag) , 'htag'] <- FALSE
+
+hco <- hco %>%
+  arrange(desc(htag))
+
+hco$sample <- factor(hco$sample, levels=unique(hco$sample))
+
+haploPro <- ggplot(hco, aes(x=sample, y=haplotype_pro, fill=haplotype)) +
+  coord_flip() +
+  geom_bar(stat='identity') +
+  scale_fill_manual(values=hapcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    axis.title.x=element_blank(),
+    axis.text.x=element_text(),
+    axis.title.y=element_blank(),
+    axis.text.y=element_text()
+  ) +
+  ylab('proportion of reads')
+haploPro
+
+ggsave(here('haplotypephasing', 'haploReads.pdf'), haploReads,width=200, height=100, units='mm')
+ggsave(here('haplotypephasing', 'haploPro.pdf'), haploPro,width=200, height=100, units='mm')
+
+
+# Why unassigned reads?
+  # samples which did not have at least one heterozygous SNVs cannot be haplotagged, that is expected
+  # but some samples have up to ~ 15% unassigned reads (i.e. haplotype0)
+# worst case: PDG 55048 ~ 15% haplotype0
+    # looking carefully at the reads in IGV, whatshap seems to do a good job, even considering possible small shifts in the alignment
+    # there is only one usable SNV for PDG 55048, high unassigned % is mostly reads which do not cover the SNV
+    # I think nothing unexpected
+# does proportion of unassigned reads correlate strongly with how many SNVs available?
+
+# add haplotype phasing meta info
+  # keep all rows from hco (duplicate rows from hpmeta), so left_join
+hco <- left_join(x=hco, y=hpmeta)
+
+# add also the cohort of each sample
+hco <- left_join(x=hco, y=select(meta, sample, prion_disease))
+
+
+hcoun <- subset(hco, ! sample %in% pdgs_notag & haplotype==0) # keep only proportion of unassigned reads, from haplotagged samples
+
+# ! any samples haplotagged but not there: their proportion of unassigned reads is 0, should add them to the plot
+  # slightly clumsy, but will take one row for each (which will correspond to haplotype 1 or 2) and replace the few cells that need to be changed
+rows2add <- hco[which(hco$sample == pdgs_tag[which(! pdgs_tag %in% hcoun$sample)]) , ]
+rows2add$haplotype <- 0
+rows2add$sam_hap <- paste(rows2add$sample, rows2add$haplotype, sep='_')
+rows2add$nreads <- 0
+rows2add$haplotype_pro <- 0
+# rest is correct as dependent on the sample, not the haplotype
+hcoun <- rbind(hcoun, rows2add)
+
+hapUnassigned <- ggplot(hcoun, aes(x=genebody_heterozygousSNVs, y=haplotype_pro, colour=prion_disease)) +
+  geom_quasirandom(width=0.3) +
+  theme_minimal() +
+  theme(
+    panel.grid.minor.x=element_blank()
+  ) +
+  scale_x_continuous(breaks=1:16) +
+  xlab('informative SNVs') + ylab('proportion unassigned reads')
+hapUnassigned
+ggsave(here('haplotypephasing/hapUnassigned_v4.pdf'), hapUnassigned,width=200, height=100, units='mm')
+cor(hcoun$genebody_heterozygousSNVs, hcoun$haplotype_pro, method='pearson')
+cor(hcoun$genebody_heterozygousSNVs, hcoun$haplotype_pro, method='spearman')
+
+
+
+
+# 5- Plot insertions/deletions for each sample ----------------------------
+
+# will then assign each read to a most likely OPR based on total insertion/deletion
+# should leave some space for nanopore noise
+  # eg. an OPRI often appears as a 23-bp insertion
+# here: take the opportunity to look how much space we should leave
+
+  # tentative thresholds
+lownoise <- 6
+highnoise <- 3
+    # eg. for OPRI1: ok from 24 - lownoise up to 24 + highnoise
+    # for example, if lownoise = 6 // highnoise = 3; considers potential OPRI1 anything from insertion of 18 bp to insertion of 27 bp
+    # makes sense for lownoise to be larger, as there is always a drag towards lower lengths
+    # (or in other words, nanopore makes more deletions errors than insertions)
+
+# plot an insertion + a deletion histogram for each sample
+# preallocate list to store the plots
+# ! one list for all deletion plots and one list for all insertion plots
+ips <- vector(mode='list', length=length(pdgs)) # Insertions PlotS
+names(ips) <- pdgs
+
+dps <- vector(mode='list', length=length(pdgs)) # Deletions PlotS
+names(dps) <- pdgs
+
+# make a table of target insertion/deletion lengths
+  # eg. OPRD1 : deletion -24, etc.
+# will do from OPRD4 up to OPRI24, so 4 OPRD + reference + 24 OPRI
+tarl <- as.data.frame(matrix(nrow=29, ncol=2)) # target lengths
+colnames(tarl) <- c('oprg', 'targetindel') # OPR genotype // target indel length
+tarl$oprg <- c(sprintf('OPRD%i', 4:1) , 'reference' , sprintf('OPRI%i', 1:24))
+tarl$targetindel <- c( seq(-24*4, -24*1, 24) , 0 , seq(24*1, 24*24, 24)) # (one OPR = 24 bp)
+
+# first; DELETION plots
+for (s in 1:length(dps)) {
+  
+  # PDG we are plotting
+  pd <- names(dps)[s]
+  
+  # get its OPR variant (if any)
+  sva <- as.character(subset(meta, sample==pd, SV))
+  
+  # and from it the target length
+  target <- as.numeric(subset(tarl, oprg==sva, targetindel))
+    # eg. PDG 1906 >> OPRD2 >> -48 bp
+  
+  # for that sample, count the reads with each deletion length
+    # can also do it automatically with geom_histogram(), but I found it leave a bar when count = 0 if computed before, which I prefer
+  tmp <- ied %>%
+    subset(sample==pd) %>%
+    group_by(del, haplotype) %>%
+    tally(name='nreads') %>%
+    mutate(freq=nreads/nrow(subset(ied, sample==pd))) 
+      # above: compute frequency by dividing number of reads with each insertion length by total number of reads from that sample
+  
+  
+  # if sample has a OPRD: version with marker lines
+    # ! need to specify manually if PDG 46345 = compound homozygous 1 OPRD / 5 OPRI
+  if (pd =='pdg46345') {
+    
+    target <- -24
+    delp <- ggplot(tmp, aes(x=del, y=freq ,colour=haplotype)) +
+      geom_col(fill='white', alpha=0.0, position='identity') +
+      scale_color_manual(values=hapcols) +
+      coord_cartesian(ylim=c(0,0.2), xlim=c(-250,0)) +
+      geom_vline(xintercept=target) +
+      geom_vline(xintercept=c(target-lownoise, target+lownoise), linetype=2) +
+      theme_minimal() +
+      theme(
+        legend.position='none'
+      ) +
+      xlab('deletion') + ylab('frequency') +
+      ggtitle(pd)
+
+  } else if (!is.na(target) & target<0) { # if there is a target (i.e. sample has an OPRD) >> add the marker lines
+    delp <- ggplot(tmp, aes(x=del, y=freq ,colour=haplotype)) +
+      geom_col(fill='white', alpha=0.0, position='identity') +
+      scale_color_manual(values=hapcols) +
+      coord_cartesian(ylim=c(0,0.2), xlim=c(-250,0)) +
+      geom_vline(xintercept=target) +
+      geom_vline(xintercept=c(target-lownoise, target+lownoise), linetype=2) +
+      theme_minimal() +
+      theme(
+        legend.position='none'
+      ) +
+      xlab('deletion') + ylab('frequency') +
+      ggtitle(pd)
+  } else { # if not (i.e. reference or OPRI), skip the marker lines
+    delp <- ggplot(tmp, aes(x=del, y=freq ,colour=haplotype)) +
+      geom_col(fill='white', alpha=0.0, position='identity') +
+      scale_color_manual(values=hapcols) +
+      coord_cartesian(ylim=c(0,0.2), xlim=c(-250,0)) +
+      theme_minimal() +
+      theme(
+        legend.position='none'
+      ) +
+      xlab('deletion') + ylab('frequency') +
+      ggtitle(pd)
+  }
+  
+  # in any case, add deletion plot to the list
+  dps[[s]] <- delp
+  
+}
+
+
+# second; INSERTION plots
+for (s in 1:length(ips)) {
+  
+  # PDG we are plotting
+  pd <- names(ips)[s]
+  
+  # get its OPR variant (if any)
+  sva <- as.character(subset(meta, sample==pd, SV))
+  
+  # and from it the target length
+  target <- as.numeric(subset(tarl, oprg==sva, targetindel))
+  # eg. PDG 1906 >> OPRD2 >> -48 bp
+  
+  # for that sample, count the reads with each insertion length
+  # can also do it automatically with geom_histogram(), but I found it leave a bar when count = 0 if computed before, which I prefer
+  tmp <- ied %>%
+    subset(sample==pd) %>%
+    group_by(ins, haplotype) %>%
+    tally(name='nreads') %>%
+    mutate(freq=nreads/nrow(subset(ied, sample==pd))) 
+  # above: compute frequency by dividing number of reads with each insertion length by total number of reads from that sample
+  
+  # if sample has a OPRI: version with marker lines
+  if (!is.na(target) & target>0) {
+    insp <- ggplot(tmp, aes(x=ins, y=freq ,colour=haplotype)) +
+      geom_col(fill='white', alpha=0.0, position='identity') +
+      scale_color_manual(values=hapcols) +
+      coord_cartesian(ylim=c(0,0.2), xlim=c(0,250)) +
+      geom_vline(xintercept=target) +
+      geom_vline(xintercept=c(target-lownoise, target+lownoise), linetype=2) +
+      theme_minimal() +
+      theme(
+        legend.position='none'
+      ) +
+      xlab('insertion') + ylab('frequency') +
+      ggtitle(pd)
+  } else { # if not (i.e. reference or OPRI), skip the marker lines
+    insp <- ggplot(tmp, aes(x=ins, y=freq ,colour=haplotype)) +
+      geom_col(fill='white', alpha=0.0, position='identity') +
+      scale_color_manual(values=hapcols) +
+      coord_cartesian(ylim=c(0,0.2), xlim=c(0,250)) +
+      theme_minimal() +
+      theme(
+        legend.position='none'
+      ) +
+      xlab('insertion') + ylab('frequency') +
+      ggtitle(pd)
+  }
+  
+  # in any case, add deletion plot to the list
+  ips[[s]] <- insp
+  
+}
+
+# arrange each list of plots in a grid of one column and export
+  # grid of deletion plots
+
+delgrid <- ggarrange(plotlist=dps, ncol=1, nrow=length(dps))
+ggsave(filename=here('needleSam', 'deletionGrid.pdf'), plot=delgrid, height=1000, width=100, units='mm', useDingbats=FALSE)
+
+insgrid <- ggarrange(plotlist=ips, ncol=1, nrow=length(ips))
+ggsave(filename=here('needleSam', 'insertionGrid.pdf'), plot=insgrid, height=1000, width=100, units='mm', useDingbats=FALSE)
+
+# Note there are 3 options for the histograms above:
+  # position='stack' (default): each bar is a stacked barplot with proportion of each haplotype
+    # in a sense it is the best way to plot all the information, but intuitively when I see the tip of the bars in orange (for example),
+      # I imagine that there is another plot below it, which changes completely the interpretation
+  # position='identity': histograms for each haplotypes are effectively separated, overlayed
+  # position='dodge': same but the bars are a little bit shifted so there is less overlay
+# I think 'identity' is the less likely to create confusion (even though it is the one with the most overlay)
+
+
+
+
+# 6- Assign each read to a most likely OPR --------------------------------
+# eg. 23-bp insertion >> could be a OPRI
+# below also allows OPRI + OPRD in same read, although that is unlikely
+
+# create the possible intervals for OPRDs
+delInts <- vector(mode='list', length=4) # OPR deletion intervals
+# minimum = 4 OPRD = - 24 * 4
+# maximum = 1 OPRD = -24 * 1
+delOpr <- seq(-24*4, -24*1, 24)
+for (o in 1:length(delOpr)) {
+  delInts[[o]] <- (delOpr[o] - lownoise) : (delOpr[o] + highnoise)
+}
+names(delInts) <- sprintf('OPRD%i', 4:1)
+
+# create the possible intervals for OPRIs
+inInts <- vector(mode='list', length=24) # OPR insertion intervals
+# minimum = 1 OPRI = 24 * 1
+# maximum = 24 OPRI = 24 * 24
+inOpr <- seq(24*1, 24*24, 24)
+for (o in 1:length(inOpr)) {
+  inInts[[o]] <- (inOpr[o] - lownoise) : (inOpr[o] + highnoise)
+}
+names(inInts) <- sprintf('OPRI%i', 1:24)
+
+# create the possible interval for reference
+  # reference interval is a bit different as will overlap deletions/insertions
+  # i.e. centered around 0, - lownoise / + highnoise
+refInt <- (0-lownoise):(0+highnoise)
+
+# given one total insertion (in bp), check if belongs to one OPRI interval
+  # eg. +23 bp >> return OPRI1
+checkOPRI <- function(totin) {
+  
+  check <- unlist(lapply(inInts, function(int){totin %in% int}))
+  
+  if (length(which(check==TRUE))==0){
+    return(NA)
+  } else(
+    return(names(which(check) == TRUE))
+  )
+}
+
+# given one total deletion (in bp), check if belongs to one OPRD interval
+# eg. -23 bp >> return OPRD1
+checkOPRD <- function(totdel) {
+  
+  check <- unlist(lapply(delInts, function(int){totdel %in% int}))
+  
+  if (length(which(check==TRUE))==0){
+    return(NA)
+  } else(
+    return(names(which(check) == TRUE))
+  )
+}
+
+# given both total deletion / total insertion, check if it should be categorised as reference
+  # eg. -3 bp >> return reference
+  # ! flipping around here: functions above are meant to be applied to all column; checkRef is meant to be apply to each row of ied
+  # (because need both totdel & totins for each read)
+checkRef <- function(read) { # read = row of ied
+  
+  totdel <- read['del']
+  totins <- read['ins']
+  
+  # del and ins should be both in interval
+    # eg. -5 del / +1 ins >> reference
+    # eg. -12 del / +2 ins >> NA (in this case read will not have a category as deletion too small for an OPRD1)
+  check <- (totdel %in% refInt) & (totdel %in% refInt) # check will be Logical, are both totdel / totins in Ref interval
+  
+  if (check) {
+    read['del'] <- 'reference'
+    read['ins'] <- 'reference'
+    return(read)
+  } else {
+    return(read)
+  }
+}
+
+# now assign reads to possible OPR mutations
+ied$opri <- sapply(ied$ins, checkOPRI)
+ied$oprd <- sapply(ied$del, checkOPRD)
+
+# put reads that were not assigned to an OPRI and/or an OPRD in either 'reference' or 'unassigned'
+  # 'reference' = total insertion & total deletion are in reference length interval (i.e. no or small insertion + no or small deletion)
+  # 'unassigned' = none of total insertion or total deletion are in any interval (so anything falling between the cracks, eg. a 10 bp insertion)
+# create a new column for these categories (prevents from plotting same reads in different plots below)
+ied$othercategory <- NA
+ied[which((ied$ins %in% refInt) & (ied$del %in% refInt)) , 'othercategory'] <- 'reference'
+# explicitly call all the leftover reads 'unassigned'; leftover = 
+  # do not have an OPRD
+  # do not have an OPRI
+  # are NA for othercategory (i.e. were not categorised as reference above)
+ied[which(is.na(ied$opri) & is.na(ied$oprd) & is.na(ied$othercategory)) , 'othercategory'] <- 'unassigned'
+
+# Note1; reads categorised as 'reference' or 'unassigned' will always be OPRI NA / OPRD NA
+# Note2; it is a bit of a pain to keep 3 columns when really each read should be in a single category
+  # (i.e. has an OPRI OR has OPRD OR is reference OR is unassigned)
+  # the only advantage is to leave the opportunity for a single read to be both OPRD & OPRI
+    # which are (hopefully) cases which will go away when aligning later
+
+# Check
+  # that means all reads should be in at least one category
+  # i.e. all three columns cannot be NA at once
+subset(ied, is.na(opri) & is.na(oprd) & is.na(othercategory))
+
+
+
+
+# 7- Positive control for haplotype phasing -------------------------------
+# eg. sample 8 OPRI: ~ 100% reads 8 OPRI should be one haplotype / ~ 100% reference should be the other haplotype
+# similar plot as proportion of reads from each haplotype above, but one column for each sample / genotype
+
+# make sure OPRI/OPRD/reference in order that makes sense
+ied$opri <- factor(ied$opri, levels=sprintf('OPRI%i', 1:24))
+ied$oprd <- factor(ied$oprd, levels=sprintf('OPRD%i', 4:1))
+ied$othercategory <- factor(ied$othercategory, levels=c('reference', 'unassigned'))
+
+# make haplotags easier to handle by making (for cases with a heterozygous OPR mutation):
+  # mutated haplotype always haplotype1
+  # reference haplotype always haplotype2
+  # for PDG 46345: haplotype1 = 5 OPRI (as most important, probably causal mutation) / haplotype2 = 1 ORPD
+# for samples that could not be haplotagged: all haplotype0 / for samples that are OPR reference/OPR reference (eg. control): leave it as it
+
+# pdgsOPR = PDGs of sample with an OPR mutation (i.e. those for which we may need to swap haplotags)
+pdgsOPR <- as.character(unlist(subset(meta, !is.na(SV), sample)))
+
+  # ! do not pick PDGs we do not have here
+  # (only applies if running the script on subset of the samples)
+pdgsOPR <- intersect(pdgsOPR, pdgs)
+
+# first set the whole haplotype column to integer (harder to change stuff is stays factor)
+ied$haplotype <- as.integer(as.character(ied$haplotype))
+
+for (p in 1:length(pdgsOPR)) {
+  
+  # check the PDG is not one we could not haplotag
+  if (pdgsOPR[p] %in% pdgs_notag) {
+    next # in which case, do nothing and try next PDG
+  }
+  
+  osan <- as.character(subset(meta, sample==pdgsOPR[p], SV)) # take the Sanger genotype of that sample
+  
+  if (substr(osan, 1, 4) == 'OPRD') { # if Sanger genotype is a deletion
+    
+    # which haplotype matches the Sanger genotype?
+    hap1 <- nrow(subset(ied, sample==pdgsOPR[p] & haplotype==1 & oprd==osan)) # how many reads from haplotype1 match the OPRD
+    hap2 <- nrow(subset(ied, sample==pdgsOPR[p] & haplotype==2 & oprd==osan)) # how many reads from haplotype2 match the OPRD
+    
+    if (hap1 > hap2) { # if haplotype1 is mutated, no need to change anything
+      cat('\t \t \t \t >>>', pdgsOPR[p], ': haplotype 1 is mutated one \n')
+      next
+    }
+    
+    else if (hap1 < hap2) {
+      cat('\t \t \t \t >>>', pdgsOPR[p], ': haplotype 2 is mutated one \n')
+      cat('\t \t \t \t \t swapping haplotags... \n')
+      
+      ied[which(ied$sample==pdgsOPR[p] & ied$haplotype==2), 'haplotype'] <- 99 # which() = these are the rows to change
+      # ! do not 'cheat' and swap haplotags that are probably in the wrong haplotype
+      # or in other words: do not look at the reads' OPR above, only at their tag
+      # temporarily set these rows to 99
+      
+      # temporary set rows of the other haplotype to 9
+      ied[which(ied$sample==pdgsOPR[p] & ied$haplotype==1), 'haplotype'] <- 9
+      
+      # now swap them
+      ied[which(ied$sample==pdgsOPR[p] & ied$haplotype==99), 'haplotype'] <- 1 # haplotype 2 becomes haplotype 1
+      ied[which(ied$sample==pdgsOPR[p] & ied$haplotype==9), 'haplotype'] <- 2 # and vice-versa
+      
+    } else stop('\t \t \t \t >>> Error: cannot find which haplotype matches the insertion for ', pdgsOPR[p], '\n')
+  }
+  
+  ###
+  
+  if (substr(osan, 1, 4) == 'OPRI') { # if Sanger genotype is an insertion
+    # same as above, just OPRI now
+    
+    # which haplotype matches best the Sanger genotype?
+    hap1 <- nrow(subset(ied, sample==pdgsOPR[p] & haplotype==1 & opri==osan)) # how many reads from haplotype1 match the OPRI
+    hap2 <- nrow(subset(ied, sample==pdgsOPR[p] & haplotype==2 & opri==osan)) # how many reads from haplotype2 match the OPRI
+    
+    if (hap1 > hap2) { # if haplotype1 is most likely the mutated one, no need to swap
+      cat('\t \t \t \t >>>', pdgsOPR[p], ': haplotype 1 is mutated one \n')
+      next
+    }
+    
+    else if (hap1 < hap2) { # if haplotype2 is most likely the mutated one, no need to swap
+      cat('\t \t \t \t >>>', pdgsOPR[p], ': haplotype 2 is mutated one \n')
+      
+      
+      ied[which(ied$sample==pdgsOPR[p] & ied$haplotype==2), 'haplotype'] <- 99 # which() = these are the rows to change
+      # ! do not 'cheat' and swap haplotags that are probably in the wrong haplotype
+      # or in other words: do not look at the reads' OPR above, only at their tag
+      # temporarily set these rows to 99
+      
+      # temporary set rows of the other haplotype to 9
+      ied[which(ied$sample==pdgsOPR[p] & ied$haplotype==1), 'haplotype'] <- 9
+      
+      # now swap them
+      cat('\t \t \t \t \t swapping haplotags... \n')
+      ied[which(ied$sample==pdgsOPR[p] & ied$haplotype==99), 'haplotype'] <- 1 # haplotype 2 becomes haplotype 1
+      ied[which(ied$sample==pdgsOPR[p] & ied$haplotype==9), 'haplotype'] <- 2 # and vice-versa
+      
+    } else stop('\t \t \t \t >>> Error: cannot find which haplotype matches the insertion for ', pdgsOPR[p], '\n')
+  }
+}
+
+# turn haplotype column back into factor
+ied$haplotype <- as.factor(ied$haplotype)
+# in the order I want
+ied$haplotype <- factor(ied$haplotype, levels=c(1, 2, 0))
+
+
+# plot proportion of haplotype from each sample again
+# between ### : same as above
+###
+
+hco <- ied %>% # hco for haplotype counts
+  group_by(sample, haplotype) %>%
+  tally(name='nreads')
+
+# add total number of reads in each sample
+tmp <- ied %>%
+  count(sample, name='totalsamplereads')
+
+hco <- left_join(hco, tmp, by='sample')
+
+# order hco by pdgs
+hco <- left_join(data.frame(sample=pdgs), hco, by='sample')
+
+# normalise the read counts per haplotype
+hco <- hco %>%
+  mutate(haplotype_pro=nreads/totalsamplereads) %>% # haplotype proportion, i.e. proportion of reads assigned to haplotype 1, 2, or unassigned (0)
+  mutate(sam_hap=paste(sample, haplotype, sep='_'), .after=haplotype) # add a unique sample_haplotype column, easier for plotting
+
+# put levels of sample and sam_hap in right order for plotting
+hco$sample <- factor(hco$sample, levels=unique(hco$sample))
+hco$sam_hap <- factor(hco$sam_hap, levels=hco$sam_hap)
+hco$haplotype <- factor(hco$haplotype, levels=c(1, 2, 0))
+
+# haplotype assignments as stacked barplot, actual number of reads (so height = OPR coverage)
+haploReads <- ggplot(hco, aes(x=sample, y=nreads, fill=haplotype)) +
+  geom_bar(stat='identity') +
+  scale_fill_manual(values=hapcols) +
+  theme_minimal() +
+  theme(
+    axis.text.x=element_text(angle=90),
+    axis.title.x=element_blank()
+  ) +
+  ylab('number of reads')
+haploReads
+
+# haplotype assignments as stacked barplot, proportion of coverage (so height is 1 for all samples)
+
+# I think best is same order as table all the way
+
+# alternatively, samples without haplotag at the top
+# add column tag or no tag
+  # hco$htag <- TRUE
+  # hco[which(hco$sample %in% pdgs_notag) , 'htag'] <- FALSE
+  # hco <- hco %>%
+  #   arrange(desc(htag))
+  # hco$sample <- factor(hco$sample, levels=unique(hco$sample))
+
+# same order as meta
+hco <- left_join(data.frame(sample=meta$sample), hco, by='sample')
+hco <- hco[seq(dim(hco)[1],1),]
+hco$sample <- factor(hco$sample, levels=unique(hco$sample))
+
+# sample names for y axis
+splsyaxis <- paste0('#', substr(unique(hco$sample), 4, 99))
+
+haploPro <- ggplot(hco, aes(x=sample, y=haplotype_pro, fill=haplotype)) +
+  geom_bar(stat='identity') +
+  coord_flip() +
+  scale_fill_manual(values=hapcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    axis.title.x=element_text(size=9, margin = margin(t=2, r=0, b=0, l=0)),
+    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.title.y=element_blank(),
+    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0), hjust=0)
+  ) +
+  ylab('proportion of reads') +
+  scale_x_discrete(labels=splsyaxis)
+haploPro
+
+ggsave(here('haplotypephasing/haploReads2.pdf'), haploReads,width=200, height=100, units='mm')
+ggsave(here('haplotypephasing/haploPro2.pdf'), haploPro, width=65, height=90, units='mm')
+
+###
+
+# plot OPRD / OPRI stacked barplots for each sample (fill = haplotype)
+  # ! reads that have both an OPRI & an OPRD will be counted twice (i.e. once in OPRI plot + once in OPRD plot), but they are pretty rare
+
+dir.create(here('needleSam', 'OPRHaploPlots'))
+
+for (s in 1:length(pdgs)) {
+  
+  cat('\t \t \t \t >>> Plotting OPR by haplotype for sample ', s, '/', length(pdgs), '\n')
+  
+  # common parameters (easier for debugging)
+  yaxislimit <- 1.0
+  barwidth <- 1
+  
+  # get the sample we are plotting
+  pd <- pdgs[s]
+  
+  # create a small list for the 4 plots (OPRD / reference / OPRI / unassigned)
+  ohapPlots <- vector(mode='list', length=4)
+  names(ohapPlots) <- c('OPRD', 'reference', 'OPRI', 'unassigned')
+  
+  # OPRD plot
+    # first prepare the data
+  tmp <- ied %>%
+    subset(sample==pd) %>%
+    group_by(oprd, haplotype) %>%
+    tally(name='nreads') %>%
+    mutate(freq=nreads/nrow(subset(ied, sample==pd))) # frequency = proportion to *total* number of reads of that sample
+  
+    # plot
+  ohapPlots$OPRD <- ggplot(subset(tmp, !is.na(oprd)), aes(x=oprd, y=freq, fill=haplotype)) +
+    geom_col(width=barwidth) +
+    scale_fill_manual(values=hapcols) +
+    theme_minimal() +
+    theme(
+      axis.title.x=element_blank()
+    ) +
+    coord_cartesian(ylim=c(0, yaxislimit))
+  
+  # reference plot
+    # first prepare the data
+  tmp <- ied %>%
+    subset(sample==pd) %>%
+    group_by(othercategory, haplotype) %>%
+    tally(name='nreads') %>%
+    mutate(freq=nreads/nrow(subset(ied, sample==pd))) # frequency = proportion to *total* number of reads of that sample
+  
+    # plot
+  ohapPlots$reference <- ggplot(subset(tmp, othercategory=='reference'), aes(x=othercategory, y=freq, fill=haplotype)) +
+    geom_col(width=barwidth) +
+    scale_fill_manual(values=hapcols) +
+    theme_minimal() +
+    theme(
+      axis.title.x=element_blank()
+    ) +
+    coord_cartesian(ylim=c(0, yaxislimit))
+  
+  # OPRI plot
+  # first prepare the data
+  tmp <- ied %>%
+    subset(sample==pd) %>%
+    group_by(opri, haplotype) %>%
+    tally(name='nreads') %>%
+    mutate(freq=nreads/nrow(subset(ied, sample==pd))) # frequency = proportion to *total* number of reads of that sample
+  
+  # plot
+  ohapPlots$OPRI <- ggplot(subset(tmp, !is.na(opri)), aes(x=opri, y=freq, fill=haplotype)) +
+    geom_col(width=barwidth) +
+    scale_fill_manual(values=hapcols) +
+    theme_minimal() +
+    theme(
+      axis.title.x=element_blank()
+    ) +
+    coord_cartesian(ylim=c(0, yaxislimit))
+  
+  # reference plot
+    # first prepare the data
+  tmp <- ied %>%
+    subset(sample==pd) %>%
+    group_by(othercategory, haplotype) %>%
+    tally(name='nreads') %>%
+    mutate(freq=nreads/nrow(subset(ied, sample==pd))) # frequency = proportion to *total* number of reads of that sample
+  
+    # plot
+  ohapPlots$unassigned <- ggplot(subset(tmp, othercategory=='unassigned'), aes(x=othercategory, y=freq, fill=haplotype)) +
+    geom_col(width=barwidth) +
+    scale_fill_manual(values=hapcols) +
+    theme_minimal() +
+    theme(
+      axis.title.x=element_blank()
+    ) +
+    coord_cartesian(ylim=c(0, yaxislimit))
+  
+  # prepare the grid (each plot alongside each other)
+  ohapGrid <- ggarrange(plotlist=ohapPlots, ncol=length(ohapPlots), nrow=1)
+  # save as pdf
+  ggsave(filename=here('needleSam', 'OPRHaploPlots', paste0('OPRHap_', pd, '.pdf')), plot=ohapGrid, width=800, height=200, units='mm')
+  
+}
+
+# these plots are not really good, but good enough for what I want to check now
+  # I will improve these plots if we include them
+
+# simplify plots above (make it a single plot) by
+  # 1- plot only samples with an OPR & which have haplotags
+  # 2- plot only that OPR + reference (or OPRD1 / OPRI5 for the compound homozygous sample)
+# there are 14 samples with an OPR genotype; each 2 alleles, so 28 columns (42 if plot unassigned)
+pdgoptag <- as.character(unlist(subset(meta, !is.na(SV) & ! sample %in% pdgs_notag, sample))) # these are the samples we want to plot
+
+# ! do not pick PDGs we do not have here
+  # (only applies if running the script on subset of the samples)
+pdgoptag <- intersect(pdgoptag, pdgs)
+
+# tally grossly the samples we need to plot, then will delete what we do not want to include in the plot
+oprcount <- ied %>%
+  subset(sample %in% pdgoptag) %>%
+  group_by(sample, opri, oprd, othercategory, haplotype) %>%
+  tally(name='nreads')
+
+# for each sample, delete what does not correspond to its genotype
+for (s in 1:length(pdgoptag)) {
+  
+  # get the sample PDG
+  pd <- pdgoptag[s]
+  
+  # ! when reaching PDG 46345, consider the other OPR genotype and delete all the reference counts
+  if (pd=='pdg46345') {
+    
+    ogen <- as.character(subset(meta, sample==pd, c(SV, SV2))) # take the 2 SVs
+    
+    # delete from oprcount everything that is not these 2 genotypes
+    # or in other words, keep everything that matches
+    oprcount <- oprcount[- which(oprcount$sample==pd &
+                                   (!oprcount$opri %in% ogen) & (!oprcount$oprd %in% ogen)) , ]
+    
+    # there is sometimes a read or two that are eg. OPRI4 + OPRD1, and as OPRD1 is correct it passes filter above
+      # remove also any read that has both an OPRI and an OPRD (if any)
+    inds2del <- which(oprcount$sample==pd & !is.na(oprcount$opri) & !is.na(oprcount$oprd))
+    if (length(inds2del)!=0) {
+      oprcount <- oprcount[ - inds2del , ]
+    }
+    
+  } else { # for the other samples
+    # get its OPR genotype
+    ogen <- as.character(subset(meta, sample==pd, SV))
+    
+    # delete from oprcount everything that is not that genotype or reference
+    oprcount <- oprcount[- which(oprcount$sample==pd
+                   & (oprcount$opri!=ogen | oprcount$oprd!=ogen | oprcount$othercategory!='reference')) , ]
+    
+    # same as above, remove any read that has both OPRD and OPRI (if any)
+    inds2del <- which(oprcount$sample==pd & !is.na(oprcount$opri) & !is.na(oprcount$oprd))
+    if (length(inds2del)!=0) {
+      oprcount <- oprcount[ - inds2del , ]
+    }
+  }
+  
+}
+
+# add proportion of each haplotype within genotype categories
+oprcount <- oprcount %>%
+  group_by(sample, opri, oprd, othercategory, .drop=FALSE) %>%
+  mutate(totalreads_samplegenotype=sum(nreads)) %>%
+  mutate(freq=nreads/totalreads_samplegenotype) %>%
+  mutate(sam_ogen=paste(sample, opri, oprd, othercategory, sep='_')) # add an ID column
+
+# plot
+# will do in order of OPR, which should be in order of meta (minus the ones not present)
+# same order as meta
+oprcount <- left_join(data.frame(sample=meta$sample), oprcount, by='sample')
+oprcount <- oprcount[seq(dim(oprcount)[1],1),] # reverse the rows
+oprcount$sam_ogen <- factor(oprcount$sam_ogen, levels=unique(oprcount$sam_ogen))
+# delete all the NA rows (from left_join above)
+oprcount <- oprcount[-which(is.na(oprcount$sam_ogen)),]
+
+oprHaplot <- ggplot(oprcount, aes(x=sam_ogen, y=freq, fill=haplotype)) +
+  geom_col() +
+  coord_flip() +
+  scale_fill_manual(values=hapcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    axis.title.x=element_text(size=9, margin = margin(t=2, r=0, b=0, l=0)),
+    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.text.y=element_blank(),
+    axis.title.y=element_blank()) +
+  ylab('proportion of reads')
+oprHaplot
+
+ggsave(filename=here('needleSam', 'oprHapPlot.pdf'), plot=oprHaplot, width=54, height=90, units='mm')
+
+
+# haplotags look ok but not perfect
+# can be up to ~ 10% wrong haplotype, eg. OPRI8: 90% from haplotype1 / 10% from haplotype2
+# could also be that error is from detecting OPRI8, but cannot believe that a ~ 192 bp insertion can be there 'by accident'
+# additionally, errors are clearly symmetrical; in example above: reference for that sample will be 90% haplotype2 / 10% haplotype1
+# so I think error is = wrong haplotype assignment, i.e. from whatshap haplotag
+
+# there are 2 questions
+  # 1- why the UNassigned reads (reads which get assigned haplotype 0)
+  # 2- why the MISassigned reads (reads which get assigned what I think is the wrong haplotype from looking at the SV)
+
+# Example
+# PDG 47875 = 8 OPRI
+# good example because
+  # 14 SNVs available, so we do not expect whatshap to do many errors
+  # from looking at phased VCF: all SNVs are from one haplotype
+  # by looking at phased reads in IGV: OPRI8 haplotype is the one carrying the 14 SNVs
+
+# I cannot see anything wrong with these 'haplotype-swapped' reads; they look real to me
+# I see 3 options
+  # 1- artefact on flowcell -- I do not think so, otherwise we would find tons of eg. OPRI6 on flowcell3 (as there are many such reads around)
+  # 2- PCR artefact -- precisely template switching, i.e. the polymerase would be reading a hap2 molecule,
+    # then switch just before the OPR to a hap1 molecule
+  # 3- inter-chromosomal rearrangement
+# I do not see a dry-lab way of differentiating the last two
+
+# Looking deeper into swapped haplotypes
+
+# proportion of swapped haplotypes per sample
+  # for PDG 46345, we are interested in hap1 that are OPRD1 / hap2 that are OPRI5
+swa <- subset(oprcount,
+       (sample!='pdg46345' & haplotype==1 & othercategory=='reference') |
+         (sample!='pdg46345' & haplotype==2 & !is.na(opri)) |
+         (sample!='pdg46345' & haplotype==2 & !is.na(oprd)) |
+         
+         (sample=='pdg46345' & haplotype==1 & !is.na(oprd)) |
+         (sample=='pdg46345' & haplotype==2 & !is.na(opri)) )
+
+# is it true that it looks symmetrical?
+swappedHap <- ggplot(swa, aes(x=sam_ogen, y=freq, fill=haplotype)) +
+  geom_col() +
+  coord_flip() +
+  scale_fill_manual(values=hapcols) +
+  theme_minimal() +
+  theme(
+    axis.title.y=element_blank()
+  )
+swappedHap
+ggsave(filename=here('haplotypephasing', 'swappedHap.pdf'), plot=swappedHap, width=300, height=200, units='mm')
+# yes, clear
+
+# does it correlate with anything?
+
+  # add every kind of sample info
+swa <- left_join(swa, meta, by='sample')
+swa <- left_join(swa, hpmeta, by='sample')
+
+# add proportion left not haplotagged
+proze <- select(hcoun, sample, haplotype_pro) # proportion haplotype0
+colnames(proze) <- c('sample', 'prohap0')
+swa <- left_join(swa, proze)
+
+# just as sanity check
+swappedHapVsnReads <- ggplot(swa, aes(x=nreads, y=freq, colour=haplotype)) +
+  geom_point() +
+  theme_minimal() +
+  scale_colour_manual(values=hapcols)
+swappedHapVsnReads
+# nreads is how frequency is calculated so obviously makes sense
+
+# vs Coverage
+swappedHapVsCov <- ggplot(swa, aes(x=totalreads_samplegenotype, y=freq, colour=haplotype)) +
+  geom_point() +
+  theme_minimal() +
+  scale_colour_manual(values=hapcols)
+swappedHapVsCov
+ggsave(filename=here('haplotypephasing', 'swappedHapVsCov.pdf'), plot=swappedHapVsCov, width=300, height=200, units='mm')
+
+# vs tissue
+swappedHapVsTissue <- ggplot(swa, aes(x=gDNA_tissue, y=freq, colour=haplotype)) +
+  geom_point() +
+  theme_minimal() +
+  scale_colour_manual(values=hapcols)
+swappedHapVsTissue
+ggsave(filename=here('haplotypephasing', 'swappedHapVsTissue.pdf'), plot=swappedHapVsTissue, width=300, height=200, units='mm')
+
+# vs Cohort
+  # cannot do as only inherited samples
+
+# vs Flowcell
+swappedHapVsFlow <- ggplot(swa, aes(x=flowcell, y=freq, colour=haplotype)) +
+  geom_point() +
+  theme_minimal() +
+  scale_colour_manual(values=hapcols) +
+  scale_x_continuous(breaks=c(1, 3))
+swappedHapVsFlow
+ggsave(filename=here('haplotypephasing', 'swappedHapVsFlow.pdf'), plot=swappedHapVsFlow, width=300, height=200, units='mm')
+
+# vs OPR coverage
+# genebody_coverage in in sequencing_summary sheet of AdditionalFile1.xlsx
+
+swa <- seqmeta %>%
+  select('sample', 'genebody_coverage') %>%
+  left_join(swa, ., by='sample')
+
+swappedHapVsOPRcov <- ggplot(swa, aes(x=genebody_coverage, y=freq, colour=haplotype)) +
+  geom_point() +
+  theme_minimal() +
+  scale_colour_manual(values=hapcols)
+swappedHapVsOPRcov
+ggsave(filename=here('haplotypephasing', 'swappedHapVsOPRcov.pdf'), plot=swappedHapVsOPRcov, width=300, height=200, units='mm')
+
+# vs MRC score
+swappedHapVsMRC <- ggplot(swa, aes(x=MRC_slope, y=freq, colour=haplotype)) +
+  geom_point() +
+  theme_minimal() +
+  scale_colour_manual(values=hapcols)
+swappedHapVsMRC
+ggsave(filename=here('haplotypephasing', 'swappedHapVsMRC.pdf'), plot=swappedHapVsMRC, width=300, height=200, units='mm')
+
+# vs age
+swappedHapVsAge <- ggplot(swa, aes(x=age_atsample, y=freq, colour=haplotype)) +
+  geom_point() +
+  theme_minimal() +
+  scale_colour_manual(values=hapcols)
+swappedHapVsAge
+ggsave(filename=here('haplotypephasing', 'swappedHapVsAge.pdf'), plot=swappedHapVsAge, width=300, height=200, units='mm')
+
+# vs gender
+swappedHapVsGender <- ggplot(swa, aes(x=gender, y=freq, colour=haplotype)) +
+  geom_point() +
+  theme_minimal() +
+  scale_colour_manual(values=hapcols)
+swappedHapVsGender
+ggsave(filename=here('haplotypephasing', 'swappedHapVsGender.pdf'), plot=swappedHapVsGender, width=300, height=200, units='mm')
+
+# vs number of SNVs available for haplotagging
+swappedHapVsHaplotagSNVs <- ggplot(swa, aes(x=genebody_heterozygousSNVs, y=freq, colour=haplotype)) +
+  geom_point() +
+  theme_minimal() +
+  scale_colour_manual(values=hapcols)
+swappedHapVsHaplotagSNVs
+ggsave(filename=here('haplotypephasing', 'swappedHapVsHaplotagSNVs.pdf'), plot=swappedHapVsHaplotagSNVs, width=300, height=200, units='mm')
+
+# vs proportion not haplotagged
+swappedHapVsProHap0 <- ggplot(swa, aes(x=prohap0, y=freq, colour=haplotype)) +
+  geom_point() +
+  theme_minimal() +
+  scale_colour_manual(values=hapcols)
+swappedHapVsProHap0
+ggsave(filename=here('haplotypephasing', 'swappedHapVsProHap0.pdf'), plot=swappedHapVsProHap0, width=300, height=200, units='mm')
+
+# vs OPR
+swa$SV <- factor(swa$SV, levels=c('OPRD2', 'OPRD1', 'OPRI1', 'OPRI2', 'OPRI4', 'OPRI5', 'OPRI6', 'OPRI8'))
+swappedHapVsOPR <- ggplot(swa, aes(x=SV, y=freq, colour=haplotype)) +
+  geom_point() +
+  theme_minimal() +
+  scale_colour_manual(values=hapcols)
+swappedHapVsOPR
+ggsave(filename=here('haplotypephasing', 'swappedHapVsOPR.pdf'), plot=swappedHapVsOPR, width=300, height=200, units='mm')
+
+# overall nothing striking
+
+
+# do swapped reads have lower haplotag quality?
+
+# first need to tag swapped reads
+
+swais <- c() # here, will store indices of swapped reads (swapped indices)
+
+for (p in 1:length(pdgoptag)) { # for each sample that was haplotagged + has an SV
+  # if no haplotag or no SV: cannot tell if it is a swapped read
+  
+  # get the pdg we are working with
+  pdg <- pdgoptag[p]
+  
+  # get its SV
+  sv <- as.character(subset(meta, sample==pdg, SV))
+  if (is.na(sv)) stop('\t \t \t \t Error: SV is not expected to be NA \n')
+  
+  # ! bit more complicated for PDG 46345 (compound homozygous)
+  if (pdg == 'pdg46345') {
+    
+    # indices to change
+      # any haplotype1 that says OPRD1
+    inds <- which(ied$sample==pdg &
+                    ied$haplotype==1 &
+                    ied$oprd=='OPRD1')
+    swais <- c(swais, inds)
+    
+      # any haplotype2 that says OPRI5
+    inds <- which(ied$sample==pdg &
+                    ied$haplotype==2 &
+                    ied$oprd=='OPRI5')
+    swais <- c(swais, inds)
+    
+  }
+  
+  else { # in the other cases
+    
+    # if SV is an OPRI
+    if (substr(sv, 1, 4)=='OPRI') {
+      
+        # any haplotype1 that says reference
+      inds <- which(ied$sample==pdg &
+              ied$haplotype==1 &
+              ied$othercategory=='reference')
+      swais <- c(swais, inds)
+      
+        # any haplotype2 that says OPRI
+      inds <- which(ied$sample==pdg &
+              ied$haplotype==2 &
+              ied$opri==sv)
+      swais <- c(swais, inds)
+      
+    }
+    
+    # if SV is an OPRD
+    if (substr(sv, 1, 4)=='OPRD') {
+      
+      # any haplotype1 that says reference
+      inds <- which(ied$sample==pdg &
+                      ied$haplotype==1 &
+                      ied$othercategory=='reference')
+      swais <- c(swais, inds)
+      
+      # any haplotype2 that says OPRD
+      inds <- which(ied$sample==pdg &
+                      ied$haplotype==2 &
+                      ied$oprd==sv)
+      swais <- c(swais, inds)
+      
+    }
+  }
+}
+
+# few checks
+  # how many swapped reads?
+length(swais)
+  # out of
+length(pdgoptag) # samples
+  # so around
+length(swais) / length(pdgoptag) # per sample
+  # sounds about right
+
+  # there should not be duplicates
+sum(duplicated(swais))
+
+# add column swappedOPR
+  # preallocate as all FALSE and just change the indices above to TRUE
+swacol <- rep(FALSE, nrow(ied))
+swacol[swais] <- TRUE
+length(swacol) == nrow(ied)
+
+ied$swappedOPR <- swacol
+
+
+# now need to make haplotype quality more comparable between samples
+  # currently: highly dependent on number of SNVs, usually number of SNVs correct * 30
+# so if only 1 SNV: always 30 or 0
+  # which cannot be compared to samples with many SNVs
+  # eg. read with 14 SNVs, but only 2 match; I want a low haplotag score
+  # but currently it would be 60, so higher than a read with 1 SNV that matches
+# a possible solution: % of available SNVs that match (should be ~ % of maximum for that sample)
+  # in examples above
+    # 1 SNV: match = 1.0 / no-match = 0.0
+    # 14 SNV: only 2 match = 0.14
+besthq <- ied %>%
+  group_by(sample) %>%
+  summarise(besthapqual=max(hapqual, na.rm=TRUE))
+# generates -Inf for the sample which do not have haplotags, turn into NA
+besthq[which(besthq$besthapqual==-Inf), 'besthapqual'] <- NA
+
+# add to ied
+  # keep all rows of ied, so left_join
+ied <- left_join(x=ied, y=besthq, by='sample')
+
+# divide each hapqual by the best of its sample
+ied$hapscore <- ied$hapqual/ied$besthapqual
+
+# back to question: is haplotype quality lower for swapped reads?
+
+# distribution of all scores
+hapScoreDist <- ggplot(ied, aes(hapscore)) +
+  geom_histogram(binwidth=0.1) +
+  theme_minimal()
+hapScoreDist
+ggsave(filename=here('haplotypephasing', 'hapScoreDist.pdf'), plot=hapScoreDist, width=300, height=200, units='mm')
+
+# same, but differentiated by swapped or not
+
+hsbsw <- freqInBins(df=ied, datcol='hapscore', grpcol='swappedOPR', bins=seq(0, 1, by=0.1)) # haplotype scores by swapped or not
+
+hapScoreSwapped <- ggplot(hsbsw, aes(x=bin, y=freq, fill=swappedOPR)) +
+  geom_col(position='identity', alpha=0.5) +
+  theme_minimal()
+hapScoreSwapped
+ggsave(filename=here('haplotypephasing', 'hapScoreSwapped.pdf'), plot=hapScoreSwapped, width=300, height=200, units='mm')
+
+# yes swapped reads may have worse haplotype quality
+t.test(ied$hapscore ~ ied$swappedOPR)
+
+
+
+
+# Check strand directions -------------------------------------------------
+
+# check nothing else than 0 (= forward) or 16 (= reverse complement) in samflag column
+which(ied$samflag!=0 & ied$samflag!=16) # ok
+
+# if yes -- replace all 0 to +, all 16 to -
+ied[which(ied$samflag==0), 'samflag'] <- '+'
+ied[which(ied$samflag==16), 'samflag'] <- '-'
+
+# check all +/- now
+which(ied$samflag!='+' & ied$samflag!='-')
+# ok
+
+# change column name to strand
+colnames(ied)[which(colnames(ied)=='samflag')] <- 'strand'
+
+# quick sanity checks with strand direction
+strandcount <- ied %>%
+  group_by(strand) %>%
+  tally(name='count') %>%
+  mutate(prop=count/nrow(ied)) 
+
+strandPro <- ggplot(strandcount, aes(x=strand, y=prop)) +
+  geom_col() +
+  theme_minimal() +
+  xlab('proportion of reads') + ylab('strand')
+strandPro
+ggsave(here('haplotypephasing', 'strand.pdf'), strandPro, width=100, height=100, units='mm')
+
+# add a dummy column
+strandcount$dummy <- factor(1)
+
+# same but stack
+strandProStack <- ggplot(strandcount, aes(x=dummy, y=prop, fill=strand)) +
+  geom_col() +
+  theme_minimal() +
+  xlab('proportion of reads')
+strandProStack
+ggsave(here('haplotypephasing', 'strandProStack.pdf'), strandProStack, width=100, height=100, units='mm')
+
+
+# vs haplotype
+strandhap <- ied %>%
+  group_by(strand, haplotype) %>%
+  tally(name='count')
+
+# counts
+strandvsHapCount <- ggplot(strandhap, aes(x=haplotype, y=count, fill=strand)) +
+  geom_col() +
+  theme_minimal()
+strandvsHapCount
+ggsave(here('haplotypephasing', 'strandvsHapCount.pdf'), strandvsHapCount, width=100, height=100, units='mm')
+
+# proportion
+hapcount <- ied %>%
+  group_by(haplotype) %>%
+  tally(name='countHap')
+strandhap <- left_join(x=strandhap, y=hapcount)
+strandhap <- strandhap %>%
+  mutate(propHap=count/countHap)
+
+strandvsHapPro <- ggplot(strandhap, aes(x=haplotype, y=propHap, fill=strand)) +
+  geom_col() +
+  theme_minimal() +
+  geom_hline(yintercept=0.5, linetype=2)
+strandvsHapPro
+ggsave(here('haplotypephasing', 'strandvsHapPro.pdf'), strandvsHapPro, width=100, height=100, units='mm')
+
+# other way to look at it: assignment to haplotype for each strand direction
+# add strand count
+strandcount <- ied %>%
+  group_by(strand) %>%
+  tally(name='countStrand')
+strandhap <- left_join(x=strandhap, y=strandcount)
+strandhap <- strandhap %>%
+  mutate(propStrand=count/countStrand)
+
+hapvsStrandPro <- ggplot(strandhap, aes(x=strand, y=propStrand, fill=haplotype)) +
+  geom_col() +
+  theme_minimal() +
+  scale_fill_manual(values=hapcols)
+hapvsStrandPro
+ggsave(here('haplotypephasing', 'hapvsStrandPro.pdf'), hapvsStrandPro, width=100, height=100, units='mm')
+
+
+# vs flowcell
+  # temporary copy of ied and add flowcell info to it
+tmp <- ied
+tmp <- left_join(x=tmp, y=meta)
+
+flowstrand <- tmp %>%
+  group_by(strand, flowcell) %>%
+  tally(name='count')
+  # add total number of reads on each flowcell
+flowcount <- tmp %>%
+  group_by(flowcell) %>%
+  tally(name='countFlow')
+flowstrand <- left_join(x=flowstrand, y=flowcount)
+flowstrand <- flowstrand %>%
+  mutate(propFlow=count/countFlow)
+flowstrand$flowcell <- factor(flowstrand$flowcell, levels=c(1, 2, 3))
+
+strandvsFlowPro <- ggplot(flowstrand, aes(x=flowcell, y=propFlow, fill=strand)) +
+  geom_col() +
+  theme_minimal() +
+  geom_hline(yintercept=0.5, linetype=2) +
+  scale_fill_manual(values=hapcols)
+strandvsFlowPro
+ggsave(here('haplotypephasing', 'strandvsFlowPro.pdf'), strandvsFlowPro, width=100, height=100, units='mm')
+
+
+# vs cohort
+  # can use same tmp copy
+cohstrand <- tmp %>%
+  group_by(strand, prion_disease) %>%
+  tally(name='count')
+  # add total number of reads in each cohort
+cohcount <- tmp %>%
+  group_by(prion_disease) %>%
+  tally(name='countCoh')
+cohstrand <- left_join(x=cohstrand, y=cohcount)
+cohstrand <- cohstrand %>%
+  mutate(propCoh=count/countCoh)
+
+strandvsCohort <- ggplot(cohstrand, aes(x=prion_disease, y=propCoh, fill=strand)) +
+  geom_col() +
+  theme_minimal() +
+  geom_hline(yintercept=0.5, linetype=2)
+strandvsCohort
+ggsave(here('haplotypephasing', 'strandvsCohort.pdf'), strandvsCohort, width=100, height=100, units='mm')
+
+# vs sample
+  # can use same tmp copy
+samstrand <- tmp %>%
+  group_by(strand, sample) %>%
+  tally(name='count')
+  # add total number of reads in each cohort
+samcount <- tmp %>%
+  group_by(sample) %>%
+  tally(name='countSam')
+samstrand <- left_join(x=samstrand, y=samcount)
+samstrand <- samstrand %>%
+  mutate(propSam=count/countSam)
+
+strandvsSample <- ggplot(samstrand, aes(x=sample, y=propSam, fill=strand)) +
+  geom_col() +
+  geom_hline(yintercept=0.5, linetype=2) +
+  theme_minimal() +
+  theme(
+    axis.text.x=element_text(angle=90)
+  )
+strandvsSample
+ggsave(here('haplotypephasing', 'strandvsSample.pdf'), strandvsSample, width=250, height=150, units='mm')
+
+# related to above: would strand bias correlate with haplotype-swapped reads?
+  # just keep positive strand proportion (negative is just 1 - that)
+samplus <- samstrand %>%
+  subset(strand=='+') %>%
+  ungroup() %>%
+  select(sample, propSam)
+colnames(samplus) <- c('sample', 'propPlus')
+  
+swa <- left_join(x=swa, y=samplus)
+
+strandvSwapped <- ggplot(swa, aes(x=propPlus, y=prohap0)) +
+  geom_point() +
+  theme_minimal()
+strandvSwapped
+ggsave(here('haplotypephasing', 'strandvSwapped.pdf'), strandvSwapped, width=100, height=100, units='mm')
+
+
+
+
+# 8- Alignment to consensus OPR sequences ---------------------------------
+
+# for example, say a read has an insertion of 92 bp >> it could be a OPRI4
+# how can we measure how convincing it is?
+# we will align to a consensus sequence of what an OPRI4 looks like
+# this makes use of the consensus R sequence written by Simon, which uses flexible nucleotides (eg. Y = C or T)
+# >> see OPRconsensus.xlsx / generateOPRCatalog.R / OPRConsensusCatalog.xlsx for more information
+
+# previously: was trying alignment to published OPR sequences
+  # but there is some logic to which nucleotides can change, so better to define a consensus sequence
+  # advantage: it includes more possible OPRs (OPR sequences that are probably out there but were never detected or published)
+
+
+# function msaConsensusFlexible takes an alignment and outputs a corrected consensus sequence
+  # ! msa support alignment with flexible nucleotides, see https://github.com/UBod/msa/issues/12
+  # but eg. G vs A returns R; which is correct but not what I want here
+  # I want to leave flexibility only if the reference leaves that option open
+  # where eg. Y vs T returns Y
+# build look-up table for flexible nucleotides
+fnu <- c('R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V', 'N') # flexible nucleotides
+lut <- as.data.frame(matrix(nrow=4, ncol=length(fnu)))
+colnames(lut) <- fnu
+lut$R[1:2] <- c('A', 'G')
+lut$Y[1:2] <- c('C', 'T')
+lut$S[1:2] <- c('G', 'C')
+lut$W[1:2] <- c('A', 'T')
+lut$K[1:2] <- c('G', 'T')
+lut$M[1:2] <- c('A', 'C')
+lut$B[1:3] <- c('C', 'G', 'T')
+lut$D[1:3] <- c('A', 'G', 'T')
+lut$H[1:3] <- c('A', 'C', 'T')
+lut$V[1:3] <- c('A', 'C', 'G')
+lut$N[1:4] <- c('A', 'C', 'G', 'T')
+
+# msaConsensusFlexible
+  # use instead of msaConsensusSequence, will return the consensus sequence of an alignment, but
+  # will replace ? in the consensus sequence which were thrown because of the presence of a flexible nucleotide, where the alignment was fine (eg. Y vs T)
+  # ! Note; it assumes alignment one-to-one and only one of the sequence (typically the reference) allows flexible nucleotides
+msaConsensusFlexible <- function (alignment){
+  conm <- consensusMatrix(alignment)
+  cons <- msaConsensusSequence(alignment)
+  
+  sapply(1:ncol(conm), function(pos) { # for each position of the alignment
+    
+    al <- conm[,pos] # get the alignment matrix at that position
+    # count number of flexible nucleotides at that position
+    nfnu <- sum(al[which(names(al) %in% fnu)])
+    
+    if (nfnu==0) { # if no flexible nucleotide at that position, do nothing
+      return(NULL)
+    }
+    
+    if (nfnu>0) { # if any flexible nucleotide at that position
+      # get the flexible nucleotide
+      f <- names(which(al[which(names(al) %in% fnu)] > 0))
+      
+      # get the standard nucleotide
+      s <- names(which(al[which(!names(al) %in% fnu)] > 0))
+      
+      # is the alignment valid? Check in the look-up table; eg. T vs Y is ok
+      if (!s %in% lut[,f]) { # if not -- do NOT modify the consensus sequence at that position (should leave ?)
+        return(NULL)
+      }
+      
+      else if (s %in% lut[,f]) { # if yes -- modify the consensus sequence at that position (should put the flexible nucleotide instead of the ?)
+        substr(cons, pos, pos) <<- f
+      }
+    }
+    
+  })
+  
+  return(cons)
+  
+}
+
+
+# function alignCon_NumMis:
+  # takes a query DNA sequence (i.e. possible somatic mutation)
+  # looks at the possible OPRD/OPRI for that read
+  # takes consensus DNA sequence from the catalog of OPRs (by allele)
+  # try aligning to the consensus sequence, gets number of mismatches
+  # reports the number of mismatches vs consensus
+alignCon_NumMis <- function(query, allele) {
+  # eg. alignRef_NumMis('CGAAATGC', 'OPRD1'), i.e. try aligning CGAAATGC to the consensus OPRD1 sequence and tell me number of mismatches
+  # can also be alignRef_NumMis('CGAAATGC', 'reference')
+  
+  if( is.na(allele) | allele=='unassigned') { # if allele is NA (eg. a 8OPRI read will usually have NA in oprd column; all unassigned reads are opri/oprd NA; etc.)
+    return(NA)
+  }
+  
+  if(! allele %in% cato$genotype) { # check the allele is in catalog (I think should always be ok as catalog covers all possible cases)
+    stop('\t \t \t \t >>> Allele', allele, ' was not found in catalog \n')
+  }
+  
+  
+  # 1- take the consensus sequence (used as reference)
+  ref <- as.character(subset(cato, genotype==allele, consensusseq))
+  
+  # 2- align query to that reference
+  ali <- msa(c(ref, query), type='dna', method='ClustalW')
+  
+  # 3- correct the consensus sequence to allow flexible nucleotides
+    # using msaConsensusFlexible()
+  con <- msaConsensusFlexible(ali)
+  
+  # 4- count number of mismatches (?)
+  nmis <- str_count(con, '\\?')
+  
+  # return that
+  return(nmis)
+  
+}
+
+# sanity checks
+  # align reference OPR to the consensus sequence
+alignCon_NumMis('CCTCAGGGCGGTGGTGGCTGGGGGCAGCCTCATGGTGGTGGCTGGGGGCAGCCTCATGGTGGTGGCTGGGGGCAGCCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA',
+                'reference')
+# gives 0 mismatch
+
+  # check eg. G vs A should NOT return R; only edit consensus if reference leaves the option open
+alitest <- msa(c('CYGTK', 'CTAAT'), type='dna', method='ClustalW') # should return C Y ? ? K
+msaConsensusFlexible(alitest) # ok
+
+
+tic('\t \t \t \t Aligning reads took... \n')
+
+# align all possible OPRI reads to their consensus sequence
+opri_mis <- sapply(1:nrow(ied), function(ri){ # ri = read index; opri_mats = OPRI mismatches
+  # Note; will be one element per row of ied, can add to ied as it
+  cat('\t \t \t \t >>> Read', ri, 'out of', nrow(ied), '\n')
+  
+  readseq <- as.character(ied[ri, 'seq']) # read sequence
+  oprgeno <- as.character(ied[ri, 'opri']) # possible OPR allele
+  
+  alignCon_NumMis(readseq, oprgeno)
+})
+
+# put the results in ied
+ied$opri_nummis <- opri_mis
+
+
+
+# align all possible OPRD reads to their consensus sequence
+oprd_mis <- sapply(1:nrow(ied), function(ri){ # ri = read index; oprd_mis = OPRD mismatches
+  cat('\t \t \t \t >>> Read', ri, 'out of', nrow(ied), '\n')
+  
+  readseq <- as.character(ied[ri, 'seq']) # read sequence
+  oprgeno <- as.character(ied[ri, 'oprd']) # possible OPR genotype
+  
+  alignCon_NumMis(readseq, oprgeno)
+})
+
+# put the results in ied
+ied$oprd_nummis <- oprd_mis
+
+
+
+# align reference reads to the consensus sequence
+ref_mis <- sapply(1:nrow(ied), function(ri){ # ri = read index; ref_mis = reference mismatches
+  cat('\t \t \t \t >>> Read', ri, 'out of', nrow(ied), '\n')
+  
+  readseq <- as.character(ied[ri, 'seq']) # read sequence
+  oprgeno <- as.character(ied[ri, 'othercategory']) # take what is othercategory column as allele
+  # Note alignCon_NumMis takes care of 'unassigned' already, i.e. returns NA (as there is nothing to align to)
+  
+  alignCon_NumMis(readseq, oprgeno)
+})
+
+# put the results in ied
+ied$reference_nummis <- ref_mis
+
+
+toc()
+
+
+
+
+# 9- Threshold for maximum number of mismatches? --------------------------
+# what is a good threshold for maximum number of mismatches?
+# eg. if 25% of the nucleotides in one read do not match the consensus OPR sequence, might not be so convincing
+# >> look at the 'positive control' distribution of number of mismatches
+  # eg. take sample genotyped as OPRD2 by Sanger
+  # take all the reads that are consistent with an OPRD2
+  # and look at their number of mismatches with OPRD2 consensus sequence
+
+# add samples metadata
+ied <- left_join(x=ied, y=meta, by='sample')
+  # we do not need the colour column
+ied$colour <- NULL
+
+# plot mismatches plot
+# list to store mismatches plots
+# will basically do one list for each copy of the genome, i.e.
+  # first list = all the SVs or reference for the samples w/o SV
+  # second list = reference for the samples that have a SV / OPRD1 for the compound homozygous sample
+
+# List 1
+misPlots1 <- vector(mode='list', length=length(pdgs))
+names(misPlots1) <- pdgs
+
+for (s in 1:length(pdgs)) {
+  
+  # get the PDG we are plotting
+  pdg <- pdgs[[s]]
+  
+  # get its Sanger genotype
+  sang <- as.character(unique(subset(ied, sample==pdg, SV)))
+  
+  
+  # if Sanger genotype is NA (example sCJD case)
+  if (is.na(sang)) { # then plot the mismatches vs reference
+    
+    # then it means its genotype is reference
+    sang <- 'reference'
+
+    tmp <- subset(ied, sample==pdg & othercategory==sang) # subset of ied for plot
+    misplot <- ggplot(tmp, aes(reference_nummis, after_stat(density))) + # mismatches plot
+      geom_histogram(binwidth=1) +
+      theme_minimal() +
+      coord_cartesian(xlim=c(0,100), ylim=c(0, 0.3)) +
+      labs(title=pdg,
+           subtitle=sang)
+    misPlots1[[s]] <- misplot # add it to the list
+  
+    
+  # if Sanger genotype is an insertion...
+  } else if (substr(sang, 1, 4) == 'OPRI') {
+      
+    tmp <- subset(ied, sample==pdg & opri==sang) # subset of ied for plot
+    misplot <- ggplot(tmp, aes(opri_nummis, after_stat(density))) + # mismatches plot
+      geom_histogram(binwidth=1) +
+      theme_minimal() +
+      coord_cartesian(xlim=c(0,100), ylim=c(0, 0.3)) +
+      labs(title=pdg,
+           subtitle=sang)
+    misPlots1[[s]] <- misplot # add it to the list
+    
+    # if Sanger genotype is a deletion...
+  } else if (substr(sang, 1, 4) == 'OPRD') {
+    
+    tmp <- subset(ied, sample==pdg & oprd==sang) # subset of ied for plot
+    misplot <- ggplot(tmp, aes(oprd_nummis, after_stat(density))) + # mismatches plot
+      geom_histogram(binwidth=1) +
+      theme_minimal() +
+      coord_cartesian(xlim=c(0,100), ylim=c(0, 0.3)) +
+      labs(title=pdg,
+           subtitle=sang)
+    misPlots1[[s]] <- misplot # add it to the list
+  }
+}
+
+# List 2
+misPlots2 <- vector(mode='list', length=length(pdgs))
+names(misPlots2) <- pdgs
+
+for (s in 1:length(pdgs)) {
+  
+  # get the PDG we are plotting
+  pdg <- pdgs[[s]]
+  
+  # get its first Sanger genotype
+  sang <- as.character(unique(subset(ied, sample==pdg, SV)))
+  
+  if (is.na(sang)) { # if its first Sanger genotype is NA, we already plotted the reference reads above
+    # so do not repeat the plot but instead plot an empty plot to fill the list
+    misplot <- ggplot() + 
+      theme_void() +
+      labs(title=pdg)
+    misPlots2[[s]] <- misplot # add it to the list
+    
+  # if first Sanger genotype is not NA, then we should plot the reference reads now
+    # (or the 1 OPRD reads for the compound homozygous; will be in SV2 column)
+  } else if(!is.na(sang)) {
+    
+    sang2 <- as.character(unique(subset(ied, sample==pdg, SV2))) # look at SV2 column, the only one not NA is the compound homozygous
+    
+    if(is.na(sang2)) {
+      # then it means we should plot the reference reads
+      sang2 <- 'reference'
+      
+      tmp <- subset(ied, sample==pdg & othercategory==sang2) # subset of ied for plot
+      misplot <- ggplot(tmp, aes(reference_nummis, after_stat(density))) + # mismatches plot
+        geom_histogram(binwidth=1) +
+        theme_minimal() +
+        coord_cartesian(xlim=c(0,100), ylim=c(0, 0.3)) +
+        labs(title=pdg,
+             subtitle=sang2)
+      misPlots2[[s]] <- misplot # add it to the list
+      
+      # if not NA, i.e. we are dealing with the compound homozygous; so it will be 1 OPRD
+    } else if (!is.na(sang2)) {
+      if(sang2!='OPRD1') stop('\t \t \t \t >>> Something unexpected with the compound homozygous sample \n') # check that is correct
+      
+      tmp <- subset(ied, sample==pdg & oprd==sang2) # subset of ied for plot
+      misplot <- ggplot(tmp, aes(oprd_nummis, after_stat(density))) + # mismatches plot
+        geom_histogram(binwidth=1) +
+        theme_minimal() +
+        coord_cartesian(xlim=c(0,100), ylim=c(0, 0.3)) +
+        labs(title=pdg,
+             subtitle=sang2)
+      misPlots2[[s]] <- misplot # add it to the list
+    }
+    
+  }
+}
+
+# interleave the two lists
+orderindex <- order(c(seq_along(misPlots1), seq_along(misPlots2))) # gives 1, 26, 2, 27, etc; so can concatenate both lists then order them
+misPlots <- (c(misPlots1,misPlots2))[orderindex]
+
+# arrange the interleaved list in a grid
+misGrid <- ggarrange(plotlist=misPlots, nrow=length(pdgs), ncol=2) # should produce each row = one sample
+ggsave(filename=here('needleSam', 'mismatches_vsconsensus.pdf'), misGrid, height=2000, width=100, units='mm', limitsize=FALSE)
+
+# there is some effect of length on number of mismatches (expectedly); i.e. longer reads (eg. OPRI8) = more mismatches
+# I think need to have a flexible threshold based on % error by nucleotide
+
+# below: look at effect of length on number of mismatches
+
+# for all 'expected genotype' reads, plot number of mismatches vs read length
+  # add a read length column
+ied$read_length <- nchar(ied$seq)
+
+# subset we are interested in = all 'true' reads
+  # = all reference reads (except for compound homozygous)
+  # + all reads that match Sanger genotype (! 2 Sanger genotypes for compound homozygous)
+
+# will add a 'true read' column and then subset based on that
+ied$trueread <- NA
+
+  # all reference reads, except for PDG 46345
+ied[which(ied$sample != 'pdg46345' & ied$othercategory == 'reference'), 'trueread'] <- TRUE
+
+  # all reads which have an OPRI that match their SV
+ied[which(ied$opri == ied$SV), 'trueread'] <- TRUE
+
+  # all reads which have an OPRI that match their SV2 (will only apply to PDG 46345)
+ied[which(ied$opri == ied$SV2), 'trueread'] <- TRUE
+
+  # all reads which have an OPRD that match their SV
+ied[which(ied$oprd == ied$SV), 'trueread'] <- TRUE
+
+  # all reads which have an OPRD that match their SV2 (will only apply to PDG 46345)
+ied[which(ied$oprd == ied$SV2), 'trueread'] <- TRUE
+
+  # total number of 'true reads'
+sum(ied$trueread, na.rm=TRUE)
+sum(ied$trueread, na.rm=TRUE) / nrow(ied)
+  
+  # will put FALSE for all unassigned reads
+ied[which(ied$othercategory == 'unassigned'), 'trueread'] <- FALSE
+
+  # how many NA left
+sum((is.na(ied$trueread)))
+
+# looks correct, essentially all reads left NA have a potential somatic mutation
+  # mismatch threshold will say if convincing or not, though
+
+  # put FALSE for these reads
+ied[which(is.na(ied$trueread)), 'trueread'] <- FALSE
+
+  # check no more NA in trueread
+sum(is.na(ied$trueread))
+
+
+# subset ied to keep only true reads
+iedTrue <- subset(ied,
+                  trueread)
+
+
+# few checks
+# there should not be any unassigned reads
+subset(iedTrue, othercategory=='unassigned')
+# there should not be any reads from pdg46345 that is reference
+subset(iedTrue, sample=='pdg46345' & othercategory=='reference')
+# there should not be any eg. 3 OPRI (as no sample has that as its Sanger genotype)
+subset(iedTrue, opri=='OPRI3')
+subset(iedTrue, opri=='OPRD3')
+
+# looks good
+  # ! number of mismatches are spread out between 3 columns
+  # need to collapse both columns in one
+  # first; check there is no case where both OPRD and OPRI are not NA
+subset(iedTrue, !is.na(opri_nummis) & !is.na(oprd_nummis)) # check okay to delete... If only a few and one of opri_nummis or oprd_nummis is high
+inds2del <- which(!is.na(iedTrue$opri_nummis) & !is.na(iedTrue$oprd_nummis))
+iedTrue <- iedTrue[ - inds2del , ]
+
+
+# quick sanity checks
+subset(iedTrue, !is.na(opri) & is.na(opri_nummis))
+subset(iedTrue, is.na(opri) & !is.na(opri_nummis))
+
+subset(iedTrue, !is.na(oprd) & is.na(oprd_nummis))
+subset(iedTrue, is.na(oprd) & !is.na(oprd_nummis))
+
+identical(which(is.na(iedTrue$opri)) , which(is.na(iedTrue$opri_nummis)))
+identical(which(is.na(iedTrue$oprd)) , which(is.na(iedTrue$oprd_nummis)))
+
+# also, wherever read is reference, it is NA for both opri/oprd
+subset(iedTrue, !is.na(reference_nummis) & (!is.na(opri) | !is.na(oprd)))
+
+# or in summary, for each row there should only be one of OPRI/OPRD/othercategory that is not NA
+checkNa <- apply(iedTrue, 1, function(re) {
+  totna <- as.numeric(!is.na(re['opri_nummis'])) + as.numeric(!is.na(re['oprd_nummis'])) + as.numeric(!is.na(re['reference_nummis']))
+  return(totna)
+})
+# should always be 1
+which(checkNa != 1)
+# ok so we can collapse into one column
+
+# now to collapse both columns in one:
+iedTrue$true_nummis <- NA # preallocate the column
+
+# fill in the OPRI
+iedTrue$true_nummis [which(!is.na(iedTrue$opri_nummis))] <- iedTrue$opri_nummis[which(!is.na(iedTrue$opri_nummis))]
+iedTrue$true_nummis [which(!is.na(iedTrue$oprd_nummis))] <- iedTrue$oprd_nummis[which(!is.na(iedTrue$oprd_nummis))]
+iedTrue$true_nummis [which(!is.na(iedTrue$reference_nummis))] <- iedTrue$reference_nummis[which(!is.na(iedTrue$reference_nummis))]
+  # left which() gives the indices to fill in / right which() gets all the non-NA OPRI
+
+# there should not be any NA left
+sum(is.na(iedTrue$true_nummis)) # ok
+
+# let's put this column back into ied, will make life easier
+  # can simply left_join by read ID, it is safe as no duplicated read
+  # check to be safe
+sum(duplicated(ied$read))
+tmp <- select(iedTrue, read, true_nummis) # keep only read/true_nummis columns
+ied <- left_join(x=ied, y=tmp, by='read')
+
+# easy check: all trueread TRUE should have a true_nummis
+subset(ied,
+       trueread & is.na(true_nummis))
+  # few reads, which are the one which have both an OPRI and an OPRD (see above), it is fine we can leave them there
+
+# or in other words, all FALSE trueread should be NA true_nummis
+subset(ied,
+       !trueread & !is.na(true_nummis))
+# ok
+
+# plot read length vs number of mismatches
+  # can do from ied now; all not 'true read' will have NA for true_nummis and will not be plotted
+lengthVsMis <- ggplot(ied, aes(x=read_length, y=true_nummis)) +
+  geom_point() +
+  theme_minimal() +
+  coord_cartesian(xlim=c(0, 350))
+lengthVsMis
+ggsave(filename=here('needleSam', 'lengthVsMis.pdf'), lengthVsMis, width=300, height=300, units='mm')
+
+# for any read length, most reads are with low number of mismatches which is good news
+# however, there is a clear effect of read length as expected
+  # it increases the variability of number of mismatches & the number of reads with many mismatches
+
+# perhaps clearer if plotting average mismatch for each read length
+meanMisbyLength <- ied %>%
+  group_by(read_length) %>%
+  summarise(meanMis=mean(true_nummis, na.rm=TRUE))
+
+meanMisbyLengthPlot <- ggplot(meanMisbyLength, aes(x=read_length, y=meanMis)) +
+  geom_point() +
+  theme_minimal()
+meanMisbyLengthPlot
+ggsave(filename=here('needleSam', 'meanMisbyLength.pdf'), meanMisbyLengthPlot, width=300, height=300, units='mm')
+# interestingly it makes like a chainsaw pattern
+# I think it is simpl because reads tend to have lowest number of mismatches when their length match the target
+  # eg. reference = 123 bp, reads that are exactly 123 bp are more likely to have low number of mismatches
+  # a reference read that is eg. 118 bp will necessarily have at least 6 mismatches
+# correlation read_length vs number of mismatches
+cor(ied$read_length, ied$true_nummis, use='complete.obs')
+
+# Normalise number of mismatches by doing % mismatch by nucleotide
+  # compute % mismatches by nucleotide
+    # simply need to divide mismatches by read_length
+ied$true_misbp <- ied$true_nummis / ied$read_length # true read, mismatch by bp
+
+# how does it look
+  # mean / sd will give us an idea of the appropriate threshold
+mismatchByNucleotide <- ggplot(ied, aes(true_misbp, after_stat(density))) +
+  geom_histogram(binwidth=0.01) +
+  geom_vline(xintercept=mean(ied$true_misbp, na.rm=TRUE), linetype=1) +
+  geom_vline(xintercept=mean(ied$true_misbp, na.rm=TRUE) + sd(ied$true_misbp, na.rm=TRUE), linetype=2) +
+  theme_minimal()
+mismatchByNucleotide
+ggsave(filename=here('needleSam', 'mismatchByNucleotide.pdf'), mismatchByNucleotide, width=100, height=100, units='mm')
+
+# >> I am happy with mean + 1*sd as threshold 'mismatch by nucleotide'
+thrMis <- mean(ied$true_misbp, na.rm=TRUE) + sd(ied$true_misbp, na.rm=TRUE)
+cat('\t \t \t \t >>> Maximum mismatches allowed vs consensus sequence =', round(thrMis*100, 2), '% of the read length \n')
+cat('\t \t \t \t \t or in other words,', round(100-thrMis*100, 2), '% of the read has to align correctly to its consensus sequence \n')
+
+# plot how the threshold line for illustration
+thrLine <- as.data.frame(cbind(min(ied$read_length):max(ied$read_length), min(ied$read_length):max(ied$read_length)))
+colnames(thrLine) <- c('read_length', 'misauthor') # number of mismatches authorised
+thrLine$misauthor <- thrLine$misauthor * thrMis
+thrLinePlot <- ggplot(thrLine, aes(x=read_length, y=misauthor)) +
+  geom_point()
+thrLinePlot
+ggsave(filename=here('needleSam', 'thrLinePlot.pdf'), thrLinePlot, width=100, height=100, units='mm')
+
+# is there any correlation with read_length? there should not be any anymore
+mismatchByNucleotide_vsReadLength <- ggplot(ied, aes(x=read_length, y=true_misbp)) +
+  geom_point() +
+  theme_minimal() +
+  coord_cartesian(xlim=c(0, 350))
+mismatchByNucleotide_vsReadLength
+ggsave(filename=here('needleSam', 'mismatchByNucleotide_vsReadLength.pdf'), mismatchByNucleotide_vsReadLength, width=100, height=100, units='mm')
+# I do not see any anymore
+
+# do the same as above, mean and correlation
+meanMisByNucbyLength <- ied %>%
+  group_by(read_length) %>%
+  summarise(mean_true_misbp=mean(true_misbp, na.rm=TRUE))
+
+meanMisByNucbyLengthPlot <- ggplot(meanMisByNucbyLength, aes(x=read_length, y=mean_true_misbp)) +
+  geom_point() +
+  theme_minimal()
+meanMisByNucbyLengthPlot
+ggsave(filename=here('needleSam', 'meanMisByNucbyLength.pdf'), meanMisByNucbyLengthPlot, width=300, height=300, units='mm')
+
+cor(ied$read_length, ied$true_misbp, use='complete.obs')
+
+
+
+
+# 10- Filter by maximum number of mismatches -------------------------------
+
+# thrMis is the maximum number of mismatches by nucleotide
+# compute the maximum number of mismatches authorised for each read
+  # i.e. read_length * thrMis
+ied$authMis <- round(ied$read_length * thrMis) # number of mismatches authorised
+
+# how many cases are there where one read is both OPRI + OPRD?
+nrow(subset(ied, !is.na(opri) & !is.na(oprd)))
+
+# now add a TRUE/FALSE to the reads that are below
+  # because of above, will add 2 columns, opri_pass and oprd_pass
+ied$opri_pass <- ied$opri_nummis < ied$authMis
+ied$oprd_pass <- ied$oprd_nummis < ied$authMis
+
+# ! for PDG 46345 (compound homozygous OPRD1/OPRI5), should consider reference reads as possible somatic mutations
+# will simply add a column ref_pass for all; same as above
+ied$ref_pass <- ied$reference_nummis < ied$authMis
+
+# check it does what I expect to the PDG 46345 reads
+subset(ied, sample=='pdg46345' & othercategory=='reference')
+
+
+# can also compute % mismatch by nucleotide for each read
+  # I am finding it less intuitive, but it has the advantage to be more flexible later as can be used with another threshold later
+  # same as above, we need to do it in 3 columns
+ied$opri_misbp <- ied$opri_nummis / ied$read_length # mismatches by bp for all OPRI reads
+ied$oprd_misbp <- ied$oprd_nummis / ied$read_length # mismatches by bp for all OPRD reads
+ied$ref_misbp <- ied$reference_nummis / ied$read_length # mismatches by bp for all reference reads
+
+
+
+
+# write the database ------------------------------------------------------
+# (as alignments take a while)
+# and we will start exploring it in needleSam_exploration.R
+
+# format a bit better ied
+
+# remove metadata information, only keep read-level info
+cols2remove <- c('individual', 'cohortID', 'cohort', 'gDNA_tissue', 'prion_disease',
+                 'SV', 'SV2', 'codon129', 'otherSNVs', 'flowcell', 
+                 'gender', 'birth_date', 'sample_date', 'death_date', 'death_age',
+                 'onset_age', 'age_atsample', 'MRC_t0', 'MRC_intersect', 'MRC_slope')
+cols2remove <- match(cols2remove, colnames(ied))
+
+ied <- ied[, -cols2remove]
+
+# arrange columns order
+colsorder <- c('sample', 'read', 'seq', 'read_length', 'strand', 'haplotype', 'hapqual', 'besthapqual', 'hapscore',
+               'swappedOPR', 'ins', 'del', 'opri', 'oprd', 'othercategory', 'opri_nummis', 'oprd_nummis',
+               'reference_nummis', 'trueread', 'true_nummis', 'true_misbp', 'authMis',
+               'opri_pass', 'oprd_pass', 'ref_pass', 'opri_misbp', 'oprd_misbp', 'ref_misbp')
+length(colsorder) == ncol(ied)
+colsorder <- match(colsorder, colnames(ied))
+ied <- ied[, colsorder]
+
+# write the database
+write.csv(ied, file=here('needleSam', 'allOPRreads.csv'), row.names=FALSE)
+
+# >>> move to needleSam_exploration.R
\ No newline at end of file
diff --git a/needleSam/needleSam_exploration_v2.R b/needleSam/needleSam_exploration_v2.R
new file mode 100644
index 0000000..e38df17
--- /dev/null
+++ b/needleSam/needleSam_exploration_v2.R
@@ -0,0 +1,2734 @@
+# after needleSam.R
+  # needleSam.R creates the dataset allOPRreads.csv
+  # needleSam_exploration.R: exploring the dataset for possible OPR somatic mutations
+
+exportOrNo <- FALSE # Do you want to export the plots (useful to put FALSE when just want to load everything)
+
+# packages ----------------------------------------------------------------
+
+library(here)
+library(tidyr)
+library(dplyr)
+library(ggplot2)
+library(openxlsx)
+library(ggbeeswarm)
+library(ggpubr)
+
+
+
+# small functions ---------------------------------------------------------
+
+# function to pool columns from a dataframe into one column
+  # ! assumes that each row, there is only one element from the columns to pool which is not NA
+    # eg. at row 5, the columns we want to pool are NA, 9, NA >> gives 9
+    # but if 2, 9, NA >> Error! (as more than one element)
+
+# usage example; poolColumns(df, c('col2', 'col5'))
+poolColumns <- function(dataframe, vectorOfColNames) {
+  apply(dataframe, 1, function(row) {
+    tmp <- as.vector(unlist(c(row[vectorOfColNames]))) # concatenate the columns we need to pool at this row
+    # check there is only one not NA
+    if (sum(!is.na(tmp)) != 1) stop ('\t \t \t \t >>> Error: One or more rows has more than one element in the columns to be pooled \n')
+    # if ok, remove the NA and we are left with a single element
+    return(tmp[!is.na(tmp)]) # return for within the apply
+  })
+}
+
+###
+
+# counts & frequencies within bins of a continuous variable (each bin = one range of values) per group
+# e.g. data is number of mismatches for each read
+# and 2 groups: reference / mutated
+# for each group; how many reads in bin 0--10 mismatches; how many reads in bin 11-20 etc.
+# both in counts and frequencies (% of total within each group)
+# Note; bins represent the right boundary, e.g. a case where we have proportions, 0.1 means the bin that contains the data from 0 to 0.1
+# same in the dataframe it returns
+# example usage/ freqInBins(df = mydata, datcol = 'mismatchpercentage', grpcol = 'treated', bins=seq(0, 1, by=0.1))
+# by default grpcol = NA, meaning it does do it by any group, just takes all the data in column datcol
+freqInBins <- function(df, datcol, grpcol=NA, bins) {
+  
+  ###
+  # if no grouping
+  if (is.na(grpcol)) {
+    dat <- as.numeric(df[, datcol]) # data we need to bin
+    cos <- hist(dat, breaks=bins, include.lowest=TRUE, plot=FALSE)$counts # counts
+    fre <- cos / sum(cos) # frequencies
+    gdf <- data.frame(bin=bins[-1], counts=cos, freq=fre) # small dataframe for this group, two columns counts and frequencies
+    
+    return(gdf)
+    
+  }
+  
+  ###
+  # if grouping
+  
+  else {
+    
+    ngrps <- length(unique(df[,grpcol])) # number of groups
+    grpnms <- unique(df[,grpcol]) # group names
+    
+    cfpg <- vector(mode='list', length=ngrps) # preallocate list counts + frequencies per group
+    
+    for(i in 1:length(grpnms)) { # loop through groups
+      dat <- as.numeric(df[ which(df[,grpcol] == grpnms[i]) , datcol]) # data for this group (the data we need to bin)
+      cos <- hist(dat, breaks=bins, include.lowest=TRUE, plot=FALSE)$counts # counts
+      fre <- cos / sum(cos) # frequencies
+      
+      gdf <- data.frame(grp=grpnms[i], bin=bins[-1], counts=cos, freq=fre) # small dataframe for this group, two columns counts and frequencies
+      
+      # group column should be called the same as grpcol
+      colnames(gdf)[1] <- grpcol
+      
+      # add it to the list
+      cfpg[[i]] <- gdf
+    }
+    
+    # stick the list together and return it
+    fbing <- as.data.frame(rbindlist(cfpg)) # frequencies in each bin by group
+    return(fbing)
+  }
+  
+}
+
+
+# plot colours ------------------------------------------------------------
+
+# somatic call TRUE/FALSE
+somcols <- c('#f1876b', '#B3BDC4')
+
+# strand
+  # will use same colours as IGV
+  # forward = pink; reverse = blue
+strcols <- c('#ebb0af', '#aeafd8') # ! Forward, Reverse
+
+# haplotypes
+hapcols <- c('#f1876b', '#4D4D4D', '#B3BDC4') # ! 1, 2, 0
+
+# cohorts
+  # inherited = yellow
+  # sCJD = khaki
+  # control = grey
+cohcols <- c('#fcb505', '#78ac63', '#B3BDC4')
+
+
+
+
+# import ------------------------------------------------------------------
+
+# all the reads
+ied <- read.csv(here('needleSam', 'allOPRreads.csv'))
+
+# sample metadata
+meta <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='samples')
+
+# format meta as in needleSam.R
+  # - >> NA
+meta$SV[which(meta$SV=='-')] <- NA
+
+  # special case: PDG 46345 is 5 OPRI/1 OPRD
+    # make a new column SV2 for it (just for the 1 OPRD)
+meta <- meta %>%
+  mutate(SV2=NA, .after=SV)
+meta[which(meta$sample==46345), 'SV2'] <- unlist(strsplit(meta[which(meta$sample==46345), 'SV'], '/'))[2]
+meta[which(meta$sample==46345), 'SV'] <- unlist(strsplit(meta[which(meta$sample==46345), 'SV'], '/'))[1]
+
+  # 2 OPRD >> OPRD2, etc.
+    # first lapply takes what is after the space = OPRD or OPRI
+    # second lapply takes what is before the space = 2, 1, ...
+meta$SV <- paste0(lapply(strsplit(meta$SV, ' '), function(i) {i[2]}),
+                  lapply(strsplit(meta$SV, ' '), function(i) {i[1]}))
+    # the NA above become 'NANA', put them back to NA
+meta$SV[which(meta$SV=='NANA')] <- NA
+
+  # same for SV2
+meta$SV2 <- paste0(lapply(strsplit(meta$SV2, ' '), function(i) {i[2]}),
+                   lapply(strsplit(meta$SV2, ' '), function(i) {i[1]}))
+meta$SV2[which(meta$SV2=='NANA')] <- NA
+
+  # sample: 52331 >> pdg52331, etc.
+meta$sample <- sprintf('pdg%s', meta$sample)
+
+
+# haplotype phasing metadata
+  # eg. how many SNVs available for haplotagging
+hpmeta <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='haplotypephasing')
+    # sample: 52331 >> pdg52331, etc.
+hpmeta$sample <- sprintf('pdg%s', hpmeta$sample)
+
+  # from this:
+    # pdgs of samples that we could haplotag
+pdgs_tag <- as.character(unlist(subset(hpmeta, haplotagged, sample)))
+    # pdgs of samples we could not haplotag
+pdgs_notag <- as.character(unlist(subset(hpmeta, !haplotagged, sample)))
+
+# sequencing results summary
+seqmeta <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='sequencing_summary')
+# sample: 52331 >> pdg52331, etc.
+seqmeta$sample <- sprintf('pdg%s', seqmeta$sample)
+
+# we will also need all pdgs
+pdgs <- meta$sample
+# do not take the ones we do not have loaded (only applies if running script on subset of samples)
+pdgs <- intersect(unique(ied$sample), pdgs)
+
+
+
+
+# add sample info to ied for simplicity -----------------------------------
+
+ied <- left_join(ied, meta, by='sample')
+
+ied <- hpmeta %>%
+  select('sample', 'haplotagged', 'genebody_heterozygousSNVs') %>%
+  left_join(ied, ., by='sample')
+
+ied <- seqmeta %>%
+  select('sample', 'genebody_coverage', 'promoter_coverage') %>%
+  left_join(ied, ., by='sample')
+
+
+
+
+# Possible somatic mutations: approach 1 ----------------------------------
+
+# there is no read that has both an OPRD and a OPRI that pass mismatch threshold, which makes things easier
+subset(ied,
+       (opri_pass & oprd_pass))
+
+# interesting reads are:
+  # a- there is a mutation (not reference)
+    # ! except for PDG 46345 (compound homozygous 1 OPRD / 5 OPRI), for this sample 'reference' may be a somatic mutation
+  # b- Sanger SV is NA (eg. sCJD cases) or is different than the mutation in the read (not interesting if matches the Sanger genotype)
+  # c- One of OPRI or OPRD is below mismatch threshold
+
+# in practice:
+  # 1- all reads that match what we know from Sanger (so uninteresting here) have a true_nummis number, so can take only true_nummis = NA
+  # (true_nummis = number of mismatches vs reference, when it matches what is expected)
+    # by doing so, no need to worry about PDG 46345 (compound homozygous 1 OPRD / 5 OPRI), true_nummis is present for both OPRD1 or OPRI5 reads
+      # (and is NA if read says reference, as it should)
+  # 2- ! true_nummis = NA also includes unassigned reads, so need to exclude those
+    # (! remember to explicitly include othercategory = NA otherwise != excludes NA)
+  # 3- make use of mismatch threshold, opri_pass or oprd_pass
+
+iedSom1 <- subset(ied,
+                 is.na(true_nummis) & # 1- 
+                   (is.na(othercategory) | othercategory!='unassigned') & # 2-
+                   (opri_pass | oprd_pass | ref_pass)) # 3-, one of opri_pass, oprd_pass, ref_pass is TRUE
+
+
+# Possible somatic mutations: approach 2 ----------------------------------
+# Another approach...
+  # allows to confirm the above if we reach the same conclusion with two different logics
+
+# tag the 'expected' reads, i.e. those that do not look like a somatic mutation
+ied$expected <- NA # preallocate all as NA
+
+  # all reads whose OPRI matches their sample's SV are 'expected'
+ied[which( ied$opri == ied$SV ), 'expected'] <- TRUE
+
+  # all reads whose OPRI matches their sample's SV2 are 'expected' (only applies to PDG 46345)
+ied[which( ied$opri == ied$SV2 ), 'expected'] <- TRUE
+
+  # all reads whose OPRD matches their sample's SV are 'expected'
+ied[which( ied$oprd == ied$SV ), 'expected'] <- TRUE
+
+  # all reads whose OPRD matches their sample's SV2 are 'expected' (only applies to PDG 46345)
+ied[which( ied$oprd == ied$SV2 ), 'expected'] <- TRUE
+
+  # all reads that are reference are 'expected' (! except for PDG 46345, read above) 
+ied[which( ied$sample != 'pdg46345' & ied$othercategory == 'reference' ), 'expected'] <- TRUE
+
+# >>> at this stage, expected should tag the same rows as trueread
+identical(subset(ied, trueread, read), subset(ied, expected, read))
+
+  # lots of reads are unassigned to an OPR category
+    # >> will tag them as expected for simplicity; this will be the difference vs trueread
+ied[which( ied$othercategory == 'unassigned' ), 'expected'] <- TRUE
+
+# great majority of reads should be 'expected'
+cat('\t \t \t \t >>> ', sum(ied$expected, na.rm=TRUE), 'out of', nrow(ied), 'considered expected \n')
+cat('\t \t \t \t \t i.e.', round((sum(ied$expected, na.rm=TRUE) / nrow(ied))*100, 1), '% \n')
+
+  # tag the rest as 'not expected' (FALSE), i.e. potentially somatic mutation
+ied[which(is.na(ied$expected)), 'expected'] <- FALSE
+
+  # check there is no NA left
+sum(is.na(ied$expected))
+
+# possible somatic = 'not expected', and pass mismatch threshold
+iedSom <- subset(ied,
+                 !expected &
+                   (opri_pass | oprd_pass | ref_pass)) # one of opri_pass, oprd_pass, ref_pass is TRUE
+
+
+# >>> does it give the same reads as first logic?
+identical(iedSom$read , iedSom1$read)
+# yes, same result
+# will only keep iedSom
+rm(iedSom1)
+
+# will be useful for later;
+  # add column 'isSomatic' (TRUE or FALSE) to ied, i.e. this read is a somatic mutation call
+ied$isSomatic <- FALSE # preallocate as FALSE (most of them)
+ied[match(iedSom$read, ied$read), 'isSomatic'] <- TRUE # for the reads that are in iedSom, replace by TRUE
+# put isSomatic in order I want
+ied$isSomatic <- factor(ied$isSomatic, levels=c('TRUE', 'FALSE'))
+
+
+
+
+
+# Illustrate mismatch threshold -------------------------------------------
+
+# in read length vs number of mismatches;
+  # plot all 'potentially somatic mutations' reads and colour the ones that pass the mismatch threshold
+    # the reads that have both OPRI1 and OPRD1 make this annoyingly complicated
+    # for those, I think will simply plot the minimum mismatch of OPRI/OPRD (read length does not change)
+    # (to be honest it does not really matter as all of these reads are above the mismatch threshold anyways)
+
+# all somatic mutations candidates
+  # (i.e. iedSom from above + all not expected but which does not pass the mismatch threshold)
+iedsomall <- subset(ied,
+                    !expected)
+
+iedsomall$conmis <- NA # number of mismatches vs consensus
+
+# for reference reads, fill in conmis with reference_nummis
+iedsomall[which(!is.na(iedsomall$reference_nummis)) , 'conmis'] <- iedsomall[which(!is.na(iedsomall$reference_nummis)) , 'reference_nummis']
+
+# for OPRI reads, fill in conmis with opri_nummis
+iedsomall[which(!is.na(iedsomall$opri_nummis)) , 'conmis'] <- iedsomall[which(!is.na(iedsomall$opri_nummis)) , 'opri_nummis']
+
+# for OPRD reads, fill in conmis with oprd_nummis
+iedsomall[which(!is.na(iedsomall$oprd_nummis)) , 'conmis'] <- iedsomall[which(!is.na(iedsomall$oprd_nummis)) , 'oprd_nummis']
+
+# check no more NA
+sum(is.na(iedsomall$conmis))
+
+# ! now, any reads that has both OPRD & OPRI got the number of mismatches vs OPRD consensus (as OPRD was last)
+# for these reads, go back and take the minimum number of mismatches between OPRD and OPRI (the best match)
+rows2edit <- which( !is.na(iedsomall$opri) & !is.na(iedsomall$oprd) )
+for (r in rows2edit) {
+  iedsomall[r,'conmis'] <- min(c(iedsomall[r,'opri_nummis'] , iedsomall[r,'oprd_nummis']))
+}
+# >>> we have number of mismatches vs consensus for all candidate somatic mutations 
+
+# mismatch threshold
+  # mean of % mismatch of all true reads (those that match what we know is there) + sd
+thrMis <- mean(ied$true_misbp, na.rm=TRUE) + sd(ied$true_misbp, na.rm=TRUE) # as calculated in needleSam.R
+
+# more intuitively, the threshold excludes the
+1 - (nrow(subset(ied, trueread & true_misbp < thrMis)) / nrow(subset(ied, trueread))) # least accurate reads
+
+
+# on plot, will add markers for reference OPR lengths (eg. reference 123 bp, 1 OPRI = 123 + 24 bp, etc.)
+ref <- 9 * 1 * 3 + 8 * 4 * 3 # reference = 1 nonapeptide + 4 octapeptide
+# OPR I / D are simply +/- full Rs, so +/- multiples of 8 * 3 = 24 bp
+oct <- 8 * 3
+
+oprlengths <- c(ref-4*oct,
+                ref-3*oct,
+                ref-2*oct,
+                ref-1*oct,
+                #ref,
+                ref+1*oct,
+                ref+2*oct,
+                ref+3*oct,
+                ref+4*oct,
+                ref+5*oct,
+                ref+6*oct,
+                ref+7*oct,
+                ref+8*oct,
+                ref+9*oct,
+                ref+10*oct,
+                ref+11*oct,
+                ref+12*oct,
+                ref+13*oct)
+
+somMisThr <- ggplot(iedsomall, aes(x=read_length, y=conmis, colour=isSomatic)) +
+  geom_vline(xintercept=ref, linetype=2, colour='#b0b0b0', size=0.8) +
+  geom_vline(xintercept=oprlengths, linetype=2, colour='#ebebeb', size=0.8) +
+  geom_segment(x=0, xend=450,
+               y=0, yend=450*thrMis,
+               size=1.0, linetype=2, colour='#595E60') +
+  geom_point(size=0.8) +
+  scale_colour_manual(values=somcols) +
+  coord_cartesian(xlim=c(30,377), ylim=c(0, 100.1)) + # when cropping to 450 bp (see below), no read above 150 mismatches
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.major.x=element_blank(),
+    panel.grid.minor.x=element_blank(),
+    axis.title.x=element_text(size=9, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.title.y=element_text(size=9, margin = margin(t=0, r=-2, b=0, l=0)),
+    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+  ) +
+  xlab('read length') + ylab('mismatches vs template')
+somMisThr
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'somMisThr.pdf'), somMisThr, width=77, height=72, units='mm')
+}
+
+# Note; the few reads above 450 bp are crazy repetitions eg. GCGCGCGCGCGC, I think fair to crop the plot at 450 bp
+# (and not somatic calls as they have terrible % mismatch)
+
+# Note; now lowered max X to 377, it is excluding 1 read extra
+
+
+
+
+
+# Do somatic calls have worse % mismatch? ---------------------------------
+# or; are somatic reads less accurate than 'true' reads?
+
+# calculate mismatch per bp for all somatic candidates
+iedsomall$som_misbp <- iedsomall$conmis / iedsomall$read_length
+
+# mismatch per bp in iedSom currently in three columns (ref / opri / oprd), need to pool them in one
+iedSom$som_misbp <- as.numeric(unlist(poolColumns(iedSom, c('ref_misbp', 'opri_misbp', 'oprd_misbp'))))
+
+# check reads that reads which are both in iedsomall and iedSom have the same misbp
+round(iedsomall[match(iedSom$read, iedsomall$read) , 'som_misbp'], 2) == round(iedSom$som_misbp, 2)
+# (need to round or it says FALSE for numbers which are clearly equal, numbers are just stored with different levels of precision)
+
+
+# binning for mismatch (will use for histograms below)
+misbins <- seq(0, 1, by=0.01) # bins of 1% mismatch
+
+###
+###
+# first do all somatic reads (i.e. before mismatch threshold) vs trueread
+
+# all somatic reads before mismatch filtering are in iedsomall
+# counts / frequencies in bins
+som <- freqInBins(df=iedsomall, datcol='som_misbp', grpcol=NA, bins=misbins)
+# add column source
+som$source <- 'somatic'
+
+# same for all truereads
+tru <- freqInBins(df=subset(ied, trueread), datcol='true_misbp', grpcol=NA, bins=misbins)
+# add column source
+tru$source <- 'trueread'
+
+# stick the two together
+somtru <- rbind(som, tru)
+
+# plot
+misTruevsSomAll <- ggplot(somtru, aes(x=bin, y=freq, fill=source)) +
+  geom_col(position='identity', alpha=0.5) +
+  scale_fill_manual(values=somcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+  ) +
+  coord_cartesian(xlim=c(0, 1)) +
+  xlab('mismatch per bp') + ylab('frequency')
+misTruevsSomAll
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'misTruevsSomAll.pdf'), misTruevsSomAll, width=120, height=100, units='mm')
+}
+
+###
+###
+# now only selected somatic reads (i.e. after mismatch threshold) vs trueread
+
+# ! need to also crop trueread by the same mismatch threshold if we want to compare frequencies
+trure <- subset(ied, trueread & true_misbp < thrMis) # true reads below mismatch threshold
+tru <- freqInBins(df=trure, datcol='true_misbp', grpcol=NA, bins=misbins)
+# add column source
+tru$source <- 'trueread'
+
+# for somatic, use iedSom (i.e. iedsomall + mismatch threshold)
+som <- freqInBins(df=iedSom, datcol='som_misbp', grpcol=NA, bins=misbins)
+# add column source
+som$source <- 'somatic'
+
+# stick the two together
+somtru <- rbind(som, tru)
+
+# plot
+misTruevsSomThr <- ggplot(somtru, aes(x=bin, y=freq, fill=source)) +
+  geom_col(position='identity', alpha=0.5) +
+  scale_fill_manual(values=somcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+  ) +
+  coord_cartesian(xlim=c(0, thrMis+0.3*thrMis)) +
+  xlab('mismatch per bp') + ylab('frequency')
+misTruevsSomThr
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'misTruevsSomThr.pdf'), misTruevsSomThr, width=120, height=100, units='mm')
+}
+
+
+###
+# in this comparison, do somatic reads have significantly worse mismatch per bp?
+# compare with a t-test
+
+# gather the reads we plotted above
+trure <- select(trure, true_misbp)
+colnames(trure) <- 'misbp'
+trure$source <- 'trueread'
+
+somre <- select(iedSom, som_misbp)
+colnames(somre) <- 'misbp'
+somre$source <- 'somatic'
+
+misre <- rbind(trure, somre) # mismatches for reads to compare
+
+wilcox.test(misre$misbp ~ misre$source)
+t.test(misre$misbp ~ misre$source)
+
+###
+###
+
+# within truereads, are % mismatch different reference vs mutated?
+  # if yes -- above is not a fair comparison as somatic reads are almost all mutated
+  # (only PDG 46345 can have somatic calls that are reference)
+
+# take all the 'true' reads + their 'othercategory' column
+  # within truereads, in 'othercategory' column, should only be reference or NA
+  # can assume all NA are OPRI or OPRD in the other columns
+
+trumut <- subset(ied, trueread & is.na(othercategory)) # true reads mutated
+truref <- subset(ied, trueread & othercategory=='reference') # true reads reference
+
+trumutb <- freqInBins(df=trumut, datcol='true_misbp', grpcol=NA, bins=misbins)
+# add column source
+trumutb$source <- 'trueread_mutated'
+
+trurefb <- freqInBins(df=truref, datcol='true_misbp', grpcol=NA, bins=misbins)
+# add column source
+trurefb$source <- 'trueread_reference'
+
+# stick the two together
+somtru <- rbind(trurefb, trumutb)
+
+# plot
+misTrueRefvsTrueMut <- ggplot(somtru, aes(x=bin, y=freq, fill=source)) +
+  geom_col(position='identity', alpha=0.5) +
+  scale_fill_manual(values=somcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+  ) +
+  coord_cartesian(xlim=c(0, 0.5)) +
+  xlab('mismatch per bp') + ylab('frequency')
+misTrueRefvsTrueMut
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'misTrueRefvsTrueMut.pdf'), misTrueRefvsTrueMut, width=120, height=100, units='mm')
+}
+
+
+# compare statistically
+
+# gather the reads we plotted
+trumut <- select(trumut, true_misbp)
+trumut$source <- 'trueread_mutated'
+
+truref <- select(truref, true_misbp)
+truref$source <- 'trueread_reference'
+
+trumis <- rbind(trumut, truref)
+
+wilcox.test(trumis$true_misbp ~ trumis$source)
+t.test(trumis$true_misbp ~ trumis$source)
+# yes - it is different
+
+###
+###
+# consequence: comparing somatic reads vs all true reads is unfair
+  # as most true reads are reference, but most somatic reads are mutated
+# to tell if somatic reads really have more mismatches, need to compare with mutated truereads
+
+# take the mutated truereads as above + apply mismatch threshold
+trumut <- subset(ied, trueread & is.na(othercategory) & true_misbp < thrMis) # true reads mutated
+trumutb <- freqInBins(df=trumut, datcol='true_misbp', grpcol=NA, bins=misbins)
+# add column source
+trumutb$source <- 'trueread_mutated'
+
+# take the somatic reads as above
+somb <- freqInBins(df=iedSom, datcol='som_misbp', grpcol=NA, bins=misbins)
+# add column source
+somb$source <- 'somatic'
+
+# stick the two together
+somtru <- rbind(trumutb, somb)
+
+# plot
+misTrueMutvsSomThr <- ggplot(somtru, aes(x=bin, y=freq, fill=source)) +
+  geom_col(position='identity', alpha=0.5) +
+  scale_fill_manual(values=somcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+  ) +
+  coord_cartesian(xlim=c(0, thrMis+0.3*thrMis)) +
+  xlab('mismatch per bp') + ylab('frequency')
+misTrueMutvsSomThr
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'misTrueMutvsSomThr.pdf'), misTrueMutvsSomThr, width=120, height=100, units='mm')
+}
+
+
+# compare statistically
+
+# gather the reads we plotted
+trumut <- select(trumut, true_misbp)
+colnames(trumut) <- 'misbp'
+trumut$source <- 'trueread_mutated'
+
+som <- select(iedSom, som_misbp)
+colnames(som) <- 'misbp'
+som$source <- 'somatic'
+
+mis <- rbind(trumut, som)
+
+wilcox.test(mis$misbp ~ mis$source)
+t.test(mis$misbp ~ mis$source)
+# yes - it is different
+
+# it did reduce the difference but excess around 5% mismatch still present
+# difference in means now is 0.5%
+
+
+###
+###
+
+# Can it be a coincidence?
+# as we have thousands of reference reads but 'only' ~ 100--150 somatic reads
+
+# draw from mutated truereads the same number of somatic reads multiple times and test with Mann-Whitney U Test
+ndraws <- 1000
+mwu <- c() # Mann-Whitney U Test p-values
+
+for (i in 1:ndraws) {
+  
+  trudraw <- trumut[sample(nrow(trumut), nrow(iedSom)),]
+  mis <- rbind(trudraw, som)
+  
+  pval <- wilcox.test(mis$misbp ~ mis$source)$p.value
+  mwu <- c(mwu, pval)
+  
+}
+
+length(which(mwu>0.05)) / ndraws
+# it is not significant ~ 18% of the time
+
+###
+###
+# In summary; do somatic calls have worse % mismatch?
+# in our dataset, the answer is yes; precisely there seems to be a surplus of reads in the 5% mismatch range
+# however, it is possible that we were simply 'unlucky' with the batch of somatic reads we got
+# indeed, comparing our batch of somatic reads against subsets of mutated truereads only give a significant difference 82% of the time
+
+# we could be more stringent with the threshold mismatch (lower it to mean for example) and this difference would probably go away
+# however, it is important to note 5% mismatch is not surprising for reads that are correct ('trueread'), so would likely exclude some reads that are ok
+# and the consequence would be to have low number of somatic calls, which would make all the comparisons between samples below difficult/impossible
+# there is also the chance that it represents something real that is not allowed by the OPR consensus sequences
+# I will keep the mismatch threshold as is
+
+# Update: I think I found where the excess of ~ 5% mismatch reads come from
+# it might be mostly reads where the R1 is affected
+# to my knowledge this has never been reported in patients, but I do not know why strand slippage would not affect R1
+# (I guess it could only be template strand slippage?)
+# from http://www.mad-cow.org/prion_repeat_insertions.html#References; should now allow extra R1 or R4
+# this holds in the reads I have inspected closely
+# in contrast, I found at least one clean R1 deletion
+
+
+
+
+# Do somatic calls have worse haplotype scores? ---------------------------
+
+# haplotype scores is my rough attempt at putting a score for confidence of assignment to a haplotype
+# see needleSam.R for comments
+
+# if somatic calls all have bad haplotype assignment scores, would be worrying
+
+# bin the hap scores for somatic calls
+bns <- seq(0, 1, by=0.1)
+hscns <- hist(as.numeric(unlist(subset(ied, isSomatic==TRUE, hapscore))), # haplotype score counts
+              breaks=bns, include.lowest=TRUE, plot=FALSE)$counts
+somhs <- data.frame(source='somatic', binend=bns[-1], counts=hscns, freq=hscns/sum(hscns)) # d for dataframe 
+
+# bin the hap scores for trueread
+hscns <- hist(as.numeric(unlist(subset(ied, trueread, hapscore))), # haplotype score counts
+              breaks=bns, include.lowest=TRUE, plot=FALSE)$counts
+truhs <- data.frame(source='trueread', binend=bns[-1], counts=hscns, freq=hscns/sum(hscns)) # d for dataframe 
+
+# stick them together
+hsf <- rbind(somhs, truhs) # haplotype score frequencies
+
+
+# plot haplotype quality score vs somatic call or not
+hapScoreSomatic <- ggplot(hsf, aes(x=binend, y=freq, fill=source)) +
+  geom_col(position='identity', alpha=0.4) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    axis.text.y=element_blank()
+  ) +
+  scale_fill_manual(values=somcols) +
+  xlab('haplotype assignment score') + ylab('frequency')
+hapScoreSomatic
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'hapScoreSomatic.pdf'), plot=hapScoreSomatic, width=100, height=100, units='mm')
+}
+
+
+# for test; select the reads we are plotting
+iedhs <- ied %>% # ied to compare haplotype scores
+  subset(isSomatic==TRUE | trueread)
+t.test(iedhs$hapscore ~ iedhs$isSomatic)
+# somatic calls do not have worse haplotype assignment scores
+
+
+
+
+# Any evidence of strand bias? --------------------------------------------
+
+# make sure it is in same order in ied & iedSom
+ied$strand <- factor(ied$strand, levels=c('+', '-'))
+iedSom$strand <- factor(iedSom$strand, levels=c('+', '-'))
+
+# number of calls by strand
+cbstr <- iedSom %>%
+  group_by(strand) %>%
+  tally(name='countStr')
+
+# add total number of Forward/Reverse reads
+cbstr <- ied %>%
+  group_by(strand) %>%
+  tally(name='totalStr') %>%
+  left_join(cbstr, ., by='strand')
+
+# ratio of Forward/Reverse of all reads serves as baseline
+cbstr <- cbstr %>%
+  mutate(ratioTot = totalStr / sum(totalStr))
+
+# add as percentage of Forward/Reverse reads
+cbstr$somPer <- (cbstr$countStr / cbstr$totalStr) * 100 # somatic percentage
+
+# ratio of Forward/Reverse reads * total number of somatic calls will give expected counts
+cbstr$expCount <- cbstr$ratioTot * sum(cbstr$countStr)
+
+# add a dummy category for x axis below
+cbstr$dummy <- factor(1)
+
+# annoyingly, need to reverse the factor levels
+cbstr$strand <- factor(cbstr$strand, levels=c('-', '+'))
+# ! reverse as well the colours
+strcols2 <- rev(strcols)
+
+# plot as horizontal bar with ratios
+  # essentially horizontal stacked barplot
+strandPro <- ggplot(cbstr, aes(x=dummy, y=countStr, fill=strand)) +
+  geom_col(position='stack') +
+  coord_flip() +
+  scale_fill_manual(values=strcols2) +
+  geom_hline(yintercept=cbstr$expCount[2], linetype=2, colour='#ebebeb', size=0.5) +
+  theme_minimal() +
+  theme(
+    axis.title.y=element_blank(),
+    axis.text.y=element_blank(),
+    axis.title.x=element_blank(),
+    axis.text.x=element_blank(),
+    panel.grid=element_blank(),
+    legend.position='none'
+  )
+strandPro
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'strandProportion.pdf'), strandPro, width=60, height=15, units='mm')
+}
+
+
+# plot number of calls by strand
+
+# put back order
+cbstr$strand <- factor(cbstr$strand, levels=c('+', '-'))
+
+callsByStrand <- ggplot(cbstr, aes(x=strand, y=countStr, fill=strand)) +
+  geom_col() +
+  scale_fill_manual(values=strcols) +
+  geom_segment(aes(x=0.5, xend=1.5, y=as.numeric(cbstr[1, 'expCount']), yend=as.numeric(cbstr[1, 'expCount'])), # add expected counts markers
+               linetype=2, colour='#ebebeb', size=2) +
+  geom_segment(aes(x=1.5, xend=2.5, y=as.numeric(cbstr[2, 'expCount']), yend=as.numeric(cbstr[2, 'expCount'])),
+               linetype=2, colour='#ebebeb', size=2) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.y=element_blank()
+  ) +
+  ylab('somatic mutation calls')
+callsByStrand
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsByStrand.pdf'), callsByStrand, width=75, height=100, units='mm')
+}
+
+
+# anything unexpected (would be sign of artefact) or within what we expect?
+# statistical approach: Chi2 test
+# build contingency table
+cont <- table(ied$isSomatic, ied$strand)
+  # should not do Chi2 test if any expected count below 5 (from http://www.sthda.com/english/wiki/chi-square-test-of-independence-in-r)
+  # in all the reads, there is some slight surplus of Reverse reads, but we pretty much expect 50/50 Forward/Reverse in the somatic calls
+  # we have 139 calls, so around 70/70 Forward/Reverse, so no expected count will be below 5
+# now Chi2 test on contingency table
+chi <- chisq.test(cont, correct=FALSE)
+chi
+chi$expected
+chi$observed
+
+
+# another approach I like to visualise this:
+  # draw reads at random & see how many Forward/Reverse we get
+    # ! better to do the draws properly sample by sample (if any sample has some strand bias)
+    # eg. PDG 58398 has 24 calls >> draw at random 24 reads from all of its reads
+
+# how many somatic calls per sample?
+cps <- iedSom %>% # calls per sample
+  group_by(sample) %>%
+  tally(name='nSom')
+
+# function to run the random draws and output the data ready to plot
+strandDraws <- function(nsim) {
+  
+  # preallocate dataframe
+  simres <- as.data.frame(matrix(nrow=nsim, ncol=3)) # simulation results;
+    # rows = simulations, columns = simulation # / number of forward draws / number of reverse draws
+  colnames(simres) <- c('sim', 'plus', 'minus')
+  simres$sim <- 1:nrow(simres)
+  
+  # for each simulation;
+  # for each sample, how many reads do we need to draw? (i.e. number of somatic calls for this sample)
+  # draw that many reads (! from that sample)
+  # pool all the draws together
+  # count how many Forward, Reverse we got
+  # add the results to the simres
+  
+  for (i in 1:nsim) {
+    
+    cat ('\t \t \t \t Simulation', i, 'out of', nsim, '\n')
+    
+    stradra <- c() # strand drawn (will pool all the draws there)
+    
+    for (p in 1:length(cps$sample)) { # for each sample (or row of cps)
+      spl <- as.character(cps[p, 'sample']) # sample we are talking about
+      ndraw <- as.integer(cps[p, 'nSom']) # how many reads do we need to draw?
+      drawfrom <- as.character(unlist(subset(ied, sample==spl, strand))) # ! only draw from that sample's reads
+      stradra <- c(stradra, sample ( drawfrom , ndraw)) # draw that many reads from that sample's reads & add them to hapdra
+    }
+    
+    # length of stradra now should be = total number of somatic calls
+    if (length(stradra) != nrow(iedSom)) stop ('\t \t \t \t >>> Error: something wrong with the draws \n')
+    
+    # count how many of each strand we got, and add that to our simulation results
+    simres[i, 'plus'] <- sum(stradra == '+') # how many forward strand?
+    simres[i, 'minus'] <- sum(stradra == '-') # how many reverse strand?
+  }
+  
+  return(simres)
+}
+
+nsim <- 100 # ideally 1000, but takes a couple of minutes
+stradraws <- strandDraws(nsim=nsim)
+
+# real count of somatic calls from Forward strand
+nplus <- nrow(subset(iedSom, strand=='+'))
+
+# real count of somatic calls from Reverse strand
+nminus <- nrow(subset(iedSom, strand=='-'))
+
+# plot for each strand
+plusDraws <- ggplot(stradraws, aes(plus)) +
+  geom_histogram(binwidth=1) +
+  geom_vline(xintercept=nplus, linetype=2) +
+  theme_minimal() +
+  theme(
+    axis.text.y=element_blank()
+  ) +
+  coord_cartesian(xlim=c(25, 100), ylim=c(0, 80)) +
+  ylab('frequency') + xlab('somatic calls from forward strand')
+plusDraws
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'plusDraws.pdf'), plusDraws, width=100, height=100, units='mm')
+}
+
+
+# plot for each strand
+minusDraws <- ggplot(stradraws, aes(minus)) +
+  geom_histogram(binwidth=1) +
+  geom_vline(xintercept=nplus, linetype=2) +
+  theme_minimal() +
+  theme(
+    axis.text.y=element_blank()
+  ) +
+  coord_cartesian(xlim=c(25, 100), ylim=c(0, 80)) +
+  ylab('frequency') + xlab('somatic calls from reverse strand')
+minusDraws
+ggsave(filename=here('needleSam', 'minusDraws.pdf'), plusDraws, width=100, height=100, units='mm')
+
+
+
+
+# Somatic mutations calls -- from which cohort? ---------------------------
+
+# make sure it is in this order in ied / iedSom
+ied$prion_disease <- factor(ied$prion_disease, levels=c('inherited', 'sCJD', 'control'))
+iedSom$prion_disease <- factor(iedSom$prion_disease, levels=c('inherited', 'sCJD', 'control'))
+
+# v1: removing calls from brain samples here in order to keep one sample per individual
+# but does not match number in text (129 calls) if do this
+
+# calls by cohort
+cbyco <- iedSom %>%
+  group_by(prion_disease) %>%
+  tally(name='nSom')
+
+# first observation: majority from Inherited cases
+nrow(subset(iedSom, prion_disease=='inherited')) / nrow(iedSom) # all samples
+
+cat('\t \t \t \t >>> ', nrow(subset(iedSom, prion_disease=='inherited')), '/', nrow(iedSom), 'calls = ',
+    round(nrow(subset(iedSom, prion_disease=='inherited')) / nrow(iedSom) * 100, 2),
+    '% from Inherited \n')
+
+# add total number of reads from each cohort
+cbyco <- ied %>%
+  group_by(prion_disease) %>%
+  tally(name='totalCoh') %>%
+  left_join(cbyco, ., by='prion_disease')
+
+# somatic calls as percentage of reads from that cohort
+cbyco$somPer <- (cbyco$nSom / cbyco$totalCoh) * 100 # somatic percentage
+
+# add ratio each cohort of total number of reads
+cbyco <- cbyco %>%
+  mutate(proCoh=totalCoh/sum(totalCoh))
+
+# ratio each cohort * total number of somatic calls will give expected counts
+cbyco$expCount <- cbyco$proCoh * sum(cbyco$nSom)
+
+# can also add expected percentage
+# Note, this is same for all cohorts (as expected counts basically assumes always same percentage)
+cbyco$expPer <- (cbyco$expCount / cbyco$totalCoh) * 100
+
+# plot histogram of counts
+callsbyCoh <- ggplot(cbyco, aes(x=prion_disease, y=nSom, fill=prion_disease)) +
+  geom_col(width=0.7) +
+  # add expected counts markers
+  # geom_segment(aes(x=0.5, xend=1.5, y=as.numeric(cbyco[1, 'expCount']), yend=as.numeric(cbyco[1, 'expCount'])), linetype=2, colour='#ebebeb', size=2) +
+  # geom_segment(aes(x=1.5, xend=2.5, y=as.numeric(cbyco[2, 'expCount']), yend=as.numeric(cbyco[2, 'expCount'])), linetype=2, colour='#ebebeb', size=2) +
+  # geom_segment(aes(x=2.5, xend=3.5, y=as.numeric(cbyco[3, 'expCount']), yend=as.numeric(cbyco[3, 'expCount'])), linetype=2, colour='#ebebeb', size=2) +
+  scale_fill_manual(values=cohcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    axis.title.x=element_blank(), 
+    axis.text.x=element_text(size=9, margin = margin(t=0, r=0, b=0, l=0), angle=90),
+    axis.title.y=element_text(size=9, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+  ) +
+  scale_x_discrete(labels=c('inherited', 'sporadic', 'control')) +
+  ylab('somatic mutation calls')
+callsbyCoh
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsbyCohort.pdf'), callsbyCoh, width=40, height=75, units='mm')
+}
+
+
+
+# same but percentages
+callsbyCohPer <- ggplot(cbyco, aes(x=prion_disease, y=somPer, fill=prion_disease)) +
+  geom_col() +
+  # add expected percentage
+  geom_hline(yintercept=mean(cbyco$expPer), linetype=2, size=2, colour='#ebebeb') +
+  scale_fill_manual(values=cohcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    axis.title.x=element_blank(),
+  ) +
+  scale_x_discrete(labels=c('inherited', 'sporadic', 'control')) +
+  ylab('somatic mutation calls (% of reads)')
+callsbyCohPer
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsbyCohortPercentage.pdf'), callsbyCohPer, width=100, height=100, units='mm')
+}
+
+
+# Is there really a surplus of Inherited calls?
+# Chi2 test
+  # build contingency table
+# v1: removing brain samples from ied
+# v2: keeping all samples
+cont <- table(ied$prion_disease, ied$isSomatic)
+chi <- chisq.test(cont, correct=FALSE)
+chi
+# ! check no expected count below 5
+chi$expected
+chi$observed
+
+# alternative approach to visualise this
+  # by drawing reads at random and counting how many we have from each cohort
+
+# function to run the random draws and output the data ready to plot
+cohortDraws <- function(nsim) {
+  
+  # preallocate dataframe
+  simres <- as.data.frame(matrix(nrow=nsim, ncol=4)) # simulation results;
+  # rows = simulations, columns = simulation # / number of inherited draws / number of sCJD draws / number of control draws
+  colnames(simres) <- c('sim', 'inherited', 'sCJD', 'control')
+  simres$sim <- 1:nrow(simres)
+  
+  # for each simulation;
+  # for each sample, how many reads do we need to draw? (i.e. number of somatic calls for this sample)
+  # draw that many reads (! from that sample)
+  # pool all the draws together
+  # count how many inherited, sCJD, control
+  # add the results to the simres
+  
+  for (i in 1:nsim) {
+    
+    cat ('\t \t \t \t Simulation', i, 'out of', nsim, '\n')
+    cohdra <- sample(iedo$prion_disease, nrow(iedSomo)) # cohorts drawn
+    
+    # count how many of each cohort we got, and add that to our simulation results
+    simres[i, 'inherited'] <- sum(cohdra == 'inherited') # how many inherited?
+    simres[i, 'sCJD'] <- sum(cohdra == 'sCJD') # how many sCJD?
+    simres[i, 'control'] <- sum(cohdra == 'control') # how many sCJD?
+  }
+  return(simres)
+}
+
+nsim <- 1000
+cohdraws <- cohortDraws(nsim=nsim)
+
+# real count of somatic calls from inherited
+ninh <- nrow(subset(iedSom, prion_disease=='inherited'))
+
+# real count of somatic calls from sCJD
+nspo <- nrow(subset(iedSom, prion_disease=='sCJD'))
+
+# real count of somatic calls from control
+ncon <- nrow(subset(iedSom, prion_disease=='control'))
+
+# plot for inherited
+inhDraws <- ggplot(cohdraws, aes(inherited)) +
+  geom_histogram(binwidth=1) +
+  geom_vline(xintercept=ninh, linetype=2) +
+  theme_minimal() +
+  theme(
+    axis.text.y=element_blank()
+  ) +
+  coord_cartesian(xlim=c(0, 150), ylim=c(0, 100)) +
+  ylab('frequency') + xlab('somatic calls from inherited samples')
+inhDraws
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'inheritedDraws.pdf'), inhDraws, width=100, height=100, units='mm')
+}
+
+
+# plot for sporadic
+spoDraws <- ggplot(cohdraws, aes(sCJD)) +
+  geom_histogram(binwidth=1) +
+  geom_vline(xintercept=nspo, linetype=2) +
+  theme_minimal() +
+  theme(
+    axis.text.y=element_blank()
+  ) +
+  coord_cartesian(xlim=c(0, 150), ylim=c(0, 100)) +
+  ylab('frequency') + xlab('somatic calls from sporadic samples')
+spoDraws
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'sporadicDraws.pdf'), spoDraws, width=100, height=100, units='mm')
+}
+
+
+# plot for control
+conDraws <- ggplot(cohdraws, aes(control)) +
+  geom_histogram(binwidth=1) +
+  geom_vline(xintercept=ncon, linetype=2) +
+  theme_minimal() +
+  theme(
+    axis.text.y=element_blank()
+  ) +
+  coord_cartesian(xlim=c(0, 150), ylim=c(0, 100)) +
+  ylab('frequency') + xlab('somatic calls from control samples')
+conDraws
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'controlDraws.pdf'), conDraws, width=100, height=100, units='mm')
+}
+
+
+
+
+
+# Somatic mutations calls -- from which samples? --------------------------
+
+# will prepare a dataframe with lots of info for each sample
+  # first compute coverage (will use below)
+cabs <- ied %>% # number of CAlls By Sample
+  group_by(sample) %>%
+  tally(name='iedCov') # coverage = simply number of rows per sample in ied
+
+# add all the metadata info
+cabs <- left_join(cabs, meta, by='sample')
+
+# check one row per sample
+nrow(cabs) == length(unique(ied$sample))
+
+nsomps <- iedSom %>% # number of somatic mutation calls per sample
+  group_by(sample) %>%
+  tally(name='nSom')
+
+# join this to cabs
+cabs <- left_join(x=cabs, y=nsomps, by='sample')
+
+# if sample not in iedSom it means it had 0 calls
+  # it should be NA for column nSom in cabs, but check to be safe
+nocp <- unique(ied$sample) [ !unique(ied$sample) %in% unique(iedSom$sample) ] # no-call PDGs
+
+as.character(unlist(subset(cabs, is.na(nSom), sample))) %in% nocp
+nocp %in% as.character(unlist(subset(cabs, is.na(nSom), sample)))
+# ok
+
+# replace cabs column nSom NA in cabs by 0
+cabs[which(is.na(cabs$nSom)), 'nSom'] <- 0
+# check no more NA in cabs nSom
+sum(is.na(cabs$nSom))
+
+# compute % of reads somatic
+  # i.e. number of somatic calls / coverage
+cabs$somPro <- cabs$nSom / cabs$iedCov
+cabs$somPer <- cabs$somPro * 100
+
+# make sure levels of prion_disease are in order of cohcols above
+cabs$prion_disease <- factor(cabs$prion_disease, levels=c('inherited', 'sCJD', 'control'))
+
+# order from sample with largest number of somatic calls to lowest
+cabs <- cabs[order(-cabs$nSom),]
+cabs$sample <- factor(cabs$sample, levels=cabs$sample)
+
+# plot histogram sample vs number of somatic calls
+
+# temporarily replace 0 by a low number (0.1) so it plots the colour of the sample in plot
+cabs2 <- cabs
+cabs2$nSom <- as.numeric(cabs2$nSom)
+cabs2[which(cabs2$nSom==0) , 'nSom'] <- 0.1
+
+cabsple <- ggplot(cabs2, aes(x=sample, y=nSom, fill=prion_disease)) +
+  geom_col() +
+  scale_fill_manual(values=cohcols, labels=c('inherited', 'sporadic', 'control')) +
+  theme_minimal() +
+  theme(
+    axis.title.x=element_blank(),
+    axis.text.x=element_text(angle=90),
+    legend.title=element_blank()
+  ) +
+  ylab('somatic mutation calls')
+cabsple
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsbySample.pdf'), cabsple, width=200, height=150, units='mm')
+}
+
+
+
+# plot with percentages
+
+# order from sample with largest percentage of somatic calls to lowest
+cabs <- cabs[order(-cabs$somPer),]
+cabs$sample <- factor(cabs$sample, levels=cabs$sample)
+
+# as above, raise a little bit the samples with 0 somatic calls so can see them
+cabs2 <- cabs
+cabs2$nSom <- as.numeric(cabs2$nSom)
+cabs2[which(cabs2$somPer==0) , 'somPer'] <- 0.002
+
+spllabels <- paste0('#', substr(cabs2$sample, 4, 99))
+
+cabsple2 <- ggplot(cabs2, aes(x=sample, y=somPer, fill=prion_disease, alpha=gDNA_tissue)) +
+  geom_col(width=0.8) +
+  scale_alpha_discrete(range=c(1.0, 0.5)) +
+  scale_fill_manual(values=cohcols, labels=c('inherited', 'sporadic', 'control')) +
+  theme_minimal() +
+  theme(
+    axis.title.x=element_blank(),
+    legend.title=element_blank(),
+    legend.position='none',
+    axis.text.x=element_text(size=7, angle=90, margin = margin(t=0, r=0, b=0, l=0), hjust=0, vjust=0.5),
+    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
+    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+  ) +
+  ylab('somatic mutation calls (% of reads)') +
+  scale_x_discrete(labels=spllabels)
+cabsple2
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsbySamplePer.pdf'), cabsple2, width=80, height=70, units='mm')
+}
+
+
+# version with each sample's colour
+
+# samples are ranked by % somatic, so cannot change that
+# need to match the colour to each sample
+splcols <- meta$colour[match(cabs$sample, meta$sample)]
+
+cabsple3 <- ggplot(cabs2, aes(x=sample, y=somPer, fill=sample)) +
+  geom_col() +
+  scale_fill_manual(values=splcols) +
+  theme_minimal() +
+  theme(
+    axis.title.x=element_blank(),
+    axis.text.x=element_text(angle=90),
+    legend.title=element_blank(),
+    legend.position='none'
+  ) +
+  ylab('somatic mutation calls (% of reads)')
+cabsple3
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsbySamplePer_col.pdf'), cabsple3, width=200, height=150, units='mm')
+}
+
+
+# Number of calls vs coverage ---------------------------------------------
+# does number of calls for each sample correlate with coverage?
+  # would not make the calls wrong (it would be expected we pick up more stuff with higher coverage)
+  # but I think would mean we are not getting a full picture
+callsvscoverage <- ggplot(cabs, aes(x=iedCov, y=nSom, colour=prion_disease)) +
+  geom_point(size=1.5) +
+  scale_colour_manual(values=cohcols) +
+  theme_minimal() +
+  theme(
+    legend.title=element_blank()
+  ) +
+  theme(
+    legend.position='none',
+    #panel.grid.major.x=element_blank(),
+    #panel.grid.minor.x=element_blank(),
+    axis.title.x=element_text(size=9, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.title.y=element_text(size=9, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+  ) +
+  xlab('coverage') + ylab('somatic mutations calls') +
+  scale_x_continuous(breaks=c(0, 10000, 20000, 30000), labels=c('0', '10,000', '20,000', '30,000')) +
+  coord_cartesian(xlim=c(0, 30000))
+callsvscoverage
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvscoverage.pdf'), callsvscoverage, width=60, height=60, units='mm')
+}
+
+
+cor(x=cabs$iedCov, y=cabs$nSom, method='pearson')
+cor.test(x=cabs$iedCov, y=cabs$nSom, method='pearson')
+
+cor(x=cabs$iedCov, y=cabs$nSom, method='spearman')
+cor.test(x=cabs$iedCov, y=cabs$nSom, method='spearman')
+
+
+# useful to have a version with labels
+callsvscoverage2 <- ggplot(cabs, aes(x=iedCov, y=nSom, colour=prion_disease, label=sample)) +
+  geom_point(size=2) +
+  geom_text(size=1.8, hjust=1.0, vjust=2.0) +
+  scale_colour_manual(values=cohcols) +
+  theme_minimal() +
+  theme(
+    legend.title=element_blank()
+  ) +
+  xlab('coverage') + ylab('somatic mutations calls')
+callsvscoverage2
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvscoverage_labels.pdf'), callsvscoverage2, width=120, height=100, units='mm')
+}
+
+
+
+
+# Number of calls vs flowcell ---------------------------------------------
+  # anything weird about flowcell?
+callsvsflowcell <- ggplot(cabs, aes(x=flowcell, y=nSom, colour=prion_disease)) +
+  geom_quasirandom(width=0.1) +
+  scale_colour_manual(values=cohcols) +
+  scale_x_continuous(breaks=1:3) +
+  theme_minimal() +
+  theme(
+    legend.title=element_blank()
+  ) +
+  ylab('somatic mutation calls')
+callsvsflowcell
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsflowcell.pdf'), callsvsflowcell, width=80, height=100, units='mm')
+}
+
+
+# with percentages
+callsvsflowcellPer <- ggplot(cabs, aes(x=flowcell, y=somPer, colour=prion_disease)) +
+  geom_quasirandom(width=0.1) +
+  scale_colour_manual(values=cohcols) +
+  scale_x_continuous(breaks=1:3) +
+  theme_minimal() +
+  theme(
+    legend.title=element_blank()
+  ) +
+  ylab('somatic mutation calls')
+callsvsflowcellPer
+ggsave(filename=here('needleSam', 'callsvsflowcellPercentage.pdf'), callsvsflowcellPer, width=80, height=100, units='mm')
+
+
+
+# Number of calls vs tissue -----------------------------------------------
+# first version with all the samples
+
+callsvstissue <- ggplot(cabs, aes(x=gDNA_tissue, y=nSom, label=sample, colour=prion_disease)) +
+  geom_quasirandom(width=0.2) +
+  scale_colour_manual(values=cohcols, labels=c('inherited', 'sporadic', 'control')) +
+  theme_minimal() +
+  theme(
+    legend.title=element_blank(),
+    panel.grid.minor.x=element_blank(),
+  ) +
+  coord_cartesian(ylim=c(0,18)) +
+  xlab('') + ylab('somatic calls')
+callsvstissue
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvstissue.pdf'), callsvstissue, width=90, height=100, units='mm')
+}
+
+
+# with percentages
+callsvstissuePer <- ggplot(cabs, aes(x=gDNA_tissue, y=somPer, label=sample, colour=prion_disease)) +
+  geom_quasirandom(width=0.2) +
+  scale_colour_manual(values=cohcols, labels=c('inherited', 'sporadic', 'control')) +
+  theme_minimal() +
+  theme(
+    legend.title=element_blank(),
+    panel.grid.minor.x=element_blank(),
+  ) +
+  xlab('') + ylab('somatic calls')
+callsvstissuePer
+ggsave(filename=here('needleSam', 'callsvstissuePercentage.pdf'), callsvstissuePer, width=90, height=100, units='mm')
+
+# Chi2 test: is there a surplus/deficit of calls in brain or blood
+cont <- table(ied$isSomatic, ied$gDNA_tissue)
+chi <- chisq.test(cont, correct=FALSE)
+chi
+chi$observed
+chi$expected # none below 5
+# there are *more* calls from brain samples than we would have expected
+# or less calls from blood samples than we would have expected
+
+# this does not mean much
+# ! we will see below OPR length predicts well number of somatic calls
+# and brain samples are all long OPRs, so expect more somatic calls
+
+# to demonstrate this, removing samples which have OPRD or reference OPR before Chi2 would be predicted to cancel out the effect
+# = keep only OPRI samples
+iedI <- subset(ied, substr(SV, 1, 4) == 'OPRI')
+cont <- table(iedI$isSomatic, iedI$gDNA_tissue)
+chi <- chisq.test(cont, correct=FALSE)
+chi
+chi$observed
+chi$expected # none below 5
+# correct, p-val is not significant anymore
+
+# plot matched blood v brain samples as slope plot
+# prepare data so one row for each individual
+cabsma <- subset(cabs, cohort=='inherited_tissuecomparison') # cabs matched samples
+
+# order them by individual then by tissue
+cabsma <- cabsma[order(cabsma$individual, cabsma$gDNA_tissue),]
+
+# and use that as factor level
+cabsma$sample <- factor(cabsma$sample, levels=cabsma$sample)
+# also fix factor level of gDNA tissue
+cabsma$gDNA_tissue <- factor(cabsma$gDNA_tissue, levels=unique(cabsma$gDNA_tissue))
+
+# just for the purpose of the plot, need to slightly move apart two dots to avoid overplotting but keep the connecting lines correct
+tissueMatch <- ggplot(cabsma, aes(x=gDNA_tissue, y=nSom, fill=sample)) +
+  geom_line(aes(group=individual), size=1.0, colour='#a7a7a7') +
+  scale_fill_manual(values=cabsma2$colour) +
+  geom_quasirandom(pch=21, size=5.0, colour='white', stroke=1.5, width=0.8, groupOnX=FALSE) +
+  theme_minimal() +
+  theme(
+    axis.title.x=element_blank(),
+    legend.position='none'
+  ) +
+  coord_cartesian(ylim=c(0, 20)) +
+  ylab('somatic mutations calls')
+tissueMatch
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'tissueMatch.pdf'), tissueMatch, width=90, height=100, units='mm')
+}
+
+# with percentages
+tissueMatchPer <- ggplot(cabsma, aes(x=gDNA_tissue, y=somPer, fill=sample)) +
+  geom_line(aes(group=individual), size=1.0, colour='#a7a7a7') +
+  scale_fill_manual(values=cabsma$colour) +
+  coord_cartesian(ylim=c(0,0.3)) +
+  geom_quasirandom(pch=21, size=5.0, colour='white', stroke=1.5, width=0.8, groupOnX=FALSE) +
+  theme_minimal() +
+  theme(
+    axis.title.x=element_blank(),
+    legend.position='none',
+    axis.text.x=element_text(size=9, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
+    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+  ) +
+  ylab('somatic mutations calls (% of reads)')
+tissueMatchPer
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'tissueMatchPercentage.pdf'), tissueMatchPer, width=60, height=68, units='mm')
+}
+
+# one-sample t-test
+
+# make a simpler dataframe for this
+tisco <- cabsma %>% # tissue comparison
+  select(individual, gDNA_tissue, somPer) %>%
+  pivot_wider(names_from=gDNA_tissue,
+             values_from=somPer) %>%
+  mutate(tissuediff=blood-brain) # compute delta blood - brain
+
+# one-sample t-test: I am testing if the % decrease is significant
+  # so null hypothesis = no decrease = 0% (hence one-sample)
+  # I am specifically testing decrease, so one-sided
+  # why specifically decrease: strand slippage model relies on replication,
+    # hence should occur more often in tissues that divides a lot vs post-mitotic tissues
+t.test(x=tisco$tissuediff, mu=0, alternative='greater')
+
+# mean +- sd of difference
+mean(tisco$tissuediff)
+sd(tisco$tissuediff)
+
+
+
+
+# Number of calls vs MRC intersect/slope ----------------------------------
+
+# vs MRC_t0
+# i.e. score when patient first started getting followed
+callsvsMRCt0 <- ggplot(cabs, aes(x=MRC_t0, y=nSom, colour=prion_disease, alpha=gDNA_tissue)) +
+  geom_point() +
+  scale_colour_manual(values=cohcols) +
+  scale_alpha_discrete(range=c(1.0, 0.5)) +
+  theme_minimal() +
+  theme(
+    legend.position='none'
+  ) +
+  coord_cartesian(xlim=c(20, 0)) +
+  ylab('somatic mutation calls') + xlab('MRC scale at first assessment')
+callsvsMRCt0
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsMRCt0.pdf'), callsvsMRCt0, width=120, height=100, units='mm')
+}
+
+
+# with percentages
+callsvsMRCt0Per <- ggplot(cabs, aes(x=MRC_t0, y=somPer, colour=prion_disease, alpha=gDNA_tissue)) +
+  geom_point() +
+  scale_colour_manual(values=cohcols) +
+  scale_alpha_discrete(range=c(1.0, 0.5)) +
+  theme_minimal() +
+  theme(
+    legend.position='none'
+  ) +
+  coord_cartesian(xlim=c(20, 0)) +
+  ylab('somatic mutation calls (% of reads)') + xlab('MRC scale at first assessment')
+callsvsMRCt0Per
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsMRCt0Percentage.pdf'), callsvsMRCt0, width=120, height=100, units='mm')
+}
+
+
+# vs MRC_intersect
+  # just having a look, but I do not think there is a clear interpretation for MRC intersect
+  # also it is very close to MRC_t0
+callsvsMRCint <- ggplot(cabs, aes(x=MRC_intersect, y=nSom, colour=prion_disease, alpha=gDNA_tissue)) +
+  geom_point() +
+  scale_colour_manual(values=cohcols) +
+  scale_alpha_discrete(range=c(1.0, 0.5)) +
+  theme_minimal() +
+  theme(
+    legend.position='none'
+  ) +
+  coord_cartesian(xlim=c(20, 0)) +
+  ylab('somatic mutation calls') + xlab('MRC scale at first assessment')
+callsvsMRCint
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsMRCint.pdf'), callsvsMRCint, width=120, height=100, units='mm')
+}
+
+  
+# with percentages
+callsvsMRCintPer <- ggplot(cabs, aes(x=MRC_intersect, y=somPer, colour=prion_disease, alpha=gDNA_tissue)) +
+  geom_point() +
+  scale_colour_manual(values=cohcols) +
+  scale_alpha_discrete(range=c(1.0, 0.5)) +
+  theme_minimal() +
+  theme(
+    legend.position='none'
+  ) +
+  coord_cartesian(xlim=c(20, 0)) +
+  ylab('somatic mutation calls (% of reads)') + xlab('MRC scale at first assessment')
+callsvsMRCintPer
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsMRCintPercentage.pdf'), callsvsMRCint, width=120, height=100, units='mm')
+}
+
+# vs MRC_slope
+  # goal is to quantify speed of decline
+callsvsMRCslope <- ggplot(cabs, aes(x=MRC_slope, y=nSom, colour=prion_disease, alpha=gDNA_tissue)) +
+  geom_point() +
+  scale_colour_manual(values=cohcols) +
+  scale_alpha_discrete(range=c(1.0, 0.5)) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank(), 
+    axis.title.x=element_text(size=9, margin = margin(t=2, r=0, b=0, l=0)),
+    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
+    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0))
+  ) +
+  coord_cartesian(xlim=c(0, -0.7)) +
+  ylab('somatic mutation calls') + xlab('MRC scale slope')
+callsvsMRCslope
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsMRCslope.pdf'), callsvsMRCslope, width=75, height=75, units='mm')
+}
+
+
+# with percentages
+callsvsMRCslopePer <- ggplot(cabs, aes(x=MRC_slope, y=somPer, colour=prion_disease, alpha=gDNA_tissue)) +
+  geom_point() +
+  scale_colour_manual(values=cohcols) +
+  scale_alpha_discrete(range=c(1.0, 0.5)) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank(), 
+    axis.title.x=element_text(size=9, margin = margin(t=2, r=0, b=0, l=0)),
+    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
+    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0))
+  ) +
+  coord_cartesian(xlim=c(0, -0.7)) +
+  ylab('somatic mutation calls (% of reads)') + xlab('MRC scale slope')
+callsvsMRCslopePer
+cor(cabs$MRC_slope, cabs$somPer, use='pairwise.complete.obs')
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsMRCslopePercentage.pdf'), callsvsMRCslopePer, width=75, height=75, units='mm')
+}
+
+
+
+# Number of calls vs age --------------------------------------------------
+
+# number of somatic mutation calls vs age of patient when sample taken
+  # blood samples: when blood drawn
+  # brain samples: age at death
+callsvsage <- ggplot(cabs, aes(x=age_atsample, y=nSom, colour=prion_disease, alpha=gDNA_tissue)) +
+  geom_point() +
+  scale_colour_manual(values=cohcols) +
+  scale_alpha_discrete(range=c(1.0, 0.5)) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank()
+  ) +
+  xlab('age of patient') + ylab('somatic mutation calls')
+callsvsage
+cor(cabs$nSom, cabs$age_atsample, method='pearson', use='pairwise.complete.obs')
+cor(cabs$nSom, cabs$age_atsample, method='spearman', use='pairwise.complete.obs')
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsage.pdf'), callsvsage, width=120, height=100, units='mm')
+}
+
+# with percentages
+callsvsagePer <- ggplot(cabs, aes(x=age_atsample, y=somPer, colour=prion_disease, alpha=gDNA_tissue)) +
+  geom_point() +
+  scale_colour_manual(values=cohcols) +
+  scale_alpha_discrete(range=c(1.0, 0.5)) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank(),
+    axis.title.x=element_text(size=9, margin = margin(t=2, r=0, b=0, l=0)),
+    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
+    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0))) +
+  xlab('age of individual') + ylab('somatic mutation calls (% of reads)')
+callsvsagePer
+
+cor(cabs$somPer, cabs$age_atsample, method='pearson', use='pairwise.complete.obs')
+cor.test(cabs$somPer, cabs$age_atsample, method='pearson', use='pairwise.complete.obs')
+cor(cabs$somPer, cabs$age_atsample, method='spearman', use='pairwise.complete.obs')
+cor.test(cabs$somPer, cabs$age_atsample, method='spearman', use='pairwise.complete.obs')
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsagePercentage.pdf'), callsvsagePer, width=75, height=75, units='mm')
+}
+
+
+
+# Number of calls vs gender -----------------------------------------------
+
+callsvsgender <- ggplot(cabs, aes(x=gender, y=nSom, colour=prion_disease)) +
+  geom_quasirandom(width=0.2) +
+  scale_colour_manual(values=cohcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank(),
+    axis.title.x=element_blank()
+  ) +
+  scale_x_discrete(labels=c('female', 'male')) +
+  ylab('somatic mutation calls')
+callsvsgender
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsgender.pdf'), callsvsgender, width=90, height=100, units='mm')
+}
+
+# with percentages
+callsvsgenderPer <- ggplot(cabs, aes(x=gender, y=somPer, colour=prion_disease)) +
+  geom_quasirandom(width=0.2) +
+  scale_colour_manual(values=cohcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank(),
+    axis.title.x=element_blank()
+  ) +
+  scale_x_discrete(labels=c('female', 'male')) +
+  ylab('somatic mutation calls (% of reads)')
+callsvsgenderPer
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsgenderPercentage.pdf'), callsvsgenderPer, width=90, height=100, units='mm')
+}
+
+
+
+# Number of calls vs Sanger OPR -------------------------------------------
+
+# by strand slippage model, would predict
+  # more repeats, more opportunities for slippage, so more calls
+  # less repeats, less opportunities for slippage, so fewer calls
+
+# add a 'OPR difference' column
+  # eg. OPRI1 = +1 / OPRD1 = -1
+cabs$sanOPRdiff <- sapply(cabs$SV, function(o){ # Sanger OPR difference (eg. 1 OPRD = -1)
+  
+  if (is.na(o)) {
+    return(0) # is NA, that means no SV, so OPRdiff = 0
+  } else {
+    diff <- as.numeric(substr(o, 5, 999)) # get the number; eg. OPRI8 >> 8
+    
+    if (substr(o, 4, 4)=='D') return(-diff) # if OPRD >> put a negative sign
+    if (substr(o, 4, 4)=='I') return(diff) # if OPRI >> leave positive
+  }
+})
+# Note; for PDG 46345 writing +5 for the 5 OPRI
+# Note; could also do total OPR length (i.e. Sanger OPR difference + 5)
+  # in a way it fits better the strand slippage model as what is determinant would be the total number of repeats
+  # but I find the difference more intuitive as can still read the sample's genotype
+
+# number of somatic calls vs Sanger OPR difference
+callsvsSanOPRdiff <- ggplot(cabs, aes(x=sanOPRdiff, y=nSom, colour=prion_disease)) +
+  geom_quasirandom(width=0.3) +
+  scale_colour_manual(values=cohcols) +
+  scale_x_continuous(breaks=-2:8) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank()
+  ) +
+  xlab('OPR change vs reference') + ylab('somatic mutation calls')
+callsvsSanOPRdiff
+cor(x=cabs$sanOPRdiff, y=cabs$nSom, method='pearson')
+cor.test(x=cabs$sanOPRdiff, y=cabs$nSom, method='pearson')
+cor(x=cabs$sanOPRdiff, y=cabs$nSom, method='spearman')
+cor.test(x=cabs$sanOPRdiff, y=cabs$nSom, method='spearman')
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsSangerOPRdiff.pdf'), callsvsSanOPRdiff, width=120, height=100, units='mm')
+}
+
+# add linear regression
+# ! lm (y ~ x)
+# x = predictor // y = to be predicted
+# here: x = OPR length // y = number of somatic calls
+somfit <- lm(nSom ~ sanOPRdiff, data=cabs)
+# formula is y = intercept + slope * x
+# here: number of somatic calls = 3.1803 + 0.8938 * OPR difference
+
+intercept <- somfit[[1]][1]
+slope <- somfit[[1]][2]
+
+# X start
+# is shortest OPR
+xstart <- min(cabs$sanOPRdiff)
+
+# X stop
+# is longest OPR
+xstop <- max(cabs$sanOPRdiff)
+
+# Y start
+# >> y = intercept + slope * xstart
+ystart = as.numeric(intercept + slope * xstart)
+
+# Y stop
+# >> y = intercept + slope * xstop
+ystop = as.numeric(intercept + slope * xstop)
+
+# Notes;
+summary(somfit)
+
+# version plot above with fitted line
+callsvsSanOPRdiffFit <- ggplot(cabs, aes(x=sanOPRdiff, y=nSom, colour=prion_disease)) +
+  geom_segment(aes(x=xstart, xend=xstop,
+                   y=ystart, yend=ystop),
+               linetype=2, colour='#ebebeb', size=1.0) +
+  geom_quasirandom(width=0.3) +
+  scale_colour_manual(values=cohcols) +
+  scale_x_continuous(breaks=-2:8) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank()
+  ) +
+  xlab('OPR change vs reference') + ylab('somatic mutation calls')
+callsvsSanOPRdiffFit
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsSangerOPRdiffFit.pdf'), callsvsSanOPRdiffFit, width=120, height=100, units='mm')
+}
+
+###
+
+# same but with percentages
+callsvsSanOPRdiffPer <- ggplot(cabs, aes(x=sanOPRdiff, y=somPer, colour=prion_disease)) +
+  geom_quasirandom(width=0.3) +
+  scale_colour_manual(values=cohcols) +
+  scale_x_continuous(breaks=-2:8) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank()
+  ) +
+  xlab('OPR length vs reference') + ylab('somatic mutation calls (% of reads)')
+callsvsSanOPRdiffPer
+cor(x=cabs$sanOPRdiff, y=cabs$somPer, method='pearson')
+cor.test(x=cabs$sanOPRdiff, y=cabs$somPer, method='pearson')
+cor(x=cabs$sanOPRdiff, y=cabs$somPer, method='spearman')
+cor.test(x=cabs$sanOPRdiff, y=cabs$somPer, method='spearman')
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsSangerOPRdiffPercentage.pdf'), callsvsSanOPRdiffPer, width=120, height=100, units='mm')
+}
+  
+# add linear regression
+# ! lm (y ~ x)
+# x = predictor // y = to be predicted
+# here: x = OPR length // y = number of somatic calls
+somfit <- lm(somPer ~ sanOPRdiff, data=cabs)
+# formula is y = intercept + slope * x
+# here: number of somatic calls = 3.1803 + 0.8938 * OPR difference
+
+intercept <- somfit[[1]][1]
+slope <- somfit[[1]][2]
+
+# X start
+# is shortest OPR
+xstart <- min(cabs$sanOPRdiff)
+
+# X stop
+# is longest OPR
+xstop <- max(cabs$sanOPRdiff)
+
+# Y start
+# >> y = intercept + slope * xstart
+ystart = as.numeric(intercept + slope * xstart)
+
+# Y stop
+# >> y = intercept + slope * xstop
+ystop = as.numeric(intercept + slope * xstop)
+
+# Notes;
+summary(somfit)
+
+# version plot above with fitted line
+callsvsSanOPRdiffPerFit <- ggplot(cabs, aes(x=sanOPRdiff, y=somPer, colour=prion_disease, alpha=gDNA_tissue)) +
+  geom_segment(aes(x=xstart, xend=xstop,
+                   y=ystart, yend=ystop),
+               linetype=2, colour='#ebebeb', size=1.0) +
+  geom_quasirandom(width=0.3, size=1.2) +
+  scale_colour_manual(values=cohcols) +
+  scale_alpha_discrete(range=c(1.0, 0.5)) +
+  scale_x_continuous(breaks=-2:8) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank(),
+    axis.title.x=element_text(size=9, margin = margin(t=2, r=0, b=0, l=0)),
+    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
+    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+  ) +
+  xlab('OPR length vs reference') + ylab('somatic mutation calls (% of reads)')
+callsvsSanOPRdiffPerFit
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsSangerOPRdiffPercentageFit.pdf'), callsvsSanOPRdiffPerFit, width=80, height=70, units='mm')
+}
+
+# version with age labels
+  # exploring more a possible relationship below
+callsvsSanOPRdiffPerFitAgelabels <- ggplot(cabs, aes(x=sanOPRdiff, y=somPer, colour=prion_disease, label=age_atsample)) +
+  geom_segment(aes(x=xstart, xend=xstop,
+                   y=ystart, yend=ystop),
+               linetype=2, colour='#ebebeb', size=1.0) +
+  geom_quasirandom(width=0.3) +
+  geom_text() +
+  scale_colour_manual(values=cohcols) +
+  scale_x_continuous(breaks=-2:8) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank()
+  ) +
+  xlab('OPR change vs reference') + ylab('somatic mutation calls (% of reads)')
+callsvsSanOPRdiffPerFitAgelabels
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsSanOPRdiffAgelabelsPercentageFit.pdf'), callsvsSanOPRdiffPerFitAgelabels, width=120, height=100, units='mm')
+}
+
+
+
+# Number of calls vs Sanger OPR -- haplotype resolution -------------------
+
+# same idea as above, but treating haplotypes as separate samples
+# re-create cabs but including haplotype info
+cabsh <- ied %>% # number of CAlls By Sample by Haplotype
+  group_by(sample, haplotype) %>%
+  tally(name='iedCov') %>% # coverage = simply number of rows per sample in ied
+  mutate(sam_hap=paste(sample, haplotype, sep='_'), .before='sample') # add sample_haplotype ID column
+
+# add all the metadata info
+  # ! naturally, will be repeated for each haplotype
+cabsh <- left_join(cabsh, meta, by='sample')
+
+nsomps <- iedSom %>% # number of somatic mutation calls per sample per haplotype
+  group_by(sample, haplotype) %>%
+  tally(name='nSom') %>%
+  mutate(sam_hap=paste(sample, haplotype, sep='_'), .before='sample') %>% # add sample_haplotype ID column
+  ungroup() %>%
+  select(-sample, -haplotype)
+
+# join this to cabsh by matching sam_hap
+cabsh <- left_join(x=cabsh, y=nsomps, by='sam_hap')
+
+# nSom = NA actually means 0 somatic calls
+cabsh[which(is.na(cabsh$nSom)) , 'nSom'] <- 0
+
+# compute % of reads somatic
+  # there are 2 options
+  # 1- % of reads from that haplotype
+  # 2- % of reads from that sample
+# I think best to do 1- so haplotype proportions do not contribute
+# i.e. number of somatic calls / coverage
+cabsh$somPro <- cabsh$nSom / cabsh$iedCov
+cabsh$somPer <- cabsh$somPro * 100
+
+# make sure levels of prion_disease are in order of cohcols above
+cabsh$prion_disease <- factor(cabsh$prion_disease, levels=c('inherited', 'sCJD', 'control'))
+
+# order from sam_hap with largest number of somatic calls to lowest
+cabsh <- cabsh[order(-cabsh$nSom),]
+cabsh$sam_hap <- factor(cabsh$sam_hap, levels=cabsh$sam_hap)
+
+# add sanger OPR difference column
+cabsh$sanOPRdiff <- sapply(cabsh$SV, function(o){ # Sanger OPR difference (eg. 1 OPRD = -1)
+  
+  if (is.na(o)) {
+    return(0) # is NA, that means no SV, so OPRdiff = 0
+  } else {
+    diff <- as.numeric(substr(o, 5, 999)) # get the number; eg. OPRI8 >> 8
+    
+    if (substr(o, 4, 4)=='D') return(-diff) # if OPRD >> put a negative sign
+    if (substr(o, 4, 4)=='I') return(diff) # if OPRI >> leave positive
+  }
+})
+# ! remember compound homozygous sample will show up as +5 (not -1)
+
+# now do plot Sanger OPR vs number of calls again, but essentially treating separate haplotypes as separate samples
+# number of somatic calls per haplotype vs Sanger OPR
+cabsh$haplotype <- factor(cabsh$haplotype, levels=c(1, 2, 0))
+
+callsvsSanOPRdiffHap <- ggplot(cabsh, aes(x=sanOPRdiff, y=nSom, colour=haplotype)) +
+  geom_quasirandom(width=0.2) +
+  scale_colour_manual(values=hapcols) +
+  scale_x_continuous(breaks=-2:8) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank()
+  ) +
+  xlab('OPR change vs difference') + ylab('somatic mutation calls')
+callsvsSanOPRdiffHap
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsSanOPRdiffHap.pdf'), callsvsSanOPRdiffHap, width=120, height=100, units='mm')
+}
+
+# with percentages
+callsvsSanOPRdiffHapPer <- ggplot(cabsh, aes(x=sanOPRdiff, y=somPer, colour=haplotype)) +
+  geom_quasirandom(width=0.2) +
+  scale_colour_manual(values=hapcols) +
+  scale_x_continuous(breaks=-2:8) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank()
+  ) +
+  xlab('OPR change vs difference') + ylab('somatic mutation calls')
+callsvsSanOPRdiffHapPer
+ggsave(filename=here('needleSam', 'callsvsSanOPRdiffHapPercentage.pdf'), callsvsSanOPRdiffHapPer, width=120, height=100, units='mm')
+
+# Any striking if plotting sample per sample?
+  # could be for example an OPRI sample where the reference haplotype is affected
+# set OPRdiff column as factor for this, ggplot understands better what I am doing
+cabsh$sanOPRdiff <- factor(cabsh$sanOPRdiff)
+
+# list of plots, one per sample
+hapod <- vector(mode='list', length=length(pdgs)) # one plot per sample, each will be histogram number of calls per haplotype
+
+for (p in 1:length(pdgs)) {
+  
+  # get the Sanger genotype for that sample
+  osan <- as.character(unlist(subset(meta, sample==pdgs[p], SV))) # OPR Sanger
+  
+  # if osan is NA >> it means reference
+  if (is.na(osan)) {
+    osan <- 'reference'
+  }
+  
+  haplot <- ggplot( subset(cabsh, sample==pdgs[p]), aes(x=haplotype, y=nSom, fill=haplotype) ) +
+    geom_col(position='dodge') +
+    scale_fill_manual(values=hapcols) +
+    theme_minimal() +
+    ggtitle(osan) +
+    coord_cartesian(ylim=c(0, 12)) +
+    scale_y_continuous(breaks=seq(0, 12, 2)) +
+    ylab('somatic mutation calls')
+  
+  # place the plot in the list
+  hapod[[p]] <- haplot
+  
+}
+
+hapodGrid <- ggarrange(plotlist=hapod, ncol=1, nrow=length(pdgs))
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsVsHap_bysample.pdf'), plot=hapodGrid, height=1000, width=100, units='mm')
+}
+
+
+
+# Does age affect number of somatic calls irrespective of OPR? ------------
+
+# first; is there a correlation OPR vs age in our dataset
+  # we are assuming here age at sample is a proxy for age at onset
+  # from samples for which we have both age at onset + age at sample + age at death, can see that it correlates well
+  # however there are 4 individuals for whom this is evidently incorrect
+  # 2 controls + one 1 OPRI + one 6 OPRI
+  # (I imagine the OPRI ones are patients screened then followed over time but were not sick at time of recording)
+  # can tell these samples as they have an MRC slope of 0 (as not sick)
+  # it is possible that there is a couple of other samples like this
+    # especially when the individual is dead but we do not have information about disease progression
+    # it may be that they were being followed because they had an OPR mutation but died from something else entirely
+  # however, I can still see for some of those that sample was taken on the year of death or the year before
+  # which strongly suggests this was because they were sick
+
+# exclude individuals which are not sick (so age at sample does not represent onset)
+onidv <- as.numeric(unlist(cabs[which(cabs$MRC_slope==0), 'individual'])) # onidv for onset individuals
+cabson <- subset(cabs, ! individual %in% onidv)
+
+# fit, lm(y ~ x)
+agefit <- lm(cabson$age_atsample ~ cabson$sanOPRdiff)
+agefit
+# age onset = 70 - 1.46 * sanOPRdiff
+# i.e. if reference OPR: age at onset is 70
+# less repeats increase age at onset (i.e. delay onset)
+# more repeats decrease age at onset (i.e. advance onset)
+# for each extra repeat, onset is 1.5 years earlier
+# (need to highlight this is a poor fit though; but replicates literature, see below)
+summary(agefit)
+
+intercept <- agefit[[1]][1]
+slope <- agefit[[1]][2]
+
+# X start
+# is shortest OPR
+xstart <- min(cabson$sanOPRdiff)
+
+# X stop
+# is longest OPR
+xstop <- max(cabson$sanOPRdiff)
+
+# Y start
+# >> y = intercept + slope * xstart
+ystart = as.numeric(intercept + slope * xstart)
+
+# Y stop
+# >> y = intercept + slope * xstop
+ystop = as.numeric(intercept + slope * xstop)
+
+sanOPRdiffvsage <- ggplot(cabson, aes(x=sanOPRdiff, y=age_atsample, colour=prion_disease)) +
+  geom_segment(aes(x=xstart, xend=xstop,
+                   y=ystart, yend=ystop),
+               linetype=2, colour='#ebebeb', size=1.0) +
+  geom_quasirandom(width=0.3) +
+  scale_colour_manual(values=cohcols) +
+  scale_x_continuous(breaks=-2:8) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    panel.grid.minor.x=element_blank()
+  ) +
+  xlab('OPR change vs reference') + ylab('age of patient')
+sanOPRdiffvsage
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'sanOPRdiffvsage.pdf'), sanOPRdiffvsage, width=120, height=100, units='mm')
+}
+  
+cor(x=cabson$sanOPRdiff, y=cabson$age_atsample, method='pearson')
+cor.test(x=cabson$sanOPRdiff, y=cabson$age_atsample, method='pearson')
+
+cor(x=cabson$sanOPRdiff, y=cabson$age_atsample, method='spearman')
+cor.test(x=cabson$sanOPRdiff, y=cabson$age_atsample, method='spearman')
+# there is a NON-significant negative correlation
+  # negative is what we would expect; more repeats give lower age at onset
+
+# this looks fairly similar as Figure 1: https://jnnp.bmj.com/content/75/8/1166
+# which is reassuring
+
+# get the residuals from fit above
+  # = portion of onset age not explained by OPR
+  # e.g. for individual 3, onset is higher (delayed) vs what we would expect from their OPR (positive residual)
+cabson$sanOPRdiff_vs_age_residuals <- resid(agefit)
+
+# add the column to cabs by matching samples
+cabs <- cabson %>%
+  select(sample, sanOPRdiff_vs_age_residuals) %>%
+  left_join(x=cabs, y=., by='sample')
+
+# similarly, we have fit OPR vs % somatic calls from before
+# get the residuals
+  # = portion of % somatic that is not explained by OPR
+  # e.g. for this 8 OPRI individual, we would have expected many calls but it does not have a lot (negative residual)
+cabs$sanOPRdiff_vs_somPer_residuals <- resid(somfit) # we can directly add those to cabs
+
+
+# Approach 1
+# Is there a correlation somatic % residuals vs age?
+  # I think this is assuming there is no relationship age vs OPR which is probably untrue
+# positive correlation:
+  # less mutations than expected from OPR match younger age &&
+  # more mutations ... match older age
+    # >>> so more mutations as age increases, irrespective of OPR
+  # and vice-versa; negative correlation >>> less mutations as age increases, irrespective of OPR
+ageVsMutationResidual <- ggplot(cabs, aes(x=age_atsample, y=sanOPRdiff_vs_somPer_residuals, colour=prion_disease)) +
+  geom_point() +
+  scale_colour_manual(values=cohcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none'
+  ) +
+  xlab('age of patient') + ylab('% somatic calls unexplained by OPR')
+ageVsMutationResidual
+
+cor(x=cabs$age_atsample, y=cabs$sanOPRdiff_vs_somPer_residuals, method='pearson')
+cor.test(x=cabs$age_atsample, y=cabs$sanOPRdiff_vs_somPer_residuals, method='pearson')
+
+cor(x=cabs$age_atsample, y=cabs$sanOPRdiff_vs_somPer_residuals, method='spearman')
+cor.test(x=cabs$age_atsample, y=cabs$sanOPRdiff_vs_somPer_residuals, method='spearman')
+# NON-significant negative correlation (or none with Spearman)
+# would (vaguely) suggest less mutations as age increases
+
+# Approach 2
+# residuals vs residuals
+# i.e. x = portion of age unexplained by OPR vs y = portion of % somatic unexplained by OPR
+  # positive correlation:
+    # younger onset than expected by OPR match less mutations than expected by OPR && 
+    # vice-versa; older onset ... more mutations
+    # >>> interpretation could be mutations are protective; i.e. they delay onset
+  # negative correlation:
+    # older onset ... more mutations &&
+    # younger onset ... less mutations
+    # # >>> interpretation could be mutations are detrimental; i.e. they accelerate onset
+
+ageResidualsVsMutationResiduals <- ggplot(cabs, aes(x=sanOPRdiff_vs_age_residuals, y=sanOPRdiff_vs_somPer_residuals, colour=prion_disease)) +
+  geom_point() +
+  scale_colour_manual(values=cohcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none'
+  ) +
+  xlab('difference in age unexplained by OPR') + ylab('% mutation unexplained by OPR')
+ageResidualsVsMutationResiduals
+
+cor(x=cabs$sanOPRdiff_vs_age_residuals, y=cabs$sanOPRdiff_vs_somPer_residuals, use='complete.obs', method='spearman')
+cor.test(x=cabs$sanOPRdiff_vs_age_residuals, y=cabs$sanOPRdiff_vs_somPer_residuals, use='complete.obs', method='spearman')
+
+cor(x=cabs$sanOPRdiff_vs_age_residuals, y=cabs$sanOPRdiff_vs_somPer_residuals, use='complete.obs', method='pearson')
+cor.test(x=cabs$sanOPRdiff_vs_age_residuals, y=cabs$sanOPRdiff_vs_somPer_residuals, use='complete.obs', method='pearson')
+# not convincing at all -- but if anything; negative correlation
+# which would suggest somatic mutations accelerate onset irrespective of OPR
+
+# from labels + 2 approaches here, I think nothing particularly convincing
+
+
+
+# Is OPR mutated haplotype preferentially affected? -----------------------
+
+# from needleSam.R: OPR mutated haplotype (if any) is set to 1
+# Note; PDG 46345, 5 OPRI haplotype is considered as mutated
+# ! here, should exclude samples where haplotype 1/2 distinction is not meaningful
+  # i.e. the samples we could not haplotag (0 SNV) + the samples with reference OPR
+pdgop <- as.character(unlist(subset(meta, !is.na(SV), sample))) # samples with a mutated OPR
+pdgoptag <- intersect(pdgop, pdgs_tag) # samples with a mutated OPR + haplotagged
+
+cabh <- iedSom %>% # calls by haplotype
+  subset(sample %in% pdgoptag) %>% # only somatic calls from samples with mutated OPR + haplotagged
+  group_by(haplotype) %>%
+  tally(name='nSom')
+
+cabh$haplotype <- factor(cabh$haplotype, levels=c(1, 2, 0))
+
+# is there really a surplus of haplotype1 calls?
+
+  # subset of ied, keeping only reads from samples pdgoptag
+iedoptag <- ied %>%
+  subset(sample %in% pdgoptag)
+
+  # build contingency table
+cont <- table(iedoptag$isSomatic, iedoptag$haplotype)
+
+  # Chi2 test
+chi <- chisq.test(cont, correct=FALSE)
+
+chi$expected
+chi$observed
+# the expected count for haplotype0 is low so throws a Warning
+# in this case it is usually recommended to use Fisher's exact test
+fisher.test(cont)
+# it gives very similar p-value (slightly lower)
+
+# I do not actually care about the haplotype0 calls, so perhaps better solution to exclude haplotype0 calls and do Chi2 again
+iedoptag2 <- iedoptag %>%
+  subset(haplotype!=0)
+iedoptag2$haplotype <- factor(iedoptag2$haplotype, levels=c(1,2))
+cont <- table(iedoptag2$isSomatic, iedoptag2$haplotype)
+chi <- chisq.test(cont, correct=FALSE) # p-value ~ 10x lower
+chi
+chi$expected
+chi$observed
+
+# all 3 approaches essentially give the same answer
+# I think will use last (test only on haplotype1/haplotype2)
+
+# get expected counts to add to plot
+hap1exp <- chi$expected[ which(row.names(chi$expected)=='TRUE'), which(colnames(chi$expected)=='1') ]
+hap2exp <- chi$expected[ which(row.names(chi$expected)=='TRUE'), which(colnames(chi$expected)=='2') ]
+
+callsbyHaplotype <- ggplot(cabh, aes(x=haplotype, y=nSom, fill=haplotype)) +
+  geom_col() +
+  #geom_segment(aes(x=0.5, xend=1.5, y=hap1exp, yend=hap1exp), linetype=2, colour='#ebebeb', size=1) +
+  #geom_segment(aes(x=1.5, xend=2.5, y=hap2exp, yend=hap2exp),linetype=2, colour='#ebebeb', size=1) +
+  scale_fill_manual(values=hapcols) +
+  scale_x_discrete(labels=c('OPR mutated', 'OPR reference', 'unassigned')) +
+  theme_minimal() +
+  theme(
+    legend.position='none',
+    #panel.grid.major.x=element_blank(),
+    #panel.grid.minor.x=element_blank(),
+    axis.title.x=element_blank(),
+    # axis.text.x=element_text(angle=90, size=7, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.text.x=element_blank(),
+    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
+    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+  ) +
+  ylab('somatic mutation calls')
+callsbyHaplotype
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsbyHaplotype.pdf'), callsbyHaplotype, width=40, height=60, units='mm')
+}
+
+# with percentages
+  # (should look fairly similar as haplotype1/2 are roughly 50/50)
+  # cannot use haplotype0 calls, very low count but of low total number of reads, so give higher percentage
+  # definitely misleading, will simply exclude for the purpose of this plot
+
+# add total count of each haplotype
+iedoptag2$haplotype <- factor(iedoptag2$haplotype, levels=c(1, 2))
+
+cabh2 <- subset(cabh, haplotype!=0)
+
+cabh2 <- iedoptag2 %>%
+  group_by(haplotype) %>%
+  tally(name='hapReads') %>%
+  left_join(cabh2, by='haplotype')
+
+# add proportion of each haplotype of whole dataset
+cabh2$hapPro <- cabh2$hapReads / nrow(iedoptag)
+
+# compute somatic % of haplotype reads
+cabh2$somPro <- (cabh2$nSom/cabh2$hapReads)
+cabh2$somPer <- (cabh2$nSom/cabh2$hapReads) * 100
+
+# compute expected counts & expected percentages
+# expected count = proportion of haplotype * total number of somatic calls
+cabh2 <- cabh2 %>%
+  mutate(expCount=hapPro * sum(cabh2$nSom)) %>%
+  mutate(expPer=(expCount/hapReads)*100)
+
+cabh2$haplotype <- factor(cabh2$haplotype, levels=c(1, 2))
+
+# plot as above
+callsbyHaplotypePer <- ggplot(cabh2, aes(x=haplotype, y=somPer, fill=haplotype)) +
+  geom_col() +
+  geom_hline(yintercept=cabh2$expPer[1], linetype=2, colour='#ebebeb', size=2) +
+  scale_fill_manual(values=hapcols) +
+  scale_x_discrete(labels=c('OPR mutated', 'OPR reference', 'unassigned')) +
+  theme_minimal() +
+  theme(
+    legend.position='none'
+  ) +
+  ylab('somatic mutation calls (% of reads)')
+callsbyHaplotypePer
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsbyHaplotypePercentage.pdf'), callsbyHaplotypePer, width=100, height=100, units='mm')
+}
+
+
+
+# What are the somatic calls? ---------------------------------------------
+# plot calls vs counts
+# calls as in eg. OPRD1, OPRI2, etc.
+
+# somatic OPR calls are separated in 3 columns mostly to allow cases where a read is both OPRD & OPRI
+
+# but in the calls after mismatch threshold...
+subset(iedSom, !is.na(opri) & !is.na(oprd)) # no row is both OPRI/OPRD
+# and by definition any read that is reference in $othercategory has to be NA in both OPRI & OPRD
+  # >>> so we can merge all three columns in one column
+iedSom$somOPR <- as.character(poolColumns(iedSom, c('othercategory', 'opri', 'oprd')))
+
+# check factor order as I want it
+iedSom$somOPR <- factor(iedSom$somOPR, levels=c(sprintf('OPRD%i', 4:1), 'reference', sprintf('OPRI%i', 1:24)))
+
+# now count number of calls per new OPR
+sowp <- iedSom %>% # number of somatic calls by new OPR 
+  group_by(somOPR) %>%
+  tally(name='nSom')
+
+callsvsnewOPR <- ggplot(sowp, aes(x=somOPR, y=nSom)) +
+  geom_col(fill=NA, colour='#202222') +
+  theme_minimal() +
+  theme(
+    axis.text.x=element_text(angle=90)
+  ) +
+  xlab('') + ylab('somatic mutation calls')
+callsvsnewOPR
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsnewOPR.pdf'), callsvsnewOPR, width=150, height=100, units='mm')
+}
+
+# I do not think I can do in percentages here
+# I do not see percentage of what I would do...
+# actually I think it is not the most useful plot, as we know it is highly dependent on what the sample was originally
+
+# can fill with haplotype as stacked barplot
+sowph <- iedSom %>%
+  group_by(somOPR, haplotype) %>%
+  tally(name='nSom')
+sowph$haplotype <- factor(sowph$haplotype, levels=c(1, 2, 0))
+
+callsvsnewOPRHap <- ggplot(sowph, aes(x=somOPR, y=nSom, fill=haplotype)) +
+  geom_col(position='stack') +
+  theme_minimal() +
+  theme(
+    axis.text.x=element_text(angle=90),
+    axis.title.x=element_blank(),
+    legend.position='none'
+  ) +
+  scale_fill_manual(values=hapcols) +
+  ylab('somatic mutation calls')
+callsvsnewOPRHap
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvsnewOPRHap.pdf'), callsvsnewOPRHap, width=180, height=100, units='mm')
+}
+
+
+
+
+# Number of calls vs somatic OPR change -----------------------------------
+# more interesting than above as includes 'what the sample was'
+# eg. OPRI8 sample, somatic read is haplotype1 and says OPRI6 >>> -2
+# essentially 'how far does it go from the genotype'
+
+# first prepare column somOPRdiff
+# this is somatic OPR difference vs reference (whatever the sample)
+# eg. somatic calls is 8 OPRI >>> +8
+iedSom$somOPRdiff <- as.integer(unlist(sapply(iedSom$somOPR, function(o) {
+  if(substr(o, 1, 4) == 'OPRI') { # if somatic call is an insertion
+    return(as.integer(substr(o, 5, 99))) # return the number of OPR
+  }
+  
+  else if(substr(o, 1, 4) == 'OPRD') { # if somatic call is a deletion
+    return( - as.integer(substr(o, 5, 99))) # return the number of OPR with a minus sign
+  }
+  
+  else if (o == 'reference') { # if somatic call is reference (only possible for compound homozygous sample)
+    return(0)
+  }
+  
+})))
+
+# second prepare exphapOPR column
+# = Sanger OPR diff we would expect for this haplotype
+  # eg. sample is 8 OPRI
+  # haplotype1 read >>> expected +8
+  # haplotype2 read >>> expected 0
+tmp <- unlist(sapply(1:nrow(iedSom), function(r) { # for each somatic read
+  
+  # get that read's sample
+  spl <- iedSom[r, 'sample']
+  # get the SV of that read's sample
+  sv <- iedSom[r, 'SV']
+  # get that read's haplotype
+  hap <- iedSom[r, 'haplotype']
+  
+  # if SV is NA, always return 0 (reference)
+  # Note, it is also ok to return 0 for samples that we could not haplotag but do not have SV
+  if (is.na(sv)) {
+    return(0)
+  } 
+  
+  # if haplotype is 0 and there is an SV, return NA (as we do not know what to expect)
+  if (hap==0 & !is.na(sv)) {
+    return(NA)
+  }
+  
+  else if (spl=='pdg46345' & hap==1) { # if it is the compound homozygous sample and we are looking at haplotype1 (OPRI5)
+    return(as.integer(substr(sv, 5, 99))) # return the number of OPR (+5)
+  }
+  
+  else if (spl=='pdg46345' & hap==2) { # if it is the compound homozygous sample and we are looking at haplotype2 (OPRD1)
+    return(-1) # return -1
+  }
+  
+  else if (substr(sv, 1, 4) == 'OPRI' & hap==1) { # if SV is an insertion and we are looking at haplotype1 (mutated)
+    return(as.integer(substr(sv, 5, 99))) # return the number of OPR
+  }
+  
+  else if (substr(sv, 1, 4) == 'OPRI' & hap==2) { # if SV is an insertion but we are looking at haplotype2 (reference)
+    return(0)
+  }
+  
+  else if(substr(sv, 1, 4) == 'OPRD' & hap==1) { # if SV is a deletion and we are looking at haplotype1 (mutated)
+    return( - as.integer(substr(sv, 5, 99))) # return minus the number of OPR
+  }
+  
+  else if(substr(sv, 1, 4) == 'OPRD' & hap==2) { # if SV is a deletion but we are looking at haplotype2 (reference)
+    return(0)
+  }
+  
+  else{
+    cat('\t \t \t \t >>> Something unexpected row', r, '\n')
+  }
+  
+}))
+
+length(tmp) == nrow(iedSom) # ok can add to iedSom
+iedSom$exphapOPR <- tmp
+
+# most NA should be from samples which have a mutated OPR but were not haplotagged (PDG 53747, 59060, 58648)
+# as we do not know what to 'expect' from that read (was it from mutated OPR haplotype or not?)
+
+# others NA are from unassigned reads (haplotype0) of samples which have a mutated OPR
+
+# Note; samples which do not have a mutated OPR, we know both haplotypes are reference so what we expect the reads to be is reference = 0 OPRdiff
+
+# third prepare column somOPRchange
+  # = 'how far did the somatic call go from what is expected from the haplotype'
+  # eg. 3 OPRI somatic call in a 4 OPRI sample >> -1
+  # first get absolute difference between somatic OPR diff / Sanger OPR diff
+  # then add the appropriate sign
+tmp <- sapply(1:nrow(iedSom), function(re) { # for each read
+  san <- iedSom[re, 'exphapOPR'] # Sanger OPR count for that haplotype
+  som <- iedSom[re, 'somOPRdiff'] # somatic OPR count for that read
+  
+  # if exphapOPR is NA, return NA
+  # (somOPRdiff is never NA)
+  if (is.na(san)) {
+    return(NA)
+  }
+  
+  if (san < som) {
+    # eg. Sanger = -2 / somatic = -1 >> should return +1
+    # eg. Sanger = 0 / somatic = 4 >> should return +4
+    return(abs(san - som))
+  }
+  
+  else if (san > som) {
+    # eg. Sanger = -1 / somatic = -2 >> should return -1
+    # eg. Sanger = 4 / somatic = 0 >> should return -4
+    return( - abs(san - som))
+  }
+})
+
+iedSom$somOPRchange <- tmp # add the column
+
+
+
+# prepare counts for plot
+scoc <- iedSom %>% # somatic counts by somatic OPR change (eg. how many -1)
+  group_by(somOPRchange) %>%
+  tally(name='nSom')
+
+# plot somatic OPR change vs number of calls
+callsvssomaticOPRchange <- ggplot(scoc, aes(x=somOPRchange, y=nSom)) +
+  geom_col(fill=NA, colour='#4D4D4D', width=0.8) +
+  theme_minimal() +
+  theme(
+    panel.grid.minor.x=element_blank(),
+    axis.title.x=element_text(size=9, margin = margin(t=2, r=0, b=0, l=0)),
+    axis.text.x=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+    axis.title.y=element_text(size=9, margin = margin(t=0, r=2, b=0, l=0)),
+    axis.text.y=element_text(size=7, margin = margin(t=0, r=0, b=0, l=0)),
+  ) +
+  scale_x_continuous(breaks=min(scoc$somOPRchange, na.rm=TRUE):max(scoc$somOPRchange, na.rm=TRUE)) +
+  xlab('OPR change vs haplotype') + ylab('somatic mutation calls')
+callsvssomaticOPRchange
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvssomaticOPRchange.pdf'), callsvssomaticOPRchange, width=74, height=70, units='mm')
+}
+  
+# prepare counts, similar as above but adding haplotype resolution
+scoch <- iedSom %>% # somatic counts by somatic OPR change (eg. how many -1) by haplotype
+  group_by(somOPRchange, haplotype) %>%
+  tally(name='nSom')
+
+# make sure haplotype is in order I want
+scoch$haplotype <- factor(scoch$haplotype, levels=c(1, 2, 0))
+
+# would adding haplotype fill informative?
+callsvssomaticOPRchangeHap <- ggplot(scoch, aes(x=somOPRchange, y=nSom , fill=haplotype)) +
+  geom_col() +
+  scale_fill_manual(values=hapcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none'
+  ) +
+  scale_x_continuous(breaks=min(scoc$somOPRchange, na.rm=TRUE):max(scoc$somOPRchange, na.rm=TRUE)) +
+  xlab('OPR change vs haplotype') + ylab('somatic mutation calls')
+callsvssomaticOPRchangeHap
+# there are a few I am suspicious about
+  # eg. -5 calls are haplotype1, reference call from PDG 46345 (5 OPRI/1 OPRD)
+  # it may be more likely to be a +1 from 1 OPRD haplotype
+  # but I do not see any objective reason to exclude those
+
+
+# plot separately inherited (mostly mutated OPR) and sporadic/control (reference OPR)
+scocc <- iedSom %>% # somatic counts by somatic OPR change (eg. how many -1) by cohort
+  group_by(somOPRchange, prion_disease) %>%
+  tally(name='nSom')
+
+# add column inherited or not
+scocc$inhTF <- scocc$prion_disease == 'inherited' # inherited TRUE or FALSE
+
+# make sure inhTF is in order I want
+scocc$inhTF <- factor(scocc$inhTF, levels=c(TRUE, FALSE))
+
+# need to 'fill in' the dataframe so both inherited TRUE & FALSE have all OPRs
+# otherwise, eg. -5 is all inherited, so makes a large bar which is inconsistent with the others
+# or in other words, add the 0 counts (-5 non-inherited >> should be 0 nSom)
+
+# make the template dataframe and left_join it to scocc
+oprcgs <- min(scocc$somOPRchange, na.rm=TRUE) : max(scocc$somOPRchange, na.rm=TRUE) # OPR changes present in the data
+tmpl <- as.data.frame(matrix(nrow=length(oprcgs)*2, ncol=2)) # nrow = all OPR changes for both inherited TRUE and FALSE, so x 2
+colnames(tmpl) <- c('inhTF', 'somOPRchange')
+tmpl$inhTF <- c(rep(TRUE, length(oprcgs)) , rep(FALSE, length(oprcgs)))
+tmpl$inhTF <- factor(tmpl$inhTF, levels=c(TRUE, FALSE))
+tmpl$somOPRchange <- c(oprcgs, oprcgs)
+
+scocc <- left_join(tmpl, scocc, by=c('inhTF', 'somOPRchange'))
+
+# replace nSom NA we added by 0
+scocc[which(is.na(scocc$nSom)) , 'nSom'] <- 0
+
+# add total number of reads from inherited / other
+scocc <- ied %>%
+  mutate(inhTF = factor((ied$prion_disease == 'inherited'))) %>%
+  group_by(inhTF) %>%
+  tally(name='totalInhTF') %>%
+  left_join(scocc, by='inhTF')
+# above changes the factor levels
+scocc$inhTF <- factor(scocc$inhTF, levels=c(TRUE, FALSE))
+
+# compute percentage of cohort (inherited / other)
+scocc$somPer <- (scocc$nSom / scocc$totalInhTF) * 100
+
+# colours for inherited TRUE / FALSE
+inhTFcols <- c('#697a87', '#595E60')
+
+# counts
+callsvssomaticOPRchangeCoh <- ggplot(scocc, aes(x=somOPRchange, y=nSom , fill=inhTF)) +
+  geom_col(position='dodge') +
+  scale_fill_manual(values=inhTFcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none'
+  ) +
+  scale_x_continuous(breaks=min(scoc$somOPRchange, na.rm=TRUE):max(scoc$somOPRchange, na.rm=TRUE)) +
+  xlab('OPR change vs haplotype') + ylab('somatic mutation calls')
+callsvssomaticOPRchangeCoh
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvssomaticOPRchangeCoh.pdf'), callsvssomaticOPRchangeCoh, width=120, height=100, units='mm')
+}
+  
+# percentages
+callsvssomaticOPRchangeCohPer <- ggplot(scocc, aes(x=somOPRchange, y=somPer , fill=inhTF)) +
+  geom_col(position='dodge') +
+  scale_fill_manual(values=inhTFcols) +
+  theme_minimal() +
+  theme(
+    legend.position='none'
+  ) +
+  scale_x_continuous(breaks=min(scoc$somOPRchange, na.rm=TRUE):max(scoc$somOPRchange, na.rm=TRUE)) +
+  xlab('OPR change vs haplotype') + ylab('somatic mutation calls (% of reads)')
+callsvssomaticOPRchangeCohPer
+
+if (exportOrNo) {
+  ggsave(filename=here('needleSam', 'callsvssomaticOPRchangeCohPercentage.pdf'), callsvssomaticOPRchangeCohPer, width=120, height=100, units='mm')
+}
+
+
+
+
+# number of somatic insertion/deletion by tissue --------------------------
+# in https://pubmed.ncbi.nlm.nih.gov/16713248/
+# there is an argument why expansions (insertions) may be more frequent in non-dividing cells
+
+# add a column to iedSom whether OPR change vs expected from haplotype is a deletion or an insertion
+  # i.e. whether iedSom$somOPRchange is positive or negative
+iedSom$somOPRchange_type <- NA
+iedSom$somOPRchange_type[which(iedSom$somOPRchange > 0)] <- 'insertion'
+iedSom$somOPRchange_type[which(iedSom$somOPRchange < 0)] <- 'deletion'
+
+# compare within tissue comparison cohort
+idtc <- iedSom %>% # insertion/deletion tissue comparison
+  subset(cohort=='inherited_tissuecomparison') %>%
+  group_by(sample, individual, gDNA_tissue, somOPRchange_type) %>%
+  tally(name='nSom') %>%
+  pivot_wider(names_from=somOPRchange_type,
+              values_from=nSom)
+# one pair of samples could not be haplotagged, so cannot know if insertion or deletion
+# can delete these samples
+inds2del <- match(pdgs_notag, idtc$sample)
+inds2del <- inds2del[!is.na(inds2del)]
+idtc <- idtc[-inds2del,]
+
+# >>> can see it will not be interesting. There are only 2 insertions, both in blood tissues
+
+
+# Can chimeric reads/index hopping be an issue? ---------------------------
+# stringent with filtering criteria in Guppy and before needleSam.R so I do not think likely
+# but here let's take a few examples to see if likely or not
+
+# somatic calls by new OPR by flowcell
+oprflo <- iedSom %>%
+  group_by(somOPR, flowcell) %>%
+  tally(name='nSom')
+
+# count total number of reads from each flowcell
+# and the counts to oprflo
+oprflo <- ied %>%
+  group_by(flowcell) %>%
+  tally(name='totreads') %>%
+  left_join(x=oprflo, y=., by='flowcell')
+
+# compute somatic calls in % of flowcell
+oprflo$somPer <- (oprflo$nSom / oprflo$totreads) * 100
+
+# convert flowcell to factor
+oprflo$flowcell <- factor(oprflo$flowcell, levels=c(1, 2, 3))
+
+# Example1/ flowcell3 has two samples OPRI8; others have 0
+# >> are there more OPRI8 somatic calls from flowcell3 than flowcell 1 & 2?
+subset(oprflo, somOPR=='OPRI8') # no 8 OPRI call from flowcell3
+
+# Example2/ flowcell3 has four samples OPRI4; other have 0
+nrow(subset(ied, flowcell==3 & opri=='OPRI4')) / nrow(subset(ied, flowcell==3)) # 20+ % of reads on flowcell3 are OPRI4
+# >> are there more OPRI4 somatic calls from flowcell3 than flowcell 1 & 2?
+subset(oprflo, somOPR=='OPRI4')
+# no 4 OPRI call from flowcell3
+
+# and how many somatic mutation from flowcell3 (only samples without a 4 OPRI)
+nrow(subset(iedSom, sample %in% as.character(unlist(subset(meta, flowcell==3 & SV != 'OPRI4', sample)))))
+# >>> I do not believe that chimeric read/index hopping can explain any somatic call
+
+
+
+
+
+# write the somatic calls -------------------------------------------------
+
+# export iedSom
+write.xlsx(iedSom, file=here('needleSam', 'somaticCalls.xlsx'))
+
+# export cabs
+write.xlsx(cabs, file=here('needleSam', 'callsBySample.xlsx'))
+
+
+
+
+
+
+# prepare a shortlist -----------------------------------------------------
+
+# some of the criteria below cannot be applied 'equitably' to all samples
+# only interested here in preparing a shortlist the best somatic mutation calls
+
+# will do by read IDs, safer than row indices
+
+# shortlist somatic calls
+cat('\t \t \t \t >>>', nrow(iedSom), 'calls before shortlisting \n')
+
+callsns <- c() # calls not shortlisted (read IDs)
+
+# 1-- delete any haplotype0 calls from samples we could haplotag
+# (the read is probably messy if Whatshap could not haplotag it)
+tmp <- iedSom[which(iedSom$sample %in% pdgs_tag & iedSom$haplotype==0), 'read']
+cat('\t \t \t \t >>>', lengths(tmp), 'calls are haplotype0 calls even though from a haplotagged sample \n')
+callsns <- c(callsns, tmp)
+
+
+# 2-- decrease allowed mismatches
+
+  # original threshold: mean + sd of mismatch by bp of true reads
+    # makes us keep the ~ 86% most accurate reads
+length(which(ied$true_misbp < thrMis)) / length(which(!is.na(ied$true_misbp)))
+
+# we saw somatic calls had a surplus of reads in the 5% mismatch range even after thresholding
+  # >> more stringent threshold: just mean
+    # makes us keep ~ 55% most accurate reads
+thrMis2 <- mean(ied$true_misbp, na.rm=TRUE)
+length(which(ied$true_misbp < thrMis2)) / length(which(!is.na(ied$true_misbp)))
+
+tmp <- iedSom[which(iedSom$som_misbp > thrMis2) , 'read']
+
+cat('\t \t \t \t >>>', length(tmp), 'calls are above stringent mismatch threshold \n')
+callsns <- c(callsns, tmp)
+
+
+# 3-- exclude any calls with low haplotype assignment score
+tmp <- iedSom[which(iedSom$hapscore < 1.0), 'read'] # setting threshold at 0.5, basically 50% of the expected SNVs are not present
+cat('\t \t \t \t >>>', length(tmp), 'calls have a low haplotype assignment score \n')
+callsns <- c(callsns, tmp)
+
+
+# remove these calls
+callsns <- unique(callsns)
+cat('\t \t \t \t >>> Excluding', length(callsns), 'calls \n')
+
+iedSomsl <- iedSom[ - match(callsns, iedSom$read) ,] # sl for shortlist
+
+cat('\t \t \t \t >>> Shortlisted to', nrow(iedSomsl), 'calls \n')
+
+# in shortlist; any replicates?
+  # i.e. more than 1 read same sample/haplotype/somOPR
+iedSomsl %>%
+  group_by(sample, haplotype, exphapOPR, somOPR, somOPRchange) %>%
+  tally(name='nSom') %>%
+  subset(nSom>1)
+
+
+
+
+# reads for figure --------------------------------------------------------
+
+# alignments in figure
+# from top to bottom
+figreads <- c('8f9756e2-1fb9-41d7-a6bf-a11463a55990',
+              'a5b5b295-3e6d-4f48-8839-24aa3fd50718',
+              'e1c59ac3-a547-4616-a120-4dd7d3463e24',
+              '2f8a89cb-73d7-4591-af8b-f814ac373a24',
+              'e67c028a-35e2-44b0-80b8-45ccaa100392',
+              '9d387703-c39e-47df-bd96-49b0a1d62b4d',
+              '86f5c651-7ce0-4778-a55f-bc01cacec79b',
+              'e43b902e-cbaf-4feb-9170-51caecb1cf81',
+              'aa058a24-7e53-4d55-ad91-f5dcdc54602a',
+              '5a5e7247-0550-4f16-aaf2-e51ad2b5c9a5')
+
+length(figreads)
+
+figReds <- ied[match(figreads, ied$read),]
+
+if (exportOrNo) {
+  write.xlsx(figReds, file=here('needleSam', 'figureSomaticReads.xlsx'))
+}
+
+
+
+# write shortlist ---------------------------------------------------------
+
+# write the shortlist
+
+if (exportOrNo) {
+  write.xlsx(iedSomsl, file=here('needleSam', 'somaticCallsShortlist.xlsx'))
+}
diff --git a/prnp_nanopore.Rproj b/prnp_nanopore.Rproj
new file mode 100644
index 0000000..8e3c2eb
--- /dev/null
+++ b/prnp_nanopore.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/utilities/filterBam.command b/utilities/filterBam.command
new file mode 100644
index 0000000..0488d1e
--- /dev/null
+++ b/utilities/filterBam.command
@@ -0,0 +1,176 @@
+#!/bin/bash
+
+# filter reads in a BAM file
+
+# currently handles
+  # minimum Phred score
+  # minimum reference span (previously read length)
+    # Note; reference span is basically read length as you see it in IGV, i.e. how much of the reference it covers
+    # it includes deletions/insertions and soft-clipped
+    # e.g. read has 150 bp aligned + 10 bp deletion + 20 bp soft-clipped, reference span is 150 + 10 + 20 = 180 bp
+    # however, if we were looking at its length (as in number of characters), it would be 150 + 20 = 170 bp, so doing so has the risk to bias against deletions
+  # maximum read length (not currently in this version, see in v1 if need to add it back)
+  # maximum proportion soft-clipped
+  # only primary alignments (or all)
+
+# explanation
+  # filterBAM.command -i XXX.bam -e int -f int -s float -p (yes or no) -o XXX.bam
+    # -i = input = bam file to process
+    # -e = PhrEd score = minimum Phred score
+    # -f = floor = minimum read span
+    # -s = maximum proportion Soft-clipped
+    # -o = output = name of bam file to output
+
+# example
+  #  filterBam.command -i F08.bam -e 40 -f 149 -s 0.2 -p no -o clean.bam
+
+# v2: relies more heavily on R script readsToThrow.R to name the reads to remove
+# easier as can edit there if want to add filters
+
+######
+
+# read the flags/arguments
+while getopts i:e:f:s:p:o: flag
+do
+    case "${flag}" in
+        i) bam=${OPTARG};;
+        e) min_phred=${OPTARG};;
+        f) min_readspan=${OPTARG};;
+        s) max_softprop=${OPTARG};;
+        p) primary=${OPTARG};;
+        o) out=${OPTARG};;
+    esac
+done
+
+echo
+echo "Input BAM = $bam";
+echo "Minimum Phred score = $min_phred";
+echo "Minimum reference span = $min_readspan";
+echo "Maximum proportion soft-clipped = $max_softprop";
+echo "Keep only primary alignments = $primary";
+echo "Output BAM = $out";
+echo
+
+######
+
+nreads=$(samtools view -c $bam)
+echo "Number of alignments: $nreads"
+
+# will write a temporary SAM to remove secondary alignments and/or filter with readsToThrow.R script
+tmp1=$"tmp1.sam"
+
+samtools view -h -o $tmp1 $bam # convert to sam for below
+
+######
+
+# remove secondary alignments if needed
+  # write temporary SAM = without secondary alignments
+  # (or just a copy of tmp2.sam if -primary no)
+tmp2=$"tmp2.sam"
+
+if [ $primary = "yes" ]
+then
+
+  # counting number of alignments before
+  before=$(samtools view -c $tmp1)
+
+  # removing secondary alignments
+  samtools view -h -F 256 -F 4 -F 2048 $tmp1 > $tmp2
+    # See https://broadinstitute.github.io/picard/explain-flags.html
+    # Note I think - F 4 (unmapped) -F 256 (not primary) are either not used or were removed at some point before
+      # but safer to keep
+    # -F 2048 = supplementary alignment, seems to include -2064 as well
+    # but if I explicitly add -F 2064, it removes all Reverse (-F 16) for some reason
+
+  # counting number of alignments after
+  after=$(samtools view -c $tmp2)
+
+  # how many did we remove?
+  ndel=$( expr $before - $after )
+  echo "Number of secondary alignments removed: $ndel"
+
+else # if No (or anything else)
+  # just store copy of tmp1 as tmp2 so we can continue below
+  cp $tmp1 $tmp2
+
+fi
+
+# in both cases, also copy a BAM copy for below
+tmp2b=$"tmp2.bam"
+samtools view -bS $tmp2 -o $tmp2b
+
+
+######
+
+# Note; how to get full path of a file, example $sam
+# echo "$(cd "$(dirname "$sam")"; pwd -P)/$(basename "$sam")"
+
+# filter by soft-clipping
+
+sampath="$( echo "$(cd "$(dirname "$tmp3")"; pwd -P)/$(basename "$tmp2")" )" # get full path of SAM file we wrote above
+
+# removereads.txt = list of read names to be removed as too much soft-clipping
+touch removereads.txt # write an empty file
+# get its full path
+removereadspath="$( echo "$(cd "$(dirname removereads.txt)"; pwd -P)/$(basename removereads.txt)" )" # get full path of SAM file we wrote above
+
+Rscript ~/packages/myscripts/readsToThrow.R \
+  $sampath \
+  $min_phred \
+  $min_readspan \
+  $max_softprop \
+  $removereadspath
+
+# tell user number of reads which will remove as above soft-clipping threshold
+ndel=$(wc -l < removereads.txt)
+echo "Number of alignments removed by Phred/read span length/soft-clipped filters:$ndel"
+
+# now exclude these reads from tmp2b.bam (the reads listed in removereads.txt)
+tmp3=$"tmp3.bam"
+
+# previous solution; but takes ages: samtools view -h $tmp3b | grep -vf removereads.txt | samtools view -bS -o $tmp4
+# v2
+echo # FilterSamReads still talks even though quiet, so leave some space
+echo
+
+# ! skip that step if 0 reads to remove
+if [ $ndel -ne 0 ] # if not 0 reads to remove
+then
+
+  java -jar ~/packages/picard/picard.jar FilterSamReads \
+    I=$tmp2b \
+    O=$tmp3 \
+    READ_LIST_FILE=removereads.txt FILTER=excludeReadList \
+    use_jdk_deflater=true \
+    use_jdk_inflater=true \
+    quiet=TRUE
+
+else
+  # just store copy of tmp3b as tmp4 so we can continue below
+  cp $tmp2b $tmp3
+
+fi
+
+
+# sort tmp2.bam and write final bam file
+# index the final bam file
+samtools sort $tmp3 > $out
+samtools index $out
+
+# tell final number of alignments
+fin=$(samtools view -c $out)
+echo
+echo
+echo "Final number of alignments: $fin"
+echo
+echo
+echo
+
+######
+
+# clean after ourselves
+rm $tmp1
+rm $tmp2
+rm $tmp2b
+rm $tmp3
+rm removereads.txt
diff --git a/utilities/prepareVCFforWhatshap.R b/utilities/prepareVCFforWhatshap.R
new file mode 100644
index 0000000..b27c515
--- /dev/null
+++ b/utilities/prepareVCFforWhatshap.R
@@ -0,0 +1,199 @@
+# prepare VCFs for whatshap
+  # from curated nanopolish SNVs calls
+
+# packages ----------------------------------------------------------------
+
+library(openxlsx)
+library(here)
+library(dplyr)
+library(tidyr)
+
+
+
+
+# import ------------------------------------------------------------------
+
+# will make VCFs from filtered SNVs
+
+# import filtered SNVs
+snv <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='SNVs_filtered')
+# will assume all these calls are true positives
+
+# import unfiltered SNVs
+snvunf <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='SNVs_unfiltered') # SNVs unfiltered
+
+# import sample's metadata
+meta <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='samples')
+
+
+
+
+# do we have enough SNVs for each sample? ---------------------------------
+# for haplotype phasing, we need heterozygous variants in the gene-body reads
+
+# remove all promoter SNVs from snv
+snv <- snv[-which(snv$POS < 4687751),] # i.e. before end of genebody_Forward primer binding site
+
+# are there any homozygous SNVs? These are not useful for haplotype phasing
+subset(snv, Genotype=='1/1') # 0 homozygous SNP
+
+# are all samples represented in SNVs? (i.e. have at least 1 heterozygous SNV that can be used for haplotype phasing)
+pdgs <- unique(snv$SAMPLE)
+
+meta[!meta$sample %in% pdgs,] # No; 5 samples have 0 SNV calls
+nosv <- sort(meta$sample[!meta$sample %in% pdgs]) # PDG numbers of samples without useful SNVs
+
+# for these samples, any usable SNVs thrown away durinig StrandBias filtering?
+
+# delete SNVs in the promoter, for sure those are not useful here
+snvunf <- snvunf[-which(snvunf$POS < 4687751),] # i.e. before end of genebody_Forward primer binding site
+
+subset(snvunf, SAMPLE==nosv[1])
+subset(snvunf, SAMPLE==nosv[2])
+subset(snvunf, SAMPLE==nosv[3])
+subset(snvunf, SAMPLE==nosv[4])
+subset(snvunf, SAMPLE==nosv[5])
+# all high strandbias
+# >> nothing to salvage from the bin in terms of SNVs
+
+# samples without SNVs are
+# PDG 53747: 6 OPRI
+# PDG 58648: 6 OPRI
+# PDG 54890: no SV (sCJD)
+# PDG 55050: no SV (sCJD)
+# PDG 59060: 6 OPRI
+
+# (59060/53747 are same individual)
+
+# Note: I have tried doing haplotype phasing using a single SV (6 OPRI of PDG 58648), it runs but does not work well
+  # whatshap haplotag seems too stringent about identifying the SV:
+    # one haplotype gathers all the reads with a relatively 'clean' insertion
+    # and all the other reads are considered other haplotype (reference), but I can see in IGV many contain the 6 OPRI
+    # it may still be possible to do it more 'manually' in R by looking at the CIGARs, but probably a bit rough
+
+# >> will only use SNV for haplotype phasing
+# and keep OPR SVs as a way to check if the haplotype phasing is correct
+
+# for the samples that present in snv:
+bysmp <- snv %>%
+  group_by(SAMPLE) %>%
+  tally()
+
+
+
+# prepare VCFs ------------------------------------------------------------
+
+# create 2 folders
+dir.create(here('haplotypephasing'))
+dir.create(here('haplotypephasing/vcf_unphased')) # will store the vcf before phasing, i.e. the ones for which we have more than 1 SNV
+dir.create(here('haplotypephasing/vcf_phased')) # will store the vcf after phasing
+  # i.e. the ones for which we have a single SNV (we are writing it 'as if' it went through whatshap phase)
+  # + whatshap phase will put the phased version of the VCF above in that folder as well
+
+# loop through samples and prepare each VCF
+
+for (p in 1:nrow(meta)) { # for each sample
+  
+  # get PDG number
+  pdg <- meta$sample[p]
+  
+  # ! we do not have any SNV to do haplotype phasing for 5 samples (nosv above), so skip these
+  if (pdg %in% nosv) {
+    next
+  } else if (pdg %in% unique(snv$SAMPLE)) { # for the samples that have enough SNV calls
+    
+    # get the filtered SNV calls for that sample
+    vcf <- subset(snv, SAMPLE==pdg)
+    
+    # need to reconstruct the INFO column
+    infocolumn <- as.character(apply(vcf, 1,
+                                     function(ro){
+                                       paste0('BaseCalledReadsWithVariant=', ro['BaseCalledReadsWithVariant'], ';',
+                                              'BaseCalledFraction=', ro['BaseCalledFraction'], ';',
+                                              'TotalReads=', ro['TotalReads'], ';',
+                                              'AlleleCount=', ro['AlleleCount'], ';',
+                                              'SupportFraction=', ro['SupportFraction'], ';',
+                                              'SupportFractionByStrand=', ro['SupportFractionByStrand'], ';',
+                                              'StrandFisherTest=', ro['StrandFisherTest'], ';',
+                                              'SOR=', ro['SOR'], ';',
+                                              'RefContext=', ro['RefContext'], ';')
+                                     }))
+    # will then add it below
+    
+    # format the VCF
+    vcf <- vcf %>%
+      select('CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'FILTER', 'Genotype') %>% # take only the columns we need
+      mutate(ID=row_number(), .before='REF') %>% # whatshap needs column ID, which was deleted somewhere; will just add the row number
+      mutate(FORMAT='GT', .before='Genotype') %>% # whatshap needs column FORMAT, which was deleted somewhere; I think always GT in my case
+      mutate(INFO=infocolumn, .before='FORMAT') %>% # whatshap needs column FORMAT, which was splitted into its components for readibility, will just add NA here
+      rename('sample'='Genotype') %>% # column Genotype was originally called 'sample' I think, will change it back. It gives the zygosity
+      rename('#CHROM'='CHROM') # add comment character so understands it is the header
+    
+    # ! whenever there is only SNV, need to edit the VCF to 'make it look like' it went through phasing
+    # (whatshap phase does not work with a single SNV as the goal is to phase variants in relation to each other,
+      # but whatshap haplotag works fine and that what we need to separate the reads from each haplotype)
+    
+    if (as.numeric(subset(bysmp, SAMPLE==pdg, n)) == 1) {
+      
+      # change FORMAT and sample to make it look like phased by whatshap phase
+      vcf$FORMAT <- 'GT:PS'
+      vcf$sample <- '0|1:4688888'
+      
+      # header is a bit different than below to write as if it went through whatshap phase
+      header <- '##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##nanopolish_window=chr20:4680000-4705000
+##INFO=<ID=TotalReads,Number=1,Type=Integer,Description="The number of event-space reads used to call the variant">
+##INFO=<ID=SupportFraction,Number=1,Type=Float,Description="The fraction of event-space reads that support the variant">
+##INFO=<ID=SupportFractionByStrand,Number=2,Type=Float,Description="Fraction of event-space reads that support the variant for each strand">
+##INFO=<ID=BaseCalledReadsWithVariant,Number=1,Type=Integer,Description="The number of base-space reads that support the variant">
+##INFO=<ID=BaseCalledFraction,Number=1,Type=Float,Description="The fraction of base-space reads that support the variant">
+##INFO=<ID=AlleleCount,Number=1,Type=Integer,Description="The inferred number of copies of the allele">
+##INFO=<ID=StrandFisherTest,Number=1,Type=Integer,Description="Strand bias fisher test">
+##INFO=<ID=SOR,Number=1,Type=Float,Description="StrandOddsRatio test from GATK">
+##INFO=<ID=RefContext,Number=1,Type=String,Description="The reference sequence context surrounding the variant call">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=chr20>
+##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set identifier">'
+      
+      # write the header
+      vcfname <- paste0(pdg, '_whap.vcf') # notice p (below is _wha) to indicate already phased
+      
+      write.table(header, file=here('haplotypephasing/vcf_phased', vcfname),
+                  quote=FALSE, row.names=FALSE, col.names=FALSE)
+      
+      # then append the VCF
+      write.table(vcf, file=here('haplotypephasing/vcf_phased', vcfname),
+                  quote=FALSE, row.names=FALSE, sep='\t', append=TRUE)
+      
+    } else if ((as.numeric(subset(bysmp, SAMPLE==pdg, n)) > 1)) {
+      # in the other cases, i.e. when we have more than 1 SNP, will go through whatshap phase so write a more normal VCF
+      
+      
+      # first write the header
+      # ! deleted StrandSupport as there were formatting errors
+      # SupportFractionByBase field also disappeared
+      header <- '##fileformat=VCFv4.2
+##nanopolish_window=chr20:4680000-4705000
+##INFO=<ID=TotalReads,Number=1,Type=Integer,Description="The number of event-space reads used to call the variant">
+##INFO=<ID=SupportFraction,Number=1,Type=Float,Description="The fraction of event-space reads that support the variant">
+##INFO=<ID=SupportFractionByStrand,Number=2,Type=Float,Description="Fraction of event-space reads that support the variant for each strand">
+##INFO=<ID=BaseCalledReadsWithVariant,Number=1,Type=Integer,Description="The number of base-space reads that support the variant">
+##INFO=<ID=BaseCalledFraction,Number=1,Type=Float,Description="The fraction of base-space reads that support the variant">
+##INFO=<ID=AlleleCount,Number=1,Type=Integer,Description="The inferred number of copies of the allele">
+##INFO=<ID=StrandFisherTest,Number=1,Type=Integer,Description="Strand bias fisher test">
+##INFO=<ID=SOR,Number=1,Type=Float,Description="StrandOddsRatio test from GATK">
+##INFO=<ID=RefContext,Number=1,Type=String,Description="The reference sequence context surrounding the variant call">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'
+      
+      vcfname <- paste0(pdg, '_wha.vcf')
+      write.table(header, file=here('haplotypephasing/vcf_unphased', vcfname),
+                  quote=FALSE, row.names=FALSE, col.names=FALSE)
+      
+      # then append the VCF
+      write.table(vcf, file=here('haplotypephasing/vcf_unphased', vcfname),
+                  quote=FALSE, row.names=FALSE, sep='\t', append=TRUE)
+      
+    }
+  }
+}
\ No newline at end of file
diff --git a/utilities/prnp_OPR.command b/utilities/prnp_OPR.command
new file mode 100644
index 0000000..c05fe13
--- /dev/null
+++ b/utilities/prnp_OPR.command
@@ -0,0 +1,103 @@
+# see somaticOPR_README.md for more complete explanations
+# you probably want to be in ~/.../haplotypephasing/bam_haplotagged/
+
+#!/bin/bash
+
+for i in *.bam # loop thru files in directory that finish by .bam
+do
+
+  BAM="$i"
+
+
+####
+# 1- trim BAM to keep only OPR
+  # genomic windows we do NOT want are in OPRpos.bed
+
+  trim="trim.bam" # temporary file, will delete later
+  pdg="$(echo "$BAM" | cut -d'_' -f 1)" # everything before first _
+  # names of BAM files eg. 1906_sorted_mdfix.bam, so will give PDG (1906)
+
+  mkdir clipLogs
+  log="$(echo $"clipLogs/""$pdg"$"_clipLog.txt")"
+
+  echo
+  echo
+  echo
+  echo "---- [ Sample"$" $BAM ] ----"
+  echo
+  echo "  Trimming to keep only OPR..."
+
+  samtools ampliconclip --hard-clip --both-ends --clipped \
+  -b ~/Dropbox/prnp_nanopore/needleSam/OPRpos.bed \
+  $BAM \
+  -o $trim \
+  -f $log
+
+
+####
+# 2- filter trim.bam
+  # minimum read length
+    # = 21 bp
+    # equivalent to 4 OPRD (only R1 left, 27 bp) - 6 bp for low noise
+  # maximum read length
+    # = 702 bp
+    # equivalent to reference (123 bp) + 24 OPRI (576 bp) + 3 bp for high noise
+  # maximum proportion of read soft-clipped = 0.1
+  # keep only primary alignments = yes
+
+  echo "  Filtering the trimmed BAM..."
+
+  mkdir OPRtrim
+  out="$(echo $"OPRtrim/""$pdg"$"_opr.bam")"
+
+  filterBam.command -i $trim -f 21 -c 702 -s 0.1 -p yes -o $out
+  # Note; this will partly overlap with the filtering performed in prnp_haplotypePhasing.command
+
+  # we do not need to keep trim.bam
+  rm $trim
+
+
+####
+# 3- count read lengths
+  # from https://www.biostars.org/p/65216/
+
+  mkdir readlengths
+  readlghts="$(echo $"readlengths/""$pdg"$"_lengths.txt")"
+
+
+  # readlghts="$(echo "$pdg"$"_lengths.txt")"
+
+  samtools view -F 4 "$out"| # -F 4 = only mapped reads
+    cut -f 10 |
+    perl -ne 'chomp;print length($_) . "\n"' |
+    sort -n |
+    uniq -c > $readlghts
+
+
+####
+# 4- convert to SAM so can work on it in R
+  mkdir OPRtrim_sam
+  tmp1="$(echo $"OPRtrim_sam/tmp1.sam")"
+  tmp2="$(echo $"OPRtrim_sam/tmp2.sam")"
+  sam="$(echo $"OPRtrim_sam/""$pdg"$"_opr.sam")"
+
+  samtools view -h -o $tmp1 $out # convert to SAM (first a temporary file)
+
+  # remove the SA tag from the temporary SAM
+  # (it tags chimeric read)
+    # it is a pain in R later because it is not present for all reads, so it shifts the columns for certain reads
+  sed 's/\tSA\:Z\:[^\t]*//' $tmp1 > $tmp2
+
+  # drop the QUAL column (in practice, switch it to *)
+    # I am not using it and it is full of special characters, including @ which is the comment character when importing in R
+      # makes it very challenging to import in R correctly if not removing it
+  cat $tmp2 | awk -v OFS='\t' '$11="*"' > $sam
+    # above is not a particularly elegant solution because it also affects the header lines, but I am not importing them in R...
+
+  # delete the temporary SAM
+  rm $tmp1
+  rm $tmp2
+
+####
+
+done
diff --git a/utilities/prnp_filterBam.command b/utilities/prnp_filterBam.command
new file mode 100644
index 0000000..a84dda2
--- /dev/null
+++ b/utilities/prnp_filterBam.command
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# Filtering the gene-body BAM files before OPR analysis
+  # Maximum length
+    # reads should be chr20:4687732--4701756 = 14024 bp
+    # allowing some extra length for long OPR insertions + adapter/barcode >> say maximum 15000 bp
+    # Why care? Longer reads should not be possible and so are suspicious
+      # see eg. https://community.nanoporetech.com/posts/a-quick-guide-to-tiling-am, below Barcode mismatches
+
+  # Minimum length
+    # this can be slightly more flexible
+    # but keeping only reads that span most of region should help haplotype phasing (as most reads should cover all available SNVs)
+    # say minimum 10000 bp
+
+  # Soft-clipping
+    # from looking at some reads in IGV, soft-clipping is typically ~ 105 left + 50 right / 14,024 = 1.1% of the read
+    # longer is potentially suspicious
+    # say maximum 5%
+
+  # Keep only primary alignments
+    # should guarantee that read IDs are unique
+
+mkdir bamfilt
+
+####
+for i in genebodyBams/*.bam # loop through the bam files
+do
+
+  bam="$i"
+
+  echo
+  echo
+  echo
+  echo "---- [ filtering BAM"$" $bam ] ----"
+  echo
+
+  # get the PDG number
+  pdg="$(echo "$bam" | cut -d'_' -f 1 | cut -d'/' -f 2)"
+  # BAM can be eg. 2021bam/1906_sorted_mdfix.bam
+  # above will cut to 2021bam/1906, then 1906
+
+  tmp="$(echo $"bamfilt/tmp.bam")"
+  bamfil="$(echo $"bamfilt/""$pdg"$"_f.bam")"
+
+  # filter the bam file
+  filterBam.command -i $bam -f 10000 -c 15000 -s 0.05 -p yes -o $tmp
+
+  # sort and index the filtered bam
+  samtools sort $tmp > $bamfil
+  samtools index $bamfil
+
+  rm $tmp
+  rm bamfilt/tmp.bam.bai # not sure where it comes from!
+
+done
diff --git a/utilities/prnp_haplotypePhasing.command b/utilities/prnp_haplotypePhasing.command
new file mode 100644
index 0000000..5ba0a94
--- /dev/null
+++ b/utilities/prnp_haplotypePhasing.command
@@ -0,0 +1,128 @@
+#!/bin/bash
+
+# script below is meant to be run being in ~/Dropbox/prnp_nanopore/
+
+# for SNVs phasing/haplotagging, there are 2 possibilities
+
+  # 1- there is only 1 SNV available for that sample, so VCF written by prepareVCFforWhatshap.R is 'as if' it went through whatshap phase already
+    # the reason being: whatshap phase does not work with a single SNV as the goal is to phase variants in relation to each other
+      # however, whatshap haplotag works fine and that what we need to separate the reads from each haplotype; but the variant need to 'look' phased by whatshap phase
+
+  # 2- there are more than 1 SNV available for that sample, go through whatshap phase, then whatshap haplotag (normal process)
+
+
+####
+# Possibility 1-
+  # if only 1 SNV; prepareVCFforWhatshap.R wrote VCF as pdg_whap.vcf (p for phased)
+for i in haplotypephasing/vcf_phased/*_whap.vcf # loop through those VCF files (that end with whap.vcf) in the folder
+do
+  # zip the VCF, index it, run directly whatshap haplotag
+  vcf="$i"
+
+  echo
+  echo
+  echo
+  echo "---- [ using VCF"$" $vcf ] ----"
+  echo
+
+  gz="$(echo "$vcf"$".gz")" # append .gz to the VCF filename
+
+  bgzip -c $vcf > $gz # zip it (! need to bgzip and not gzip for tabix below)
+  tabix -p vcf $gz # index the zipped vcf
+
+  # get the PDG number
+  pdg="$(echo "$vcf" | cut -d'_' -f 2 | cut -d'/' -f 2)"
+  # vcf above will be eg. 2021haplotypephasing/vcf_phased/55048_whap.vcf
+  # above will cut to 55048
+
+  mkdir haplotypephasing/bam_haplotagged
+  hptmp="$(echo $"haplotypephasing/bam_haplotagged/hptmp.bam")" # will be haplotagged bam, before samtools sort
+  bamp="$(echo $"haplotypephasing/bam_haplotagged/""$pdg"$"_hp.bam")" # final output: bam where reads are haplotagged (added tag to each read to tell which haplotype it is from)
+  # hp for haplotype-phased
+
+  bamf="$(echo $"bamfilt/""$pdg"$"_f.bam")" # filtered bam (before phasing, input to whatshap haplotag)
+  # should be eg. bamfilt/1906_f.bam
+
+  # run directly whatshap haplotag
+  whatshap haplotag \
+  --ignore-read-groups \
+  -o $hptmp \
+  --reference /Users/francoiskroll/Dropbox/rotations/Rotation3/hg38/hg38.fa \
+  $gz $bamf
+
+  # sort and index the phased bam
+  samtools sort $hptmp > $bamp
+  samtools index $bamp
+
+  # delete tmp file
+  rm $hptmp
+
+done
+
+###
+# Possibility 2-
+  # if more than 1 SNV; prepareVCFforWhatshap.R wrote VCF as pdg_wha.vcf
+    # need to first run whatshap phase to phase variants in relation to each other
+    # then whatshap haplotag to tag the haplotype on each read
+for i in haplotypephasing/vcf_unphased/*_wha.vcf # loop through those VCF files (that end with wha.vcf) in the folder
+do
+  # zip the VCF and index it (same as Possibility 1-)
+  vcf="$i"
+
+  echo
+  echo
+  echo "---- [ using VCF"$" $vcf ] ----"
+  echo
+  echo
+
+  gz="$(echo "$vcf"$".gz")" # append .gz to the VCF filename
+
+  bgzip -c $vcf > $gz # zip it (! need to bgzip and not gzip for tabix below)
+  tabix -p vcf $gz # index the zipped vcf
+
+  # get the PDG number
+  pdg="$(echo "$vcf" | cut -d'_' -f 2 | cut -d'/' -f 2)"
+  # vcf above will be eg. 2021haplotypephasing/vcf_unphased/1906_wha.vcf
+  # above will cut to 1906
+
+  vcfp="$(echo $"haplotypephasing/vcf_phased/""$pdg"$"_whap.vcf")" # vcf phased (output of whatshap phase)
+
+  bamf="$(echo $"bamfilt/""$pdg"$"_f.bam")" # filtered bam (before phasing, input to whatshap haplotag)
+  # should be eg. bamfilt/1906_f.bam
+
+  mkdir haplotypephasing/bam_haplotagged # will place some temporary files (that we will delete), then the final haplotagged file
+  hptmp="$(echo $"haplotypephasing/bam_haplotagged/hptmp.bam")" # will be haplotagged bam, before samtools sort
+  bamp="$(echo $"haplotypephasing/bam_haplotagged/""$pdg"$"_hp.bam")" # final output: bam where reads are haplotagged (added tag to each read to tell which haplotype it is from)
+  # hp for haplotype-phased
+
+  # run whatshap phase, using the filtered bam
+  whatshap phase \
+    --ignore-read-groups \
+    -o $vcfp \
+    --reference=/Users/francoiskroll/Dropbox/rotations/Rotation3/hg38/hg38.fa \
+    $gz \
+    $bamf
+
+  # now use phased vcf to tag the reads with the haplotype from which they are from (whatshap haplotag)
+
+  # zip & index the vcf created above
+  gzp="$(echo "$vcfp"$".gz")" # append .gz to the phased VCF filename
+
+  bgzip -c $vcfp > $gzp # zip it (! need to bgzip and not gzip for tabix below)
+  tabix -p vcf $gzp # index the zipped vcf
+
+  # use it to haplotag reads
+  whatshap haplotag \
+  --ignore-read-groups \
+  -o $hptmp \
+  --reference /Users/francoiskroll/Dropbox/rotations/Rotation3/hg38/hg38.fa \
+  $gzp $bamf
+
+  # sort and index the phased bam
+  samtools sort $hptmp > $bamp
+  samtools index $bamp
+
+  # delete tmp file
+  rm $hptmp
+
+done
diff --git a/utilities/promoter_filterBam.command b/utilities/promoter_filterBam.command
new file mode 100644
index 0000000..5a4bf20
--- /dev/null
+++ b/utilities/promoter_filterBam.command
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# Filtering the promoter BAM files
+  # Maximum length
+    # reads should be chr20:4685060--4688047 = 2988 bp
+    # allowing some extra length for adapter/barcode >> say maximum 3500 bp
+    # Why care? Longer reads should not be possible and so are suspicious
+      # see eg. https://community.nanoporetech.com/posts/a-quick-guide-to-tiling-am, below Barcode mismatches
+
+  # Minimum length
+    # this can be slightly more flexible
+    # say minimum 1000 bp
+
+  # Soft-clipping
+    # from looking at some reads in IGV, soft-clipping is typically can up to ~ 200 bp on each side and read still look good
+    # i.e. 400 bp / 2988 bp = 13%
+    # longer is potentially suspicious
+    # say maximum 15%
+
+  # Keep only primary alignments
+    # should guarantee that read IDs are unique
+
+mkdir promoterbamfilt
+
+####
+for i in promoterBams/*.bam # loop through the bam files
+do
+
+  bam="$i"
+
+  echo
+  echo
+  echo
+  echo "---- [ filtering BAM"$" $bam ] ----"
+  echo
+
+  # get the PDG number
+  pdg="$(echo "$bam" | cut -d'_' -f 1 | cut -d'/' -f 2)"
+  # BAM can be eg. 2021bam/1906_sorted_mdfix.bam
+  # above will cut to 2021bam/1906, then 1906
+
+  tmp="$(echo $"promoterbamfilt/tmp.bam")"
+  bamfil="$(echo $"promoterbamfilt/""$pdg"$"_f.bam")"
+
+  # filter the bam file
+  filterBam.command -i $bam -f 1000 -c 3500 -s 0.15 -p yes -o $tmp
+
+  # sort and index the filtered bam
+  samtools sort $tmp > $bamfil
+  samtools index $bamfil
+
+  rm $tmp
+  rm promoterbamfilt/tmp.bam.bai # not sure where it comes from!
+
+done
diff --git a/utilities/readsToThrow.R b/utilities/readsToThrow.R
new file mode 100644
index 0000000..955df30
--- /dev/null
+++ b/utilities/readsToThrow.R
@@ -0,0 +1,152 @@
+# take a SAM file as input, outputs the names of the reads which are soft-clipped
+
+
+# packages & small functions ----------------------------------------------
+
+library(stringr)
+
+substrEnding <- function(x, n){ # x = string (or vector of strings); n = last n characters
+  substr(x, nchar(x)-n+1, nchar(x))
+}
+
+
+
+
+# function readNametoThrow() ----------------------------------------------
+
+# given one read (row) from a SAM file, checks whether we need to throw it or not based on filters
+# if no need to throw the read; does not do anything
+# if need to throw the read; outputs its name
+
+# Note; probably useful in the future: how to split a CIGAR in its component:
+# str_extract_all('29S45M633H59S', '[0-9]+[A-Z]')
+
+readNametoThrow <- function(read, min_phred, min_readspan, max_softprop) { # read = one row of a correctly-imported SAM file
+  
+  ###
+  
+  # 1- check Phred score
+  
+  # get the read's Phred score
+  phred <- as.numeric(read[5]) # ! assumes it is in column 5
+  
+  if (phred < min_phred) { # if it is below minimum Phred score, name that read
+    cat('\t \t \t \t >>> Read', read[1], 'excluded because its Phred score is', phred, '\n')
+    return(as.character(read[1]))
+  } # Note; if Phred is below threshold, read name is output and function stops here due to return() call
+  # is below only read if Phred is okay;
+  
+  # for debugging
+  # cat('\n \n Phred is ok \n')
+  
+  ###
+  
+  # 2- check the read's reference span
+  
+  # get the read's CIGAR
+  cigar <- as.character(read[6]) # ! assumes it is column 6
+  
+  tmp <- str_extract_all(cigar, '[0-9]+[A-Z]')[[1]] # splits the CIGAR into its componenets; e.g. 59S129M32S >> 59S 129M 32S
+  span <- sum(as.numeric(unlist(strsplit(tmp, '[A-Z]')))) # sum the number parts, e.g. 59 + 129 + 32 = 220; this gives the reference span
+  
+  if (span < min_readspan) { # if read's span is below minimum length, name that read
+    cat('\t \t \t \t >>> Read', read[1], 'excluded because its reference span is', span, 'bp \n')
+    return(as.character(read[1]))
+  }
+  
+  # for debugging
+  # cat('Span is ok \n')
+  
+  
+  ###
+  
+  # 3- check the read's soft-clipping
+  # uses CIGAR from above
+  
+  if(length(grep(pattern='S', cigar)) != 0) { # if read is soft-clipped; i.e. there is S in CIGAR
+    tmp <- str_extract_all(cigar, '[0-9]+S')[[1]] # extracts the S components; eg. 59S129M32S >> 59S 32S
+    softbp <- sum(as.numeric(unlist(strsplit(tmp, '[A-Z]'))))
+    # split at any letter (here will be only S), so example above: 59 32
+    # convert to numeric & sum
+    
+    # divide by read length to get proportion soft-clipped
+    softp <- as.numeric(softbp/nchar(read[10]))
+    
+    if (softp > max_softprop) { # if proportion soft-clipped is above unwanted threshold; get the read name
+      cat('\t \t \t \t >>> Read', read[1], 'excluded because', round(softp * 100), '% of it is soft-clipped \n')
+      return(as.character(read[1]))
+    }
+    
+    # to get until here where nothing happens, read has to pass all filters above
+    
+  }
+  
+  # for debugging
+  # cat('Soft-clipping is ok \n')
+  
+}
+
+
+# main function -----------------------------------------------------------
+
+
+namesOfReadsToThrow <- function(sampath, min_phred, min_readspan, max_softprop, outpath) {
+  
+  # check the path looks ok
+  if(substrEnding(sampath, 4) != '.sam') stop('\t \t \t \t >>> That does not look like the path to a SAM file')
+  
+  # check the max_softprop argument looks ok
+  if(!(max_softprop > 0 & max_softprop < 1)) stop('\t \t \t \t >>> softmax must be 0--1')
+  
+  # if all good -- import the SAM file
+  numcols <- max(count.fields(sampath, sep='\t'), na.rm=TRUE)
+  sam <- read.table(file=sampath, fill=TRUE, comment.char='@', sep='\t',
+                    col.names=sprintf('col%i', 1:numcols))
+  
+  
+  # apply getSoftReadName() function to all the reads
+  # and write txt output
+  excludereads <- as.data.frame(unlist(apply(sam, 1,
+                                             readNametoThrow, min_phred, min_readspan, max_softprop)))
+  
+  # only keep unique names
+  excludereads <- unique(excludereads)
+  
+  # say how many we are removing
+  cat('\n \n \t \t \t \t >>> Excluding total', nrow(excludereads), 'reads \n \n \n')
+  
+  write.table(excludereads, file=outpath,
+              row.names=FALSE, col.names=FALSE, quote=FALSE)
+  
+}
+
+
+# run that function -------------------------------------------------------
+
+# first read the Terminal arguments
+args <- commandArgs(trailingOnly=TRUE)
+
+cat('\n \n readsToThrow.R got arguments: \n')
+
+sampath <- args[1] # read first argument from Terminal = path to SAM file
+cat('\t Path to SAM file = ', sampath, '\n')
+
+min_phred <- args[2] # read second argument from Terminal = minimum Phred score
+cat('\t Minimum Phred score = ', min_phred, '\n')
+
+min_readspan <- args[3] # read third argument from Terminal = minimum reference span
+cat('\t Minimum reference span = ', min_readspan, 'bp \n')
+
+max_softprop <- args[4] # read fourth argument from Terminal = maximum proportion soft-clipped
+cat('\t Maximum proportion soft-clipped ', max_softprop, '\n')
+
+outpath <- args[5] # read fifth argument from Terminal = name for output (output being list of reads to remove)
+cat('\t Output =', outpath, '\n')
+
+
+# run the main function = namesOfSoftReads()
+namesOfReadsToThrow(sampath=sampath,
+                    min_phred=min_phred,
+                    min_readspan=min_readspan,
+                    max_softprop=max_softprop,
+                    outpath=outpath)
\ No newline at end of file