feat: added python cli for SV

CCBR · Dec 8, 2023 · e7bc934 · e7bc934
1 parent f5ecd20
commit e7bc934
Show file tree

Hide file tree

Showing 17 changed files with 2,087 additions and 935 deletions.
diff --git a/bin/add_gt_lofreq.sh b/bin/add_gt_lofreq.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+#Author: Dr Charles Foster http://github.com/charlesfoster
+
+i_flag=''
+g_flag=''
+h_flag=''
+n_flag=''
+o_flag=''
+
+print_usage() {
+	printf "Usage: bash add_artificial_genotype.sh -i in.vcf.gz [-g genotype] [-n sample_name] -o out.vcf.gz\n"
+	printf "Genotype defaults to 1 if not specified\n"
+	printf "Sample name gussed from infile name if not specified\n"
+}
+
+if [[ $# -eq 0 ]] ; then
+	print_usage
+    exit 1
+fi
+
+while getopts 'i:g:n:ho:' flag; do
+  case "${flag}" in
+		i) IN="${OPTARG}" ;;
+		n) NAME="${OPTARG}" ;;
+		g) GENOTYPE="${OPTARG}" ;;
+    h) print_usage
+           exit 1 ;;
+    o) OUT="${OPTARG}" ;;
+    *) print_usage
+       exit 1 ;;
+  esac
+done
+
+if [ ! -f ${IN} ]; then
+    printf "\nError: input file not found\n"
+		print_usage
+		exit 1
+fi
+
+if [ -z "${GENOTYPE}" ]
+  then
+    printf "\nNo genotype specified: setting to 1"
+		GENOTYPE=1
+fi
+
+if [ -z "${NAME}" ]
+  then
+		NAME=$(basename ${IN} | cut -f1 -d ".")
+    printf "\nNo name specified: guessed it to be ${NAME}"
+fi
+
+if [ -z "${OUT}" ]
+  then
+		OUT=$(echo ${IN} | sed "s/.vcf.gz/_withGT.vcf.gz/")
+    printf "\nNo outfile specified: setting to ${OUT}\n"
+fi
+
+gunzip -kc ${IN} | \
+sed -e '6i##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' \
+-e "s|FILTER\tINFO|FILTER\tINFO\tFORMAT\t${NAME}|g" | \
+awk -F'\t' -v genotype=${GENOTYPE} -v OFS="\t" '/^[^#]/{ $9 = "GT"; $10 = genotype }1' | \
+bgzip -c > ${OUT}
+tabix -p vcf ${OUT}
+printf "VCF with artificial genotype written to ${OUT}\n"
+exit 0
diff --git a/bin/ascat.R b/bin/ascat.R
@@ -0,0 +1,47 @@
+#!/usr/bin/env Rscript
+
+#######
+#
+#R Script for ASCAT
+#
+######
+library(ASCAT)
+library(RColorBrewer)
+
+args = commandArgs(trailingOnly=TRUE)
+tumor_bam=args[1]
+normal_bam=args[2]
+
+genome="hg38"
+
+ascat.prepareHTS(
+  tumourseqfile = tumor_bam,
+  normalseqfile = normal_bam,
+  tumourname = tumor_name,
+  normalname = normal_name,
+  allelecounter_exe = "/PATH/TO/allelecounter",
+  alleles.prefix = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/ASCAT/G1000_alleles",
+  loci.prefix = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/ASCAT_G1000_loci",
+  nthreads = 10
+  gender = "XX",
+  genomeVersion = genome,
+  nthreads = 8,
+  tumourLogR_file = "Tumor_LogR.txt",
+  tumourBAF_file = "Tumor_BAF.txt",
+  normalLogR_file = "Germline_LogR.txt",
+  normalBAF_file = "Germline_BAF.txt")
+
+ascat.bc = ascat.loadData(Tumor_LogR_file = "Tumor_LogR.txt", Tumor_BAF_file = "Tumor_BAF.txt", 
+    Germline_LogR_file = "Germline_LogR.txt", Germline_BAF_file = "Germline_BAF.txt", gender = 'XX', genomeVersion = "hg19")
+
+ascat.plotRawData(ascat.bc, img.prefix = "Before_correction_")
+ascat.bc = ascat.correctLogR(ascat.bc, GCcontentfile = "GC_file.txt", replictimingfile = "RT_file.txt")
+ascat.plotRawData(ascat.bc, img.prefix = "After_correction_")
+ascat.bc = ascat.aspcf(ascat.bc)
+ascat.plotSegmentedData(ascat.bc)
+ascat.output = ascat.runAscat(ascat.bc, gamma=1, write_segments = T)
+QC = ascat.metrics(ascat.bc,ascat.output)
+save(ascat.bc, ascat.output, QC, file = 'ASCAT_objects.Rdata')
+
+
+#####
diff --git a/bin/assess_significance.R b/bin/assess_significance.R
@@ -0,0 +1,59 @@
+#!/usr/bin/env Rscript
+
+library(rtracklayer)
+
+args <- commandArgs()
+
+dataTable <-read.table(args[5], header=TRUE);
+ratio<-data.frame(dataTable)
+
+dataTable <- read.table(args[4], header=FALSE)
+cnvs<- data.frame(dataTable)
+
+ratio$Ratio[which(ratio$Ratio==-1)]=NA
+
+cnvs.bed=GRanges(cnvs[,1],IRanges(cnvs[,2],cnvs[,3]))  
+ratio.bed=GRanges(ratio$Chromosome,IRanges(ratio$Start,ratio$Start),score=ratio$Ratio)
+
+overlaps <- subsetByOverlaps(ratio.bed,cnvs.bed)
+normals <- setdiff(ratio.bed,cnvs.bed)
+normals <- subsetByOverlaps(ratio.bed,normals)
+
+#mu <- mean(score(normals),na.rm=TRUE)
+#sigma<- sd(score(normals),na.rm=TRUE)
+
+#hist(score(normals),n=500,xlim=c(0,2))
+#hist(log(score(normals)),n=500,xlim=c(-1,1))
+
+#shapiro.test(score(normals)[which(!is.na(score(normals)))][5001:10000])
+#qqnorm (score(normals)[which(!is.na(score(normals)))],ylim=(c(0,10)))
+#qqline(score(normals)[which(!is.na(score(normals)))], col = 2)
+
+#shapiro.test(log(score(normals))[which(!is.na(score(normals)))][5001:10000])
+#qqnorm (log(score(normals))[which(!is.na(score(normals)))],ylim=(c(-6,10)))
+#qqline(log(score(normals))[which(!is.na(score(normals)))], col = 2)
+
+numberOfCol=length(cnvs)
+wscore=c()
+kscore=c()
+for (i in c(1:length(cnvs[,1]))) {
+values <- score(subsetByOverlaps(ratio.bed,cnvs.bed[i]))
+resultw <- class(try(wilcox.test(values,score(normals)), silent = TRUE))
+ifelse(resultw == "try-error", wscore <- c(wscore, "NA"), wscore <- c(wscore, wilcox.test(values,score(normals))$p.value))
+resultks <- class(try(ks.test(values,score(normals)), silent = TRUE))
+ifelse(resultks == "try-error",kscore <- c(kscore, "NA"),kscore <- c(kscore, ks.test(values,score(normals))$p.value))
+}
+cnvs = cbind(cnvs, as.numeric(wscore), as.numeric(kscore))
+
+if (numberOfCol==7) {
+  names(cnvs)=c("chr","start","end","copy number","status","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")  
+}
+if (numberOfCol==9) {
+  names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")  
+}
+if (numberOfCol==11) {
+  names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","somatic/germline","precentageOfGermline","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")  
+}
+write.table(cnvs, file=paste(args[4],".p.value.txt",sep=""),sep="\t",quote=F,row.names=F)
+
+
diff --git a/bin/freec_paired.pl b/bin/freec_paired.pl
@@ -0,0 +1,43 @@
+#!/usr/bin/perl -w
+use strict;
+use List::Util 'shuffle';
+
+#INPUT
+
+#my $mergedmaf = $ARGV[1] . '_out/oncotator_out/' . $ARGV[1] . '_merged.maf'; #to fix...
+#open C, ">$mergedmaf";
+
+my $outfile = $ARGV[0] . '/freec_genome_config.txt';
+my $chrLenFile = $ARGV[1];
+my $chrFiles = $ARGV[2];
+my $tumormateFile = $ARGV[3];
+my $controlmateFile = $ARGV[4];
+my $makePileup = $ARGV[5];
+my $fastaFile = $ARGV[6];
+my $SNPfile = $ARGV[7];
+
+open C, ">$outfile";
+
+print C '[general]' . "\n\n";
+
+print C "BedGraphOutput = TRUE\ndegree = 3\nforceGCcontentNormalization = 0\nminCNAlength = 1\nreadCountThreshold = 10\n";
+print C "chrLenFile = $chrLenFile\n";
+print C "ploidy = 2,3,4\nbreakPointThreshold = 0.8\nwindow = 50000\n";
+print C "chrFiles = $chrFiles\n";
+print C "minimalSubclonePresence = 20\nmaxThreads = 8\n";
+print C "outputDir = $ARGV[0]\n\n";
+
+print C '[sample]' . "\n\n";
+
+print C "mateFile = $tumormateFile\n";
+print C "inputFormat = BAM\nmateOrientation = FR\n\n";
+
+print C '[BAF]' . "\n\n";
+
+print C "mateFile = $controlmateFile\n";
+print C "inputFormat = BAM\nmateOrientation = FR\n\n";
+
+print C "makePileup = $makePileup\n";
+print C "fastaFile = $fastaFile\n";
+print C "minimalCoveragePerPosition = 20\nminimalQualityPerPosition = 20\n";
+print C "SNPfile = $SNPfile";
diff --git a/bin/lofreq_convert.sh b/bin/lofreq_convert.sh
@@ -0,0 +1,32 @@
+INPUT_FILE="$1"
+TUMOR_NAME="$2"
+export TUMOR_NAME
+
+zcat "${INPUT_FILE}" \
+  | awk '($4=="A" || $4 == "C" || $4=="T" || $4=="G" || /^\#/)' \
+  | perl -ne 'print if /^#|^(chr)*[\dX]+\s.+/' \
+  | perl -ne 's/AF=/VAF=/g;s/ID=AF/ID=VAF/;print;' \
+  | perl -ne '
+              # Add 2 new rows to the description and 2 new columns in the header
+              if(/^#/){
+                if(/##INFO=<ID=DP,.+\n/){
+                  $DP=$&;
+                };
+                $DP =~ s/INFO/FORMAT/;
+                print $DP if /min_dp/;
+                if(/##INFO=<ID=DP4,.+\n/){
+                  $DP4=$&;
+                };
+                $DP4 =~ s/INFO/FORMAT/;
+                print $DP4 if /min_dp/;
+                if(/^#CHROM.+/){
+                  s/$&/$&\tFORMAT\t$ENV{'TUMOR_NAME'}/;
+                };
+                print;
+              }
+              # For each feature, add FORMAT column with descriptors and populate TUMOUR column with depth, reads counts
+              else{
+                my @data = map { chomp; [ split /=|;/ ] } $_;
+                $NEW_ROW = "$_\tDP:DP4\t$data[0][1]:$data[0][7]\n";
+                print $NEW_ROW;
+              }'
diff --git a/bin/makeGraph.R b/bin/makeGraph.R
@@ -0,0 +1,134 @@
+#!/usr/bin/env Rscript
+
+args <- commandArgs()
+
+dataTable <-read.table(args[5], header=TRUE);
+
+ratio<-data.frame(dataTable)
+ploidy <- type.convert(args[4])
+
+
+png(filename = paste(args[5],".log2.png",sep = ""), width = 1180, height = 1180,
+    units = "px", pointsize = 20, bg = "white", res = NA)
+plot(1:10)
+op <- par(mfrow = c(5,5))
+
+for (i in c(1:22,'X','Y')) {
+	tt <- which(ratio$Chromosome==i)
+	if (length(tt)>0) {
+	 plot(ratio$Start[tt],log2(ratio$Ratio[tt]),xlab = paste ("position, chr",i),ylab = "normalized copy number profile (log2)",pch = ".",col = colors()[88])
+	 tt <- which(ratio$Chromosome==i  & ratio$CopyNumber>ploidy )
+	 points(ratio$Start[tt],log2(ratio$Ratio[tt]),pch = ".",col = colors()[136])
+
+
+	tt <- which(ratio$Chromosome==i  & ratio$CopyNumber<ploidy & ratio$CopyNumber!= -1)
+	 points(ratio$Start[tt],log2(ratio$Ratio[tt]),pch = ".",col = colors()[461])
+	 tt <- which(ratio$Chromosome==i)
+
+	 #UNCOMMENT HERE TO SEE THE PREDICTED COPY NUMBER LEVEL:
+	 #points(ratio$Start[tt],log2(ratio$CopyNumber[tt]/ploidy), pch = ".", col = colors()[24],cex=4)
+
+	}
+	tt <- which(ratio$Chromosome==i)
+
+	#UNCOMMENT HERE TO SEE THE EVALUATED MEDIAN LEVEL PER SEGMENT:
+	#points(ratio$Start[tt],log2(ratio$MedianRatio[tt]), pch = ".", col = colors()[463],cex=4)
+
+}
+
+dev.off()
+
+
+png(filename = paste(args[5],".png",sep = ""), width = 1180, height = 1180,
+    units = "px", pointsize = 20, bg = "white", res = NA)
+plot(1:10)
+op <- par(mfrow = c(5,5))
+
+maxLevelToPlot <- 3
+for (i in c(1:length(ratio$Ratio))) {
+	if (ratio$Ratio[i]>maxLevelToPlot) {
+		ratio$Ratio[i]=maxLevelToPlot;
+	}
+}
+
+
+for (i in c(1:22,'X','Y')) {
+	tt <- which(ratio$Chromosome==i)
+	if (length(tt)>0) {
+	 plot(ratio$Start[tt],ratio$Ratio[tt]*ploidy,ylim = c(0,maxLevelToPlot*ploidy),xlab = paste ("position, chr",i),ylab = "normalized copy number profile",pch = ".",col = colors()[88])
+	 tt <- which(ratio$Chromosome==i  & ratio$CopyNumber>ploidy )
+	 points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136])
+
+	tt <- which(ratio$Chromosome==i  & ratio$Ratio==maxLevelToPlot & ratio$CopyNumber>ploidy)	
+	points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4)
+
+	tt <- which(ratio$Chromosome==i  & ratio$CopyNumber<ploidy & ratio$CopyNumber!= -1)
+	 points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[461])
+	 tt <- which(ratio$Chromosome==i)
+
+	 #UNCOMMENT HERE TO SEE THE PREDICTED COPY NUMBER LEVEL:
+	 #points(ratio$Start[tt],ratio$CopyNumber[tt], pch = ".", col = colors()[24],cex=4)
+
+	}
+	tt <- which(ratio$Chromosome==i)
+
+	#UNCOMMENT HERE TO SEE THE EVALUATED MEDIAN LEVEL PER SEGMENT:
+	#points(ratio$Start[tt],ratio$MedianRatio[tt]*ploidy, pch = ".", col = colors()[463],cex=4)
+
+}
+
+dev.off()
+
+
+
+
+if (length(args)>5) {
+	dataTable <-read.table(args[6], header=TRUE);
+	BAF<-data.frame(dataTable)
+
+	png(filename = paste(args[6],".png",sep = ""), width = 1180, height = 1180,
+	    units = "px", pointsize = 20, bg = "white", res = NA)
+	plot(1:10)
+	op <- par(mfrow = c(5,5))
+
+	for (i in c(1:22,'X','Y')) {
+	    tt <- which(BAF$Chromosome==i)
+	    if (length(tt)>0){
+		lBAF <-BAF[tt,]
+		plot(lBAF$Position,lBAF$BAF,ylim = c(-0.1,1.1),xlab = paste ("position, chr",i),ylab = "BAF",pch = ".",col = colors()[1])
+
+		tt <- which(lBAF$A==0.5)		
+		points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[92])
+		tt <- which(lBAF$A!=0.5 & lBAF$A>=0)
+		points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[62])
+		tt <- 1
+		pres <- 1
+
+		if (length(lBAF$A)>4) {
+			for (j in c(2:(length(lBAF$A)-pres-1))) {
+				if (lBAF$A[j]==lBAF$A[j+pres]) {	
+					tt[length(tt)+1] <- j 
+				}
+			}
+			points(lBAF$Position[tt],lBAF$A[tt],pch = ".",col = colors()[24],cex=4)
+			points(lBAF$Position[tt],lBAF$B[tt],pch = ".",col = colors()[24],cex=4)	
+		}
+
+		tt <- 1
+		pres <- 1
+		if (length(lBAF$FittedA)>4) {
+			for (j in c(2:(length(lBAF$FittedA)-pres-1))) {
+				if (lBAF$FittedA[j]==lBAF$FittedA[j+pres]) {	
+					tt[length(tt)+1] <- j 
+				}
+			}
+			points(lBAF$Position[tt],lBAF$FittedA[tt],pch = ".",col = colors()[463],cex=4)
+			points(lBAF$Position[tt],lBAF$FittedB[tt],pch = ".",col = colors()[463],cex=4)	
+		}
+
+	   }
+
+	}
+	dev.off()
+
+}