Skip to content

Commit

Permalink
feat: added python cli for SV
Browse files Browse the repository at this point in the history
  • Loading branch information
dnousome committed Dec 8, 2023
1 parent f5ecd20 commit e7bc934
Show file tree
Hide file tree
Showing 17 changed files with 2,087 additions and 935 deletions.
65 changes: 65 additions & 0 deletions bin/add_gt_lofreq.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash
#Author: Dr Charles Foster http://github.com/charlesfoster

i_flag=''
g_flag=''
h_flag=''
n_flag=''
o_flag=''

print_usage() {
printf "Usage: bash add_artificial_genotype.sh -i in.vcf.gz [-g genotype] [-n sample_name] -o out.vcf.gz\n"
printf "Genotype defaults to 1 if not specified\n"
printf "Sample name gussed from infile name if not specified\n"
}

if [[ $# -eq 0 ]] ; then
print_usage
exit 1
fi

while getopts 'i:g:n:ho:' flag; do
case "${flag}" in
i) IN="${OPTARG}" ;;
n) NAME="${OPTARG}" ;;
g) GENOTYPE="${OPTARG}" ;;
h) print_usage
exit 1 ;;
o) OUT="${OPTARG}" ;;
*) print_usage
exit 1 ;;
esac
done

if [ ! -f ${IN} ]; then
printf "\nError: input file not found\n"
print_usage
exit 1
fi

if [ -z "${GENOTYPE}" ]
then
printf "\nNo genotype specified: setting to 1"
GENOTYPE=1
fi

if [ -z "${NAME}" ]
then
NAME=$(basename ${IN} | cut -f1 -d ".")
printf "\nNo name specified: guessed it to be ${NAME}"
fi

if [ -z "${OUT}" ]
then
OUT=$(echo ${IN} | sed "s/.vcf.gz/_withGT.vcf.gz/")
printf "\nNo outfile specified: setting to ${OUT}\n"
fi

gunzip -kc ${IN} | \
sed -e '6i##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' \
-e "s|FILTER\tINFO|FILTER\tINFO\tFORMAT\t${NAME}|g" | \
awk -F'\t' -v genotype=${GENOTYPE} -v OFS="\t" '/^[^#]/{ $9 = "GT"; $10 = genotype }1' | \
bgzip -c > ${OUT}
tabix -p vcf ${OUT}
printf "VCF with artificial genotype written to ${OUT}\n"
exit 0
47 changes: 47 additions & 0 deletions bin/ascat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env Rscript

#######
#
#R Script for ASCAT
#
######
library(ASCAT)
library(RColorBrewer)

args = commandArgs(trailingOnly=TRUE)
tumor_bam=args[1]
normal_bam=args[2]

genome="hg38"

ascat.prepareHTS(
tumourseqfile = tumor_bam,
normalseqfile = normal_bam,
tumourname = tumor_name,
normalname = normal_name,
allelecounter_exe = "/PATH/TO/allelecounter",
alleles.prefix = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/ASCAT/G1000_alleles",
loci.prefix = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/ASCAT_G1000_loci",
nthreads = 10
gender = "XX",
genomeVersion = genome,
nthreads = 8,
tumourLogR_file = "Tumor_LogR.txt",
tumourBAF_file = "Tumor_BAF.txt",
normalLogR_file = "Germline_LogR.txt",
normalBAF_file = "Germline_BAF.txt")

ascat.bc = ascat.loadData(Tumor_LogR_file = "Tumor_LogR.txt", Tumor_BAF_file = "Tumor_BAF.txt",
Germline_LogR_file = "Germline_LogR.txt", Germline_BAF_file = "Germline_BAF.txt", gender = 'XX', genomeVersion = "hg19")

ascat.plotRawData(ascat.bc, img.prefix = "Before_correction_")
ascat.bc = ascat.correctLogR(ascat.bc, GCcontentfile = "GC_file.txt", replictimingfile = "RT_file.txt")
ascat.plotRawData(ascat.bc, img.prefix = "After_correction_")
ascat.bc = ascat.aspcf(ascat.bc)
ascat.plotSegmentedData(ascat.bc)
ascat.output = ascat.runAscat(ascat.bc, gamma=1, write_segments = T)
QC = ascat.metrics(ascat.bc,ascat.output)
save(ascat.bc, ascat.output, QC, file = 'ASCAT_objects.Rdata')


#####
59 changes: 59 additions & 0 deletions bin/assess_significance.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env Rscript

library(rtracklayer)

args <- commandArgs()

dataTable <-read.table(args[5], header=TRUE);
ratio<-data.frame(dataTable)

dataTable <- read.table(args[4], header=FALSE)
cnvs<- data.frame(dataTable)

ratio$Ratio[which(ratio$Ratio==-1)]=NA

cnvs.bed=GRanges(cnvs[,1],IRanges(cnvs[,2],cnvs[,3]))
ratio.bed=GRanges(ratio$Chromosome,IRanges(ratio$Start,ratio$Start),score=ratio$Ratio)

overlaps <- subsetByOverlaps(ratio.bed,cnvs.bed)
normals <- setdiff(ratio.bed,cnvs.bed)
normals <- subsetByOverlaps(ratio.bed,normals)

#mu <- mean(score(normals),na.rm=TRUE)
#sigma<- sd(score(normals),na.rm=TRUE)

#hist(score(normals),n=500,xlim=c(0,2))
#hist(log(score(normals)),n=500,xlim=c(-1,1))

#shapiro.test(score(normals)[which(!is.na(score(normals)))][5001:10000])
#qqnorm (score(normals)[which(!is.na(score(normals)))],ylim=(c(0,10)))
#qqline(score(normals)[which(!is.na(score(normals)))], col = 2)

#shapiro.test(log(score(normals))[which(!is.na(score(normals)))][5001:10000])
#qqnorm (log(score(normals))[which(!is.na(score(normals)))],ylim=(c(-6,10)))
#qqline(log(score(normals))[which(!is.na(score(normals)))], col = 2)

numberOfCol=length(cnvs)
wscore=c()
kscore=c()
for (i in c(1:length(cnvs[,1]))) {
values <- score(subsetByOverlaps(ratio.bed,cnvs.bed[i]))
resultw <- class(try(wilcox.test(values,score(normals)), silent = TRUE))
ifelse(resultw == "try-error", wscore <- c(wscore, "NA"), wscore <- c(wscore, wilcox.test(values,score(normals))$p.value))
resultks <- class(try(ks.test(values,score(normals)), silent = TRUE))
ifelse(resultks == "try-error",kscore <- c(kscore, "NA"),kscore <- c(kscore, ks.test(values,score(normals))$p.value))
}
cnvs = cbind(cnvs, as.numeric(wscore), as.numeric(kscore))

if (numberOfCol==7) {
names(cnvs)=c("chr","start","end","copy number","status","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
}
if (numberOfCol==9) {
names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
}
if (numberOfCol==11) {
names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","somatic/germline","precentageOfGermline","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
}
write.table(cnvs, file=paste(args[4],".p.value.txt",sep=""),sep="\t",quote=F,row.names=F)


43 changes: 43 additions & 0 deletions bin/freec_paired.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/perl -w
use strict;
use List::Util 'shuffle';

#INPUT

#my $mergedmaf = $ARGV[1] . '_out/oncotator_out/' . $ARGV[1] . '_merged.maf'; #to fix...
#open C, ">$mergedmaf";

my $outfile = $ARGV[0] . '/freec_genome_config.txt';
my $chrLenFile = $ARGV[1];
my $chrFiles = $ARGV[2];
my $tumormateFile = $ARGV[3];
my $controlmateFile = $ARGV[4];
my $makePileup = $ARGV[5];
my $fastaFile = $ARGV[6];
my $SNPfile = $ARGV[7];

open C, ">$outfile";

print C '[general]' . "\n\n";

print C "BedGraphOutput = TRUE\ndegree = 3\nforceGCcontentNormalization = 0\nminCNAlength = 1\nreadCountThreshold = 10\n";
print C "chrLenFile = $chrLenFile\n";
print C "ploidy = 2,3,4\nbreakPointThreshold = 0.8\nwindow = 50000\n";
print C "chrFiles = $chrFiles\n";
print C "minimalSubclonePresence = 20\nmaxThreads = 8\n";
print C "outputDir = $ARGV[0]\n\n";

print C '[sample]' . "\n\n";

print C "mateFile = $tumormateFile\n";
print C "inputFormat = BAM\nmateOrientation = FR\n\n";

print C '[BAF]' . "\n\n";

print C "mateFile = $controlmateFile\n";
print C "inputFormat = BAM\nmateOrientation = FR\n\n";

print C "makePileup = $makePileup\n";
print C "fastaFile = $fastaFile\n";
print C "minimalCoveragePerPosition = 20\nminimalQualityPerPosition = 20\n";
print C "SNPfile = $SNPfile";
32 changes: 32 additions & 0 deletions bin/lofreq_convert.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
INPUT_FILE="$1"
TUMOR_NAME="$2"
export TUMOR_NAME

zcat "${INPUT_FILE}" \
| awk '($4=="A" || $4 == "C" || $4=="T" || $4=="G" || /^\#/)' \
| perl -ne 'print if /^#|^(chr)*[\dX]+\s.+/' \
| perl -ne 's/AF=/VAF=/g;s/ID=AF/ID=VAF/;print;' \
| perl -ne '
# Add 2 new rows to the description and 2 new columns in the header
if(/^#/){
if(/##INFO=<ID=DP,.+\n/){
$DP=$&;
};
$DP =~ s/INFO/FORMAT/;
print $DP if /min_dp/;
if(/##INFO=<ID=DP4,.+\n/){
$DP4=$&;
};
$DP4 =~ s/INFO/FORMAT/;
print $DP4 if /min_dp/;
if(/^#CHROM.+/){
s/$&/$&\tFORMAT\t$ENV{'TUMOR_NAME'}/;
};
print;
}
# For each feature, add FORMAT column with descriptors and populate TUMOUR column with depth, reads counts
else{
my @data = map { chomp; [ split /=|;/ ] } $_;
$NEW_ROW = "$_\tDP:DP4\t$data[0][1]:$data[0][7]\n";
print $NEW_ROW;
}'
134 changes: 134 additions & 0 deletions bin/makeGraph.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/usr/bin/env Rscript

args <- commandArgs()

dataTable <-read.table(args[5], header=TRUE);

ratio<-data.frame(dataTable)
ploidy <- type.convert(args[4])


png(filename = paste(args[5],".log2.png",sep = ""), width = 1180, height = 1180,
units = "px", pointsize = 20, bg = "white", res = NA)
plot(1:10)
op <- par(mfrow = c(5,5))

for (i in c(1:22,'X','Y')) {
tt <- which(ratio$Chromosome==i)
if (length(tt)>0) {
plot(ratio$Start[tt],log2(ratio$Ratio[tt]),xlab = paste ("position, chr",i),ylab = "normalized copy number profile (log2)",pch = ".",col = colors()[88])
tt <- which(ratio$Chromosome==i & ratio$CopyNumber>ploidy )
points(ratio$Start[tt],log2(ratio$Ratio[tt]),pch = ".",col = colors()[136])


tt <- which(ratio$Chromosome==i & ratio$CopyNumber<ploidy & ratio$CopyNumber!= -1)
points(ratio$Start[tt],log2(ratio$Ratio[tt]),pch = ".",col = colors()[461])
tt <- which(ratio$Chromosome==i)

#UNCOMMENT HERE TO SEE THE PREDICTED COPY NUMBER LEVEL:
#points(ratio$Start[tt],log2(ratio$CopyNumber[tt]/ploidy), pch = ".", col = colors()[24],cex=4)

}
tt <- which(ratio$Chromosome==i)

#UNCOMMENT HERE TO SEE THE EVALUATED MEDIAN LEVEL PER SEGMENT:
#points(ratio$Start[tt],log2(ratio$MedianRatio[tt]), pch = ".", col = colors()[463],cex=4)

}

dev.off()


png(filename = paste(args[5],".png",sep = ""), width = 1180, height = 1180,
units = "px", pointsize = 20, bg = "white", res = NA)
plot(1:10)
op <- par(mfrow = c(5,5))

maxLevelToPlot <- 3
for (i in c(1:length(ratio$Ratio))) {
if (ratio$Ratio[i]>maxLevelToPlot) {
ratio$Ratio[i]=maxLevelToPlot;
}
}


for (i in c(1:22,'X','Y')) {
tt <- which(ratio$Chromosome==i)
if (length(tt)>0) {
plot(ratio$Start[tt],ratio$Ratio[tt]*ploidy,ylim = c(0,maxLevelToPlot*ploidy),xlab = paste ("position, chr",i),ylab = "normalized copy number profile",pch = ".",col = colors()[88])
tt <- which(ratio$Chromosome==i & ratio$CopyNumber>ploidy )
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136])

tt <- which(ratio$Chromosome==i & ratio$Ratio==maxLevelToPlot & ratio$CopyNumber>ploidy)
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4)

tt <- which(ratio$Chromosome==i & ratio$CopyNumber<ploidy & ratio$CopyNumber!= -1)
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[461])
tt <- which(ratio$Chromosome==i)

#UNCOMMENT HERE TO SEE THE PREDICTED COPY NUMBER LEVEL:
#points(ratio$Start[tt],ratio$CopyNumber[tt], pch = ".", col = colors()[24],cex=4)

}
tt <- which(ratio$Chromosome==i)

#UNCOMMENT HERE TO SEE THE EVALUATED MEDIAN LEVEL PER SEGMENT:
#points(ratio$Start[tt],ratio$MedianRatio[tt]*ploidy, pch = ".", col = colors()[463],cex=4)

}

dev.off()




if (length(args)>5) {
dataTable <-read.table(args[6], header=TRUE);
BAF<-data.frame(dataTable)

png(filename = paste(args[6],".png",sep = ""), width = 1180, height = 1180,
units = "px", pointsize = 20, bg = "white", res = NA)
plot(1:10)
op <- par(mfrow = c(5,5))

for (i in c(1:22,'X','Y')) {
tt <- which(BAF$Chromosome==i)
if (length(tt)>0){
lBAF <-BAF[tt,]
plot(lBAF$Position,lBAF$BAF,ylim = c(-0.1,1.1),xlab = paste ("position, chr",i),ylab = "BAF",pch = ".",col = colors()[1])

tt <- which(lBAF$A==0.5)
points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[92])
tt <- which(lBAF$A!=0.5 & lBAF$A>=0)
points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[62])
tt <- 1
pres <- 1

if (length(lBAF$A)>4) {
for (j in c(2:(length(lBAF$A)-pres-1))) {
if (lBAF$A[j]==lBAF$A[j+pres]) {
tt[length(tt)+1] <- j
}
}
points(lBAF$Position[tt],lBAF$A[tt],pch = ".",col = colors()[24],cex=4)
points(lBAF$Position[tt],lBAF$B[tt],pch = ".",col = colors()[24],cex=4)
}

tt <- 1
pres <- 1
if (length(lBAF$FittedA)>4) {
for (j in c(2:(length(lBAF$FittedA)-pres-1))) {
if (lBAF$FittedA[j]==lBAF$FittedA[j+pres]) {
tt[length(tt)+1] <- j
}
}
points(lBAF$Position[tt],lBAF$FittedA[tt],pch = ".",col = colors()[463],cex=4)
points(lBAF$Position[tt],lBAF$FittedB[tt],pch = ".",col = colors()[463],cex=4)
}

}

}
dev.off()

}
Loading

0 comments on commit e7bc934

Please sign in to comment.