Skip to content

Commit 453b74a

Browse files
committed
new script for breseq only
implement breseq downsampling
1 parent 79a4442 commit 453b74a

File tree

5 files changed

+331
-1
lines changed

5 files changed

+331
-1
lines changed

Diff for: breseq/breseq.sh

+3-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ threads="$6"
1111
tools_dir="$7"
1212
timing="$8"
1313

14-
1514
# Think about using bwa BAM files, running bowtie on all files will take forever, especially with so many settings ?!
1615

1716
log_eval $PWD "docker run --name=breseq -u 1001:1001 -v $(pwd)/:$(pwd) -w $outdir pvstodghill/breseq:0.35.7__2021-08-03 breseq \
@@ -21,6 +20,9 @@ log_eval $PWD "docker run --name=breseq -u 1001:1001 -v $(pwd)/:$(pwd) -w $outdi
2120
-j $threads \
2221
$fastq1 $fastq2"
2322

23+
log_eval $PWD "docker run --rm --name=python3.8 -v $(pwd):$(pwd) -v $tools_dir:$tools_dir -w $outdir python:3.8-buster python3 $tools_dir/breseq/breseqToVCF.py \
24+
-i $outdir/data/output.gd -r $fasta -c LT962478.1:2263458 LT962479.1:1827941"
25+
2426
START=$(docker inspect --format='{{.State.StartedAt}}' breseq)
2527
STOP=$(docker inspect --format='{{.State.FinishedAt}}' breseq)
2628

Diff for: breseq/breseqToVCF.py

+133
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import glob
2+
import sys
3+
import argparse
4+
import csv
5+
import os
6+
7+
# adapted from : https://github.com/PapenfussLab/sv_benchmark/blob/master/breakdancer2vcf.py
8+
9+
if __name__ == '__main__':
10+
parser = argparse.ArgumentParser(
11+
prog='bdmaxToVCF', description='Reformat bdmax output to bedpe')
12+
parser.add_argument('-i', '--inFile', type=str, nargs='+',
13+
help='file containing bdmax tsv file')
14+
parser.add_argument('-r', '--reference', type=str, nargs='+',
15+
help='base reference fasta')
16+
parser.add_argument('-c', '--contigs', type=str, nargs='+',
17+
help='contigs and legths of contigs in form id:length')
18+
args = parser.parse_args()
19+
20+
in_file = vars(args).get("inFile")[0]
21+
22+
print(args)
23+
24+
if in_file is None:
25+
parser.print_help()
26+
sys.exit(1)
27+
28+
vcf_path = in_file.replace(".gd", ".vcf")
29+
30+
vcf_file = open(vcf_path, "w")
31+
32+
vcf_file.write("##fileformat=VCFv4.2\n")
33+
34+
vcf_file.write("##ALT=<ID=DEL,Description=\"Deletion\">\n")
35+
vcf_file.write("##ALT=<ID=DUP,Description=\"Duplication\">\n")
36+
vcf_file.write("##ALT=<ID=INV,Description=\"Inversion\">\n")
37+
vcf_file.write("##ALT=<ID=BND,Description=\"Translocation\">\n")
38+
vcf_file.write("##ALT=<ID=INS,Description=\"Insertion\">\n")
39+
40+
vcf_file.write("##FILTER=<ID=PASS,Description=\"All filters passed\">\n")
41+
42+
vcf_file.write(
43+
"##INFO=<ID=CHR2,Number=1,Type=String,Description=\"Chromosome for POS2 coordinate in case of an inter-chromosomal translocation\">\n")
44+
vcf_file.write(
45+
"##INFO=<ID=POS2,Number=1,Type=Integer,Description=\"Genomic position for CHR2 in case of an inter-chromosomal translocation\">\n")
46+
vcf_file.write("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the structural variant\">\n")
47+
vcf_file.write("##INFO=<ID=PE,Number=1,Type=Integer,Description=\"Paired-end support of the structural variant\">\n")
48+
vcf_file.write("##INFO=<ID=SR,Number=1,Type=Integer,Description=\"Split-read support\">\n")
49+
vcf_file.write("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"sv_type of structural variant\">\n")
50+
51+
vcf_file.write(f"##reference={vars(args).get('reference')[0]}\n")
52+
53+
for cont in vars(args).get("contigs"):
54+
conts = cont.split(":")
55+
vcf_file.write(f"##contig=<ID={conts[0]},length={conts[1]}>\n")
56+
57+
file_name = os.path.basename(in_file)
58+
59+
sv_sv_type = "."
60+
61+
62+
def toVcfBreakend(localChr, localPos, localPositive, remoteChr, remotePos, remotePositive):
63+
if remotePositive:
64+
remote = "]" + remoteChr + ":" + str(remotePos) + "]"
65+
else:
66+
remote = "[" + remoteChr + ":" + str(remotePos) + "["
67+
if localPositive:
68+
return "N" + remote
69+
else:
70+
return remote + "N"
71+
72+
with open(in_file) as file:
73+
tsv = csv.reader(file, delimiter="\t")
74+
for idx, line in enumerate(tsv):
75+
if line[0].startswith("#Chr1"):
76+
vcf_file.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\n")
77+
elif line[0].startswith("#"):
78+
vcf_file.write("#"+'\t'.join(line) + "\n")
79+
else:
80+
sv_type = line[0]
81+
sv_id = line[1]
82+
leftChr = line[3]
83+
leftPos = int(line[4])
84+
rightChr = leftChr
85+
rightPos = leftPos
86+
id = sv_type + "_" + sv_id
87+
88+
if sv_type == "MC":
89+
rightPos = int(line[5])
90+
leftOrientation = True
91+
rightOrientation = False
92+
93+
vcf_file.write(f"{leftChr}\t{leftPos}\t{id}a\tN\tN]{leftChr}:{leftPos}]\t.\tPASS\tCHR2={leftChr};END={leftPos};SVLEN=0;SVTYPE=BE;\n")
94+
vcf_file.write(f"{leftChr}\t{rightPos}\t{id}b\tN\t[{leftChr}:{rightPos}[N\t.\tPASS\tCHR2={leftChr};END={rightPos};SVLEN=0;SVTYPE=BE;\n")
95+
96+
elif sv_type == "JC":
97+
leftOrientation = int(line[5]) > 0
98+
rightChr = line[6]
99+
rightPos = int(line[7])
100+
rightOrientation = int(line[9]) > 0
101+
102+
vcf_file.write(
103+
f"{leftChr}\t{leftPos}\t{id}\tN\t{toVcfBreakend(leftChr, leftPos, leftOrientation, rightChr, rightPos, rightOrientation)}\t.\tPASS\tIMPRECISE;END={rightPos};SVLEN=0;SVTYPE=BP" + "\t.\n"
104+
# ;CIPOS={min(0, rightPos - leftPos - size)},{max(0, rightPos - leftPos - size)}
105+
)
106+
elif sv_type == "DEL":
107+
size = int(line[5])
108+
rightPos = leftPos + size
109+
leftOrientation = True
110+
rightOrientation = False
111+
112+
vcf_file.write(
113+
f"{leftChr}\t{leftPos}\t{id}\tN\t<DEL>\t.\tPASS\tIMPRECISE;END={rightPos};SVLEN={size};SVTYPE=DEL" + "\t.\n"
114+
# ;CIPOS={min(0, rightPos - leftPos - size)},{max(0, rightPos - leftPos - size)}
115+
)
116+
elif sv_type == "INS":
117+
rightPos = leftPos + 1
118+
leftOrientation = True
119+
rightOrientation = False
120+
121+
insert = line[5]
122+
size = len(insert)
123+
124+
vcf_file.write(
125+
f"{leftChr}\t{leftPos}\t{id}\tN\t<INS>\t.\tPASS\tIMPRECISE;END={rightPos};SVLEN={size};SVTYPE=INS" + "\t.\n"
126+
# ;CIPOS={min(0, rightPos - leftPos - size)},{max(0, rightPos - leftPos - size)}
127+
)
128+
elif sv_type == "UN":
129+
print("UN evidence, not processed")
130+
else:
131+
print("UNHANDLED VARIANT CALL")
132+
133+
vcf_file.close()

Diff for: breseq/breseq_bam.sh

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/bin/bash
2+
3+
threads=8
4+
5+
bam="$1"
6+
fasta="$2"
7+
fastq1="$3"
8+
fastq2="$4"
9+
outdir="$5"
10+
threads="$6"
11+
tools_dir="$7"
12+
timing="$8"
13+
14+
# Think about using bwa BAM files, running bowtie on all files will take forever, especially with so many settings ?!
15+
16+
log_eval $PWD "docker run --rm -v $(pwd):/in/ -w /in/ staphb/samtools:1.15 samtools sort -n -o ${bam/.bam/_nsort.bam} $bam"
17+
18+
log_eval $PWD "docker run --name=breseq -u 1001:1001 -v $(pwd)/:$(pwd) -w $outdir pvstodghill/breseq:0.35.7__2021-08-03 breseq \
19+
-o $outdir -r $fasta \
20+
--brief-html-output \
21+
--aligned-sam \
22+
--no-javascript \
23+
-j $threads \
24+
${bam/.bam/_nsort.bam}"
25+
26+
if [ -s "${bam/.bam/_nsort.bam}" ]; then
27+
rm "${bam/.bam/_nsort.bam}"
28+
fi
29+
30+
log_eval $PWD "docker run --rm --name=python3.8 -v $(pwd):$(pwd) -v $tools_dir:$tools_dir -w $outdir python:3.8-buster python3 $tools_dir/breseq/breseqToVCF.py \
31+
-i $outdir/data/output.gd -r $fasta -c LT962478.1:2263458 LT962479.1:1827941"
32+
33+
34+
START=$(docker inspect --format='{{.State.StartedAt}}' breseq)
35+
STOP=$(docker inspect --format='{{.State.FinishedAt}}' breseq)
36+
37+
START_TIMESTAMP=$(date --date=$START +%s)
38+
STOP_TIMESTAMP=$(date --date=$STOP +%s)
39+
40+
FTIME=$(($STOP_TIMESTAMP-$START_TIMESTAMP))
41+
42+
echo final time: $FTIME seconds
43+
echo -e "${outdir}\t${FTIME}" | tee -a "$timing"
44+
45+
docker container rm breseq

Diff for: callSvBreseq.sh

+149
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
#!/bin/bash
2+
3+
OPTSTRING="hr:s:i:o:t:l:c:"
4+
5+
usage()
6+
{
7+
echo -e "You did it wrong"
8+
}
9+
10+
declare SWITCH
11+
threads=8
12+
13+
# Examine individual options
14+
while getopts "$OPTSTRING" SWITCH; do
15+
case $SWITCH in
16+
17+
r) ref="$OPTARG"
18+
ref=$(readlink -e "$ref")
19+
echo "Reference = $ref"
20+
;;
21+
22+
s) settings="$OPTARG"
23+
echo "Settings = $settings"
24+
;;
25+
26+
i) fasta_dir="$OPTARG"
27+
fasta_dir=$(readlink -e "$fasta_dir")
28+
echo "Fasta Directory = $fasta_dir"
29+
;;
30+
31+
o) out_dir="$OPTARG"
32+
out_dir=$(readlink -e "$out_dir")
33+
echo "Outdir = $out_dir"
34+
;;
35+
36+
t) tools_dir="$OPTARG"
37+
tools_dir=$(readlink -e "$tools_dir")
38+
echo "Tools directory = $tools_dir"
39+
;;
40+
41+
l) tools_list="$OPTARG"
42+
declare -a tools=($tools_list)
43+
echo "Going to run the following tools = $tools"
44+
;;
45+
46+
c) threads="$OPTARG"
47+
echo "Threads = $threads"
48+
;;
49+
50+
*) echo "script error: unhandled argument"
51+
usage
52+
exit 1
53+
;;
54+
55+
56+
esac
57+
done
58+
59+
source $tools_dir/log_eval.sh
60+
61+
SAMBAMBA="docker run -u 1001:1001 --name sambamba --rm -v $PWD:$PWD -w $PWD clinicalgenomics/sambamba:0.8.0"
62+
63+
tool="breseq"
64+
declare -a fractionOfReads=(100)
65+
seed=87
66+
67+
stamp="$(date +'%Y_%d_%m-%H_%M_%S')"
68+
org="CBS7435"
69+
70+
grep -v '^#' $settings | while IFS=$'\t' read -r -a settings_array
71+
do
72+
73+
base_in=$(basename "$fasta_dir")
74+
75+
settings_string="${settings_array[0]}_f${settings_array[1]}_l${settings_array[2]}_m${settings_array[3]}_s${settings_array[4]}"
76+
bamdir=$out_dir/${base_in/fasta/bam}/$settings_string
77+
fastqdir=$out_dir/${base_in/fasta/fastq}/$settings_string
78+
svsdir=$out_dir/${base_in/fasta/svs}/$settings_string
79+
80+
echo "THIS IS the DIRECTORY: $fastqdir"
81+
82+
timing=${svsdir}/${stamp}_timing.tsv
83+
log=${svsdir}/${stamp}_sv_calling.log
84+
85+
echo "THIS IS the DIRECTORY: $bamdir"
86+
87+
if [ ! -d $svsdir ]; then
88+
mkdir $svsdir
89+
fi
90+
touch $timing
91+
touch $log
92+
93+
for dir in "$fasta_dir"/*; do
94+
95+
if [[ -d $dir ]]; then
96+
base=$(basename "$dir")
97+
98+
if [ ! -d $svsdir/$base ]; then
99+
mkdir $svsdir/$base
100+
fi
101+
102+
READ1_FILE="${fastqdir}/${base}_1.fq.gz"
103+
READ2_FILE="${fastqdir}/${base}_2.fq.gz"
104+
105+
echo "╔══════════════════════════════════════════════════════════════╗"
106+
echo "║ starting SV analysis ║"
107+
echo "╚══════════════════════════════════════════════════════════════╝"
108+
109+
if [ -s "$READ1_FILE" -a -s "$READ2_FILE" ]; then
110+
111+
for fraction in "${fractionOfReads[@]}"; do
112+
113+
# READ1_FILE_FRACTION=$READ1_FILE
114+
# READ2_FILE_FRACTION=$READ2_FILE
115+
116+
tool_outdir="$svsdir/$base/${tool}_${fraction}"
117+
118+
BAM_SORTED="$svsdir/$base/${tool}_100/base/reference.bam"
119+
BAM_FRACTION="NONE"
120+
121+
if [ $fraction -ne 100 ]; then
122+
BAM_FRACTION="${BAM_SORTED/.bam/-${fraction}.bam}"
123+
log_eval $PWD "$SAMBAMBA sambamba view -h -t $threads -s 0.$fraction -f bam \
124+
--subsampling-seed=$seed -o $BAM_FRACTION $BAM_SORTED"
125+
fi
126+
127+
# For teh sake of consistency
128+
129+
if [ ! -d "$tool_outdir" ]; then
130+
mkdir "$tool_outdir"
131+
if [ "$BAM_FRACTION" = "NONE" ]; then
132+
export -f log_eval
133+
log_eval $PWD "$tools_dir/$tool/${tool}.sh $BAM_FRACTION $ref $READ1_FILE $READ2_FILE $tool_outdir $threads $tools_dir $timing" "$log"
134+
elif [ -s "$BAM_SORTED" ]; then
135+
export -f log_eval
136+
log_eval $PWD "$tools_dir/$tool/breseq_bam.sh $BAM_FRACTION $ref $READ1_FILE $READ2_FILE $tool_outdir $threads $tools_dir $timing" "$log"
137+
fi
138+
fi
139+
140+
# if [ $fraction -ne 100 ]; then
141+
# rm $BAM_FRACTION
142+
# fi
143+
done
144+
else
145+
echo "Could not find files $READ1_FILE, $READ2_FILE"
146+
fi
147+
fi
148+
done
149+
done

Diff for: callSvFromFastq.sh

+1
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ do
120120
# log_eval $PWD "$SAMBAMBA sambamba view -h -t $threads -s 0.$fraction -f bam \
121121
# --subsampling-seed=$seed -o $BAM_FRACTION $BAM_SORTED"
122122
# fi
123+
123124
# For teh sake of consistency
124125
BAM_FRACTION="NONE"
125126

0 commit comments

Comments
 (0)