|
| 1 | +from bed_maker_supplementary import * |
| 2 | + |
| 3 | +import sys |
| 4 | +import argparse |
| 5 | +import csv |
| 6 | +from Bio import SeqIO |
| 7 | + |
| 8 | +# adapted from : https://github.com/PapenfussLab/sv_benchmark/blob/master/breakdancer2vcf.py |
| 9 | + |
| 10 | +if __name__ == '__main__': |
| 11 | + parser = argparse.ArgumentParser( |
| 12 | + prog='HockBedToVCF', description='Reformat the bedmaker output to VCF') |
| 13 | + parser.add_argument('-i', '--inFile', type=str, nargs='+', |
| 14 | + help='file containing summary bed file') |
| 15 | + parser.add_argument('-r', '--reference', type=str, nargs='+', |
| 16 | + help='base reference fasta') |
| 17 | + parser.add_argument('-p', '--hasPlasmid', type=bool, nargs='?', |
| 18 | + const=True, default=False, |
| 19 | + help='indicate if ref has plasmids') |
| 20 | + # parser.add_argument('-c', '--contigs', type=str, nargs='+', |
| 21 | + # help='contigs and legths of contigs in form id:length') |
| 22 | + args = parser.parse_args() |
| 23 | + |
| 24 | +in_file = vars(args).get("inFile")[0] |
| 25 | +reference = vars(args).get("reference")[0] |
| 26 | +has_plasmids = vars(args).get("hasPlasmid") |
| 27 | +plasmid_length = 0 |
| 28 | + |
| 29 | +print(args) |
| 30 | + |
| 31 | +if in_file is None or reference is None: |
| 32 | + parser.print_help() |
| 33 | + sys.exit(1) |
| 34 | + |
| 35 | +import os |
| 36 | + |
| 37 | +vcf_path = in_file.replace(".bed","-complex.vcf") |
| 38 | + |
| 39 | + |
| 40 | +vcf_file = open(vcf_path, "w") |
| 41 | + |
| 42 | +vcf_file.write("##fileformat=VCFv4.2\n") |
| 43 | + |
| 44 | +vcf_file.write("##ALT=<ID=DEL,Description=\"Deletion\">\n") |
| 45 | +vcf_file.write("##ALT=<ID=DUP,Description=\"Duplication\">\n") |
| 46 | +vcf_file.write("##ALT=<ID=INV,Description=\"Inversion\">\n") |
| 47 | +vcf_file.write("##ALT=<ID=BND,Description=\"Translocation\">\n") |
| 48 | +vcf_file.write("##ALT=<ID=INS,Description=\"Insertion\">\n") |
| 49 | + |
| 50 | +vcf_file.write("##FILTER=<ID=PASS,Description=\"All filters passed\">\n") |
| 51 | + |
| 52 | +vcf_file.write("##INFO=<ID=CHR2,Number=1,Type=String,Description=\"Chromosome for POS2 coordinate in case of an inter-chromosomal translocation\">\n") |
| 53 | +vcf_file.write("##INFO=<ID=POS2,Number=1,Type=Integer,Description=\"Genomic position for CHR2 in case of an inter-chromosomal translocation\">\n") |
| 54 | +vcf_file.write("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the structural variant\">\n") |
| 55 | +vcf_file.write("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"BE for breakend and BND for breakpoint \">\n") |
| 56 | +vcf_file.write("##INFO=<ID=SV_DEF,Number=1,Type=String,Description=\"Definition of complex SV \">\n") |
| 57 | +vcf_file.write("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Length of SV \">\n") |
| 58 | +vcf_file.write("##INFO=<ID=EVENT,Number=1,Type=String,Description=\"EVENT the SV belongs to \">\n") |
| 59 | +vcf_file.write("##INFO=<ID=MATEID,Number=1,Type=String,Description=\"ID of mate breakend or breakpoint \">\n") |
| 60 | +vcf_file.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n") |
| 61 | + |
| 62 | +vcf_file.write(f"##reference={vars(args).get('reference')[0]}\n") |
| 63 | + |
| 64 | +# for cont in vars(args).get("contigs"): |
| 65 | +# conts = cont.split(":") |
| 66 | +# vcf_file.write(f"##contig=<ID={conts[0]},length={conts[1]}>\n") |
| 67 | + |
| 68 | +referenceFile = open(reference, 'r') |
| 69 | + |
| 70 | +for rec in SeqIO.parse(referenceFile, 'fasta'): |
| 71 | + name = rec.id |
| 72 | + seq = rec.seq |
| 73 | + seqLen = len(rec) |
| 74 | + vcf_file.write(f"##contig=<ID={name},length={seqLen}>\n") |
| 75 | + if has_plasmids and name == "plasmid": |
| 76 | + plasmid_length = seqLen |
| 77 | + print(plasmid_length) |
| 78 | + |
| 79 | +vcf_file.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tCHR1\n") |
| 80 | +def toVcfBreakend(localChr, localPos, localPositive, remoteChr, remotePos, remotePositive): |
| 81 | + if remotePositive: |
| 82 | + remote = "]" + remoteChr + ":" + str(remotePos) + "]" |
| 83 | + else: |
| 84 | + remote = "[" + remoteChr + ":" + str(remotePos) + "[" |
| 85 | + if localPositive: |
| 86 | + return "N" + remote |
| 87 | + else: |
| 88 | + return remote + "N" |
| 89 | + |
| 90 | +with open(in_file) as file: |
| 91 | + bed = csv.reader(file, delimiter="\t") |
| 92 | + for idx, line in enumerate(bed): |
| 93 | + chr1 = line[0] |
| 94 | + start = int(line[1]) |
| 95 | + end = int(line[2]) |
| 96 | + sv_def = line[3] |
| 97 | + insertion = line[4] |
| 98 | + |
| 99 | + if sv_def == options_dict.get("del"): |
| 100 | + vcf_file.write(f"{chr1}\t{start}\tDEL_{str(idx)}\tN\t<DEL>\t.\tPASS\tCHR2={chr1};END={end};SVLEN={str(end-start)};SVTYPE=DEL;SV_DEF={sv_def};EVENT=DEL_{str(idx)}\tGT\t1/1\n") |
| 101 | + elif sv_def == options_dict.get("ins"): |
| 102 | + if has_plasmids: |
| 103 | + vcf_file.write(f"{chr1}\t{start}\tINS_{str(idx)}a\tN\t{toVcfBreakend(chr1,start,True,'plasmid',0,False)}\t.\tPASS\tCHR2={chr1};END={end};SVLEN={str(end-start)};SVTYPE=BND;SV_DEF={sv_def};EVENT=INS_{str(idx)}\tGT\t1/1\n") |
| 104 | + vcf_file.write(f"{chr1}\t{end}\tINS_{str(idx)}b\tN\t{toVcfBreakend(chr1,end,False,'plasmid',plasmid_length,True)}\t.\tPASS\tCHR2={chr1};END={end};SVLEN={str(end-start)};SVTYPE=BND;SV_DEF={sv_def};EVENT=INS_{str(idx)}\tGT\t1/1\n") |
| 105 | + else: |
| 106 | + vcf_file.write(f"{chr1}\t{start}\tINS_{str(idx)}\tN\t<INS>\t.\tPASS\tCHR2={chr1};END={end};SVLEN={str(end-start)};SVTYPE=INS;SV_DEF={sv_def};EVENT=INS_{str(idx)}\tGT\t1/1\n") |
| 107 | + elif sv_def == options_dict.get("inv"): |
| 108 | + vcf_file.write(f"{chr1}\t{start}\tINV_{str(idx)}\tN\t<INV>\t.\tPASS\tCHR2={chr1};END={end};SVLEN={str(end-start)};SVTYPE=INV;SV_DEF={sv_def};EVENT=INV_{str(idx)}\tGT\t1/1\n") |
| 109 | + elif sv_def == options_dict.get("tdu"): |
| 110 | + vcf_file.write(f"{chr1}\t{start}\tTDU_{str(idx)}\tN\t<DUP>\t.\tPASS\tCHR2={chr1};END={end};SVLEN={str(end-start)};SVTYPE=DUP;SV_DEF={sv_def};EVENT=TDU_{str(idx)}\tGT\t1/1\n") |
| 111 | + elif sv_def == options_dict.get("itd"): |
| 112 | + vcf_file.write(f"{chr1}\t{start}\tITD_{str(idx)}b\tN\t<DUP>\t.\tPASS\tCHR2={chr1};END={end};SVLEN={str(end-start)};SVTYPE=BND;SV_DEF={sv_def};EVENT=ITD_{str(idx)}\tGT\t1/1\n") |
| 113 | + elif sv_def in (options_dict.get("tcp"), options_dict.get("txp"), options_dict.get("tci"), options_dict.get("txi"), options_dict.get("chr")): |
| 114 | + inserts = insertion.split(":") |
| 115 | + chr2 = inserts[1] |
| 116 | + pos2 = int(inserts[2]) |
| 117 | + direction = inserts[3] |
| 118 | + |
| 119 | + if sv_def in {options_dict.get("tcp"), options_dict.get("tci")}: |
| 120 | + vcf_file.write(f"{chr2}\t{pos2}\tTCP_{str(idx)}a\tN\t<DUP>\t.\tPASS\tCHR2={chr2};END={pos2+1};SVLEN=0;SVTYPE=BND;SV_DEF={sv_def};EVENT=TCP_{str(idx)}\tGT\t1/1\n") |
| 121 | + elif sv_def in {options_dict.get("txp"), options_dict.get("txi")}: |
| 122 | + vcf_file.write(f"{chr2}\t{pos2}\tTCP_{str(idx)}a\tN\t<TRA>\t.\tPASS\tCHR2={chr2};END={pos2+1};SVLEN=0;SVTYPE=BND;SV_DEF={sv_def};EVENT=TCP_{str(idx)}\tGT\t1/1\n") |
| 123 | + elif sv_def == options_dict.get("chr"): |
| 124 | + if direction == "forward": |
| 125 | + vcf_file.write(f"{chr1}\t{start}\tCHR_{str(idx)}a\tN\t{toVcfBreakend(chr1,start,True,chr2,pos2+1,False)}\t.\tPASS\tCHR2={chr2};END={pos2+1};SVLEN=0;SVTYPE=BND;SV_DEF={sv_def};EVENT=CHR_{str(idx)}\tGT\t1/1\n") |
| 126 | + vcf_file.write(f"{chr1}\t{end+1}\tCHR_{str(idx)}b\tN\t{toVcfBreakend(chr1,end+1,False,chr2,pos2,True)}\t.\tPASS\tCHR2={chr2};END={pos2};SVLEN=0;SVTYPE=BND;SV_DEF={sv_def};EVENT=CHR_{str(idx)}\tGT\t1/1\n") |
| 127 | + else: |
| 128 | + vcf_file.write(f"{chr1}\t{start}\tCHR_{str(idx)}a\tN\t{toVcfBreakend(chr1,start,True,chr2,pos2,True)}\t.\tPASS\tCHR2={chr2};END={pos2};SVLEN=0;SVTYPE=BND;SV_DEF={sv_def};EVENT=CHR_{str(idx)}\tGT\t1/1\n") |
| 129 | + vcf_file.write(f"{chr1}\t{end+1}\tCHR_{str(idx)}b\tN\t{toVcfBreakend(chr1,end+1,False,chr2,pos2+1,False)}\t.\tPASS\tCHR2={chr2};END={pos2+1};SVLEN=0;SVTYPE=BND;SV_DEF={sv_def};EVENT=CHR_{str(idx)}\tGT\t1/1\n") |
| 130 | + elif sv_def == options_dict.get("sub"): |
| 131 | + if has_plasmids: |
| 132 | + vcf_file.write(f"{chr1}\t{start}\tSUB_{str(idx)}a\tN\t{toVcfBreakend(chr1,start,True,'plasmid',0,False)}\t.\tPASS\tCHR2={chr1};END={start};SVLEN=0;SVTYPE=BE;SV_DEF={sv_def};EVENT=SUB_{str(idx)}\tGT\t1/1\n") |
| 133 | + vcf_file.write(f"{chr1}\t{end}\tSUB_{str(idx)}b\tN\t{toVcfBreakend(chr1,start,False,'plasmid',plasmid_length,True)}\t.\tPASS\tCHR2={chr1};END={end};SVLEN=0;SVTYPE=BE;SV_DEF={sv_def};EVENT=SUB_{str(idx)}\tGT\t1/1\n") |
| 134 | + else: |
| 135 | + vcf_file.write(f"{chr1}\t{start}\tSUB_{str(idx)}a\tN\tN]{chr1}:{start}]\t.\tPASS\tCHR2={chr1};END={start};SVLEN=0;SVTYPE=BE;SV_DEF={sv_def};EVENT=SUB_{str(idx)}\tGT\t1/1\n") |
| 136 | + vcf_file.write(f"{chr1}\t{end}\tSUB_{str(idx)}b\tN\t[{chr1}:{end}[N\t.\tPASS\tCHR2={chr1};END={end};SVLEN=0;SVTYPE=BE;SV_DEF={sv_def};EVENT=SUB_{str(idx)}\tGT\t1/1\n") |
| 137 | + elif sv_def == options_dict.get("tcs"): |
| 138 | + inserts = insertion.split(":") |
| 139 | + chr2 = inserts[1] |
| 140 | + pos1 = int(inserts[0]) |
| 141 | + pos2 = int(inserts[2]) |
| 142 | + direction = inserts[3] |
| 143 | + |
| 144 | + if direction == "forward": |
| 145 | + vcf_file.write(f"{chr1}\t{start}\tTCS_{str(idx)}a\tN\t{toVcfBreakend(chr1,start,True,chr2,pos1,False)}\t.\tPASS\tCHR2={chr2};END={pos1};SVLEN=0;SVTYPE=BND;SV_DEF={sv_def};EVENT=TCS_{str(idx)}\tGT\t1/1\n") |
| 146 | + vcf_file.write(f"{chr1}\t{end}\tTCS_{str(idx)}b\tN\t{toVcfBreakend(chr1,end,False,chr2,pos2+1,True)}\t.\tPASS\tCHR2={chr2};END={pos2+1};SVLEN=0;SVTYPE=BND;SV_DEF={sv_def};EVENT=TCS_{str(idx)}\tGT\t1/1\n") |
| 147 | + else: |
| 148 | + vcf_file.write(f"{chr1}\t{start}\tTCS_{str(idx)}a\tN\t{toVcfBreakend(chr1,start,True,chr2,pos2+1,True)}\t.\tPASS\tCHR2={chr2};END={end};SVLEN=0;SVTYPE=BND;SV_DEF={sv_def};EVENT=TSI_{str(idx)}\tGT\t1/1\n") |
| 149 | + vcf_file.write(f"{chr1}\t{end}\tTCS_{str(idx)}b\tN\t{toVcfBreakend(chr1,end,False,chr2,pos1,False)}\t.\tPASS\tCHR2={chr2};END={end};SVLEN=0;SVTYPE=BND;SV_DEF={sv_def};EVENT=TSI_{str(idx)}\tGT\t1/1\n") |
| 150 | + else: |
| 151 | + print("UNHANDLED VARIANT CALL: " + sv_def + " " + options_dict.get("tci")) |
| 152 | + |
| 153 | +vcf_file.close() |
0 commit comments