|
19 | 19 | parent_parser.add_argument("--unannotated", help='BED file of unannotated regions', type=str)
|
20 | 20 | parent_parser.add_argument("--distance_threshold", help='Distance threshold for merging breakpoints', type=int, default=10)
|
21 | 21 | parent_parser.add_argument("--min_support", help='Minimum read support for reporting gene fusion', type=int, default=2)
|
22 |
| - parent_parser.add_argument("--ref", help='Reference FASTA file', type=str, default='') |
| 22 | + parent_parser.add_argument("--transcripts", help='Transcripts FASTA file', type=str, default='') |
23 | 23 |
|
24 | 24 |
|
25 | 25 | detect_parser = main_subparsers.add_parser("detect", parents=[parent_parser],
|
|
37 | 37 | add_help=True,
|
38 | 38 | help="Merge and filter gene fusions using custom parameters using pre-computed read level pickled files.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
39 | 39 | merge_filter_parser.add_argument('--pickle', help="Read level pickled files from detect module", required=True)
|
40 |
| - |
41 |
| - |
42 |
| - |
43 |
| - |
| 40 | + |
44 | 41 | print('Command: python %s\n' %(' '.join(sys.argv)))
|
45 | 42 |
|
46 | 43 | args = parser.parse_args()
|
|
59 | 56 |
|
60 | 57 | if args.option=='detect':
|
61 | 58 | from src import detect
|
62 |
| - total_output, output, gene_id_to_name, gene_df=detect.call_manager(args) |
| 59 | + total_output, output, gene_id_to_name, gene_df, gene_transcript_map, gene_strand_map, get_gene_exons, non_coding_gene_id, raw_exons = detect.call_manager(args) |
| 60 | + |
63 | 61 |
|
64 | 62 | else:
|
65 | 63 | with open(args.pickle, 'rb') as handle:
|
66 | 64 | output=pickle.load(handle)
|
67 | 65 |
|
68 | 66 | gene_df=pd.read_csv(args.gff, sep='\t', comment='#', header=None)
|
69 | 67 | gene_df.rename(columns={0:'chrom', 2:'feature', 3:'start', 4:'end',6:'strand', 8:'info'}, inplace=True)
|
70 |
| - gene_df['gene_id']=gene_df['info'].str.extract(r'gene_id=([^;]+)') |
| 68 | + gene_df['gene_id']=gene_df['info'].str.extract(r'gene_id=([^;]+)')[0].str.extract(r"([^.]+)") |
| 69 | + gene_df['transcript_id']=gene_df['info'].str.extract(r'transcript_id=([^;]+)')[0].str.extract(r"([^.]+)") |
71 | 70 | gene_df['gene_name']=gene_df['info'].str.extract(r'gene_name=([^;]+)')
|
| 71 | + |
| 72 | + gene_tr_df=gene_df[['gene_id', 'transcript_id']].dropna().groupby('gene_id')['transcript_id'].apply(lambda x: sorted(list(set(x)))).reset_index() |
| 73 | + |
| 74 | + gene_transcript_map={x:y for x,y in zip(gene_tr_df['gene_id'], gene_tr_df['transcript_id'])} |
| 75 | + |
72 | 76 | gene_df=gene_df[gene_df.feature=='gene']
|
73 | 77 | gene_id_to_name={x:y for x,y in zip(gene_df.gene_id, gene_df.gene_name)}
|
74 | 78 |
|
75 | 79 |
|
76 | 80 | print('{}: Clustering gene fusions.'.format(str(datetime.datetime.now())))
|
77 |
| - final_gf_double_bp=post_process.get_GFs(output, gene_id_to_name, args.ref, gene_df, args.distance_threshold, args.min_support, args.threads) |
| 81 | + final_gf_double_bp=post_process.get_GFs(output, gene_transcript_map, args.transcripts, gene_df, non_coding_gene_id, args.distance_threshold, args.min_support, args.threads) |
78 | 82 |
|
79 | 83 | print('{}: Saving gene fusions results.'.format(str(datetime.datetime.now())))
|
80 | 84 |
|
81 | 85 | with open(os.path.join(output_path,args.prefix+'.final_gf_double_bp.pickle'), 'wb') as handle:
|
82 | 86 | pickle.dump(final_gf_double_bp, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
83 | 87 |
|
84 | 88 |
|
85 |
| - header="\t".join(["gene_fusion", "read_support", "num_annotated", "genes_overlap", "consistent", "readthrough", "gene_1_name", "gene_1_id", "chr_bp1", "pos_bp1", "range_bp1", "mapq_bp1", "max_len_bp1", "region_type_bp1", "gene_2_name", "gene_2_id", "chr_bp2", "pos_bp2", "range_bp2", "mapq_bp2", "max_len_bp2", "region_type_bp2"]) |
| 89 | + header="\t".join(["gene_fusion", "read_support", "genes_overlap", "consistent", "readthrough",\ |
| 90 | + "gene_1_name", "gene_1_id", "chr_bp1", "pos_bp1", "range_bp1", "mapq_bp1", "max_len_bp1", "region_type_bp1", "gene1_best_transcript", "gene1_transcript_mapq",\ |
| 91 | + "gene_2_name", "gene_2_id", "chr_bp2", "pos_bp2", "range_bp2", "mapq_bp2", "max_len_bp2", "region_type_bp2", "gene2_best_transcript", "gene2_transcript_mapq",\ |
| 92 | + 'gene1_sig_num_matches', 'gene1_sig_max_match', 'gene1_sig_score', 'gene1_sig_pval', 'gene1_sig_zscore',\ |
| 93 | + 'gene2_sig_num_matches', 'gene2_sig_max_match', 'gene2_sig_score', 'gene2_sig_pval', 'gene2_sig_zscore']) |
86 | 94 |
|
87 | 95 | with open(os.path.join(output_path,args.prefix+'.final_gf_double_bp'), 'w') as ann_file:
|
88 | 96 | ann_file.write(header+'\n')
|
89 | 97 |
|
90 | 98 | for k,v in sorted(final_gf_double_bp.items(), key=lambda x: x[1]['read_support'], reverse=True):
|
91 | 99 | gene_fusion="{}::{}".format(v['median_breakpoint_1'][0], v['median_breakpoint_2'][0])
|
92 |
| - read_support, num_annotated, genes_overlap, consistent, readthrough= v['read_support'], v['annotated'], v['genes_overlap'], v['consistent'], v['readthrough'] |
93 |
| - rec="\t".join(str(x) for x in [gene_fusion, read_support, num_annotated, genes_overlap, consistent, readthrough, *v['median_breakpoint_1'], *v['median_breakpoint_2']]) |
| 100 | + read_support, genes_overlap, consistent, readthrough= v['read_support'], v['genes_overlap'], v['consistent'], v['readthrough'] |
| 101 | + rec="\t".join(str(x) for x in [gene_fusion, read_support, genes_overlap, consistent, readthrough, *v['median_breakpoint_1'], *v['gene1_best_transcript'], *v['median_breakpoint_2'], *v['gene2_best_transcript'], *v['gene1_sig'].values(), *v['gene2_sig'].values()]) |
94 | 102 | ann_file.write(rec+'\n')
|
| 103 | + |
| 104 | + |
| 105 | + if not args.gf_only: |
| 106 | + final_skips, single_transcript_skips, single_transcript_non_repeat_non_skips, single_transcript_repeat_non_skips, multi_transcript_isoforms, reads_to_check=post_process.process_isoforms(args.bam, gene_strand_map, raw_exons, get_gene_exons, args.transcripts) |
| 107 | + |
| 108 | + print('{}: Saving novel isoforms results.'.format(str(datetime.datetime.now()))) |
| 109 | + with open(os.path.join(output_path,args.prefix+'.single_transcript_skips.pickle'), 'wb') as handle: |
| 110 | + pickle.dump(single_transcript_skips, handle, protocol=pickle.HIGHEST_PROTOCOL) |
| 111 | + |
| 112 | + with open(os.path.join(output_path,args.prefix+'.single_transcript_non_repeat_non_skips.pickle'), 'wb') as handle: |
| 113 | + pickle.dump(single_transcript_non_repeat_non_skips, handle, protocol=pickle.HIGHEST_PROTOCOL) |
| 114 | + |
| 115 | + with open(os.path.join(output_path,args.prefix+'.single_transcript_repeat_non_skips.pickle'), 'wb') as handle: |
| 116 | + pickle.dump(single_transcript_repeat_non_skips, handle, protocol=pickle.HIGHEST_PROTOCOL) |
| 117 | + |
| 118 | + with open(os.path.join(output_path,args.prefix+'.multi_transcript_isoforms.pickle'), 'wb') as handle: |
| 119 | + pickle.dump(multi_transcript_isoforms, handle, protocol=pickle.HIGHEST_PROTOCOL) |
| 120 | + |
| 121 | + with open(os.path.join(output_path,args.prefix+'.final_skips.pickle'), 'wb') as handle: |
| 122 | + pickle.dump(final_skips, handle, protocol=pickle.HIGHEST_PROTOCOL) |
95 | 123 |
|
96 | 124 | print('Time elapsed={}s'.format(time.time()-t))
|
0 commit comments