Skip to content

Commit d958f3b

Browse files
committed
best GF transcript, novel isoform output added, updated gene selection criterion
1 parent 5490ee6 commit d958f3b

File tree

5 files changed

+615
-198
lines changed

5 files changed

+615
-198
lines changed

InFuse.py

+39-11
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
parent_parser.add_argument("--unannotated", help='BED file of unannotated regions', type=str)
2020
parent_parser.add_argument("--distance_threshold", help='Distance threshold for merging breakpoints', type=int, default=10)
2121
parent_parser.add_argument("--min_support", help='Minimum read support for reporting gene fusion', type=int, default=2)
22-
parent_parser.add_argument("--ref", help='Reference FASTA file', type=str, default='')
22+
parent_parser.add_argument("--transcripts", help='Transcripts FASTA file', type=str, default='')
2323

2424

2525
detect_parser = main_subparsers.add_parser("detect", parents=[parent_parser],
@@ -37,10 +37,7 @@
3737
add_help=True,
3838
help="Merge and filter gene fusions using custom parameters using pre-computed read level pickled files.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
3939
merge_filter_parser.add_argument('--pickle', help="Read level pickled files from detect module", required=True)
40-
41-
42-
43-
40+
4441
print('Command: python %s\n' %(' '.join(sys.argv)))
4542

4643
args = parser.parse_args()
@@ -59,38 +56,69 @@
5956

6057
if args.option=='detect':
6158
from src import detect
62-
total_output, output, gene_id_to_name, gene_df=detect.call_manager(args)
59+
total_output, output, gene_id_to_name, gene_df, gene_transcript_map, gene_strand_map, get_gene_exons, non_coding_gene_id, raw_exons = detect.call_manager(args)
60+
6361

6462
else:
6563
with open(args.pickle, 'rb') as handle:
6664
output=pickle.load(handle)
6765

6866
gene_df=pd.read_csv(args.gff, sep='\t', comment='#', header=None)
6967
gene_df.rename(columns={0:'chrom', 2:'feature', 3:'start', 4:'end',6:'strand', 8:'info'}, inplace=True)
70-
gene_df['gene_id']=gene_df['info'].str.extract(r'gene_id=([^;]+)')
68+
gene_df['gene_id']=gene_df['info'].str.extract(r'gene_id=([^;]+)')[0].str.extract(r"([^.]+)")
69+
gene_df['transcript_id']=gene_df['info'].str.extract(r'transcript_id=([^;]+)')[0].str.extract(r"([^.]+)")
7170
gene_df['gene_name']=gene_df['info'].str.extract(r'gene_name=([^;]+)')
71+
72+
gene_tr_df=gene_df[['gene_id', 'transcript_id']].dropna().groupby('gene_id')['transcript_id'].apply(lambda x: sorted(list(set(x)))).reset_index()
73+
74+
gene_transcript_map={x:y for x,y in zip(gene_tr_df['gene_id'], gene_tr_df['transcript_id'])}
75+
7276
gene_df=gene_df[gene_df.feature=='gene']
7377
gene_id_to_name={x:y for x,y in zip(gene_df.gene_id, gene_df.gene_name)}
7478

7579

7680
print('{}: Clustering gene fusions.'.format(str(datetime.datetime.now())))
77-
final_gf_double_bp=post_process.get_GFs(output, gene_id_to_name, args.ref, gene_df, args.distance_threshold, args.min_support, args.threads)
81+
final_gf_double_bp=post_process.get_GFs(output, gene_transcript_map, args.transcripts, gene_df, non_coding_gene_id, args.distance_threshold, args.min_support, args.threads)
7882

7983
print('{}: Saving gene fusions results.'.format(str(datetime.datetime.now())))
8084

8185
with open(os.path.join(output_path,args.prefix+'.final_gf_double_bp.pickle'), 'wb') as handle:
8286
pickle.dump(final_gf_double_bp, handle, protocol=pickle.HIGHEST_PROTOCOL)
8387

8488

85-
header="\t".join(["gene_fusion", "read_support", "num_annotated", "genes_overlap", "consistent", "readthrough", "gene_1_name", "gene_1_id", "chr_bp1", "pos_bp1", "range_bp1", "mapq_bp1", "max_len_bp1", "region_type_bp1", "gene_2_name", "gene_2_id", "chr_bp2", "pos_bp2", "range_bp2", "mapq_bp2", "max_len_bp2", "region_type_bp2"])
89+
header="\t".join(["gene_fusion", "read_support", "genes_overlap", "consistent", "readthrough",\
90+
"gene_1_name", "gene_1_id", "chr_bp1", "pos_bp1", "range_bp1", "mapq_bp1", "max_len_bp1", "region_type_bp1", "gene1_best_transcript", "gene1_transcript_mapq",\
91+
"gene_2_name", "gene_2_id", "chr_bp2", "pos_bp2", "range_bp2", "mapq_bp2", "max_len_bp2", "region_type_bp2", "gene2_best_transcript", "gene2_transcript_mapq",\
92+
'gene1_sig_num_matches', 'gene1_sig_max_match', 'gene1_sig_score', 'gene1_sig_pval', 'gene1_sig_zscore',\
93+
'gene2_sig_num_matches', 'gene2_sig_max_match', 'gene2_sig_score', 'gene2_sig_pval', 'gene2_sig_zscore'])
8694

8795
with open(os.path.join(output_path,args.prefix+'.final_gf_double_bp'), 'w') as ann_file:
8896
ann_file.write(header+'\n')
8997

9098
for k,v in sorted(final_gf_double_bp.items(), key=lambda x: x[1]['read_support'], reverse=True):
9199
gene_fusion="{}::{}".format(v['median_breakpoint_1'][0], v['median_breakpoint_2'][0])
92-
read_support, num_annotated, genes_overlap, consistent, readthrough= v['read_support'], v['annotated'], v['genes_overlap'], v['consistent'], v['readthrough']
93-
rec="\t".join(str(x) for x in [gene_fusion, read_support, num_annotated, genes_overlap, consistent, readthrough, *v['median_breakpoint_1'], *v['median_breakpoint_2']])
100+
read_support, genes_overlap, consistent, readthrough= v['read_support'], v['genes_overlap'], v['consistent'], v['readthrough']
101+
rec="\t".join(str(x) for x in [gene_fusion, read_support, genes_overlap, consistent, readthrough, *v['median_breakpoint_1'], *v['gene1_best_transcript'], *v['median_breakpoint_2'], *v['gene2_best_transcript'], *v['gene1_sig'].values(), *v['gene2_sig'].values()])
94102
ann_file.write(rec+'\n')
103+
104+
105+
if not args.gf_only:
106+
final_skips, single_transcript_skips, single_transcript_non_repeat_non_skips, single_transcript_repeat_non_skips, multi_transcript_isoforms, reads_to_check=post_process.process_isoforms(args.bam, gene_strand_map, raw_exons, get_gene_exons, args.transcripts)
107+
108+
print('{}: Saving novel isoforms results.'.format(str(datetime.datetime.now())))
109+
with open(os.path.join(output_path,args.prefix+'.single_transcript_skips.pickle'), 'wb') as handle:
110+
pickle.dump(single_transcript_skips, handle, protocol=pickle.HIGHEST_PROTOCOL)
111+
112+
with open(os.path.join(output_path,args.prefix+'.single_transcript_non_repeat_non_skips.pickle'), 'wb') as handle:
113+
pickle.dump(single_transcript_non_repeat_non_skips, handle, protocol=pickle.HIGHEST_PROTOCOL)
114+
115+
with open(os.path.join(output_path,args.prefix+'.single_transcript_repeat_non_skips.pickle'), 'wb') as handle:
116+
pickle.dump(single_transcript_repeat_non_skips, handle, protocol=pickle.HIGHEST_PROTOCOL)
117+
118+
with open(os.path.join(output_path,args.prefix+'.multi_transcript_isoforms.pickle'), 'wb') as handle:
119+
pickle.dump(multi_transcript_isoforms, handle, protocol=pickle.HIGHEST_PROTOCOL)
120+
121+
with open(os.path.join(output_path,args.prefix+'.final_skips.pickle'), 'wb') as handle:
122+
pickle.dump(final_skips, handle, protocol=pickle.HIGHEST_PROTOCOL)
95123

96124
print('Time elapsed={}s'.format(time.time()-t))

0 commit comments

Comments
 (0)