From 38e762657a4154b4092581d183fab4b6be806cc0 Mon Sep 17 00:00:00 2001 From: j0kaso Date: Mon, 7 Jun 2021 14:33:26 +0200 Subject: [PATCH] [mp-stability-pipeline] add LAR and TMspan to prism files --- software/rosetta_ddG_pipeline/helper.py | 15 ++++++-- .../prism_rosetta_parser.py | 38 ++++++++++++++++++- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/software/rosetta_ddG_pipeline/helper.py b/software/rosetta_ddG_pipeline/helper.py index f05ca2b..2e4fc13 100644 --- a/software/rosetta_ddG_pipeline/helper.py +++ b/software/rosetta_ddG_pipeline/helper.py @@ -538,6 +538,12 @@ def generate_emission_stats(test_dir): def generate_output(folder, output_name='ddG.out', sys_name='', version=1, prism_nr='XXX', chain_id='A', output_gaps=False, bfac=True, zip_files=True, sha_tag='', MP=False): # generate emission stats generate_emission_stats(folder.output[:-7]) + if MP: + span_file = glob.glob(os.path.join(folder.prepare_checking[:-9], 'mp_files', 'membrane_span', '*.span'))[0] + lipid_file = glob.glob(os.path.join(folder.prepare_checking[:-9], 'mp_files', 'mp_lipid_acc', '*.json'))[0] + else: + span_file = '' + lipid_file = '' ddg_file = os.path.join(folder.ddG_run, output_name) pdb_file_raw = os.path.join(folder.ddG_input, 'input.pdb') @@ -565,7 +571,8 @@ def generate_output(folder, output_name='ddG.out', sys_name='', version=1, prism ddG_postprocessing(ddg_file, ddg_sorted_file, sec_all=None, startnr=1, chain_id=chain_id) prism_file = os.path.join(folder.ddG_output, f'prism_rosetta_{prism_nr}_{sys_name}.txt') rosetta_to_prism(ddg_sorted_file, prism_file, rosetta_seq, rosetta_info=None, - version=version, sys_name=sys_name, first_residue_number=1, sha_tag=sha_tag, MP=MP) + version=version, sys_name=sys_name, first_residue_number=1, sha_tag=sha_tag, MP=MP, + span_file=span_file, lipid_file=lipid_file) create_copy(prism_file, folder.output) create_copy(pdb_file, folder.output, name=f'{sys_name}_final.pdb') @@ -583,7 +590,8 @@ def generate_output(folder, output_name='ddG.out', sys_name='', version=1, prism prism_gap_shifted_file = os.path.join(folder.ddG_output, f'prism_rosetta_{prism_nr}_{sys_name}_gap-shifted.txt') ddG_postprocessing(ddg_file, ddg_shifted_gap_file, sec_all=sec_all, startnr=first_residue_number, chain_id=chain_id) rosetta_to_prism(ddg_shifted_gap_file, prism_gap_shifted_file, sequence, rosetta_info=None, - version=version, sys_name=sys_name, first_residue_number=first_residue_number, sha_tag=sha_tag, MP=MP) + version=version, sys_name=sys_name, first_residue_number=first_residue_number, sha_tag=sha_tag, MP=MP, + span_file=span_file, lipid_file=lipid_file) create_copy(prism_gap_shifted_file, folder.output) pdb_gap_shifted_file = os.path.join(folder.ddG_output, 'relaxed_gap_shifted.pdb') @@ -595,7 +603,8 @@ def generate_output(folder, output_name='ddG.out', sys_name='', version=1, prism ddG_postprocessing(ddg_file, ddg_gap_file, sec_all=sec_all, startnr=1, chain_id=chain_id) prism_gap_file = os.path.join(folder.ddG_output, f'prism_rosetta_{prism_nr}_{sys_name}-gap.txt') rosetta_to_prism(ddg_gap_file, prism_gap_file, sequence, rosetta_info=None, - version=version, sys_name=sys_name, first_residue_number=1, sha_tag=sha_tag, MP=MP) + version=version, sys_name=sys_name, first_residue_number=1, sha_tag=sha_tag, MP=MP, + span_file=span_file, lipid_file=lipid_file) create_copy(prism_gap_file, folder.output) pdb_gap_file = os.path.join(folder.ddG_output, 'relaxed_gap.pdb') diff --git a/software/rosetta_ddG_pipeline/prism_rosetta_parser.py b/software/rosetta_ddG_pipeline/prism_rosetta_parser.py index 67936aa..a3239f8 100644 --- a/software/rosetta_ddG_pipeline/prism_rosetta_parser.py +++ b/software/rosetta_ddG_pipeline/prism_rosetta_parser.py @@ -7,6 +7,7 @@ # Standard library imports from datetime import datetime +import json import logging as logger import re import subprocess @@ -24,7 +25,17 @@ from PrismData import PrismParser, VariantData -def rosetta_to_prism(ddg_file, prism_file, sequence, rosetta_info=None, version=1, sys_name='', first_residue_number=1, sha_tag='', MP=False): +def span_multi(x, region): + out_tms = [] + for resi in x: + if int(resi) in region: + out_tms.append('True') + else: + out_tms.append('False') + return ":".join(out_tms) + +def rosetta_to_prism(ddg_file, prism_file, sequence, rosetta_info=None, version=1, sys_name='', + first_residue_number=1, sha_tag='', MP=False, span_file='', lipid_file=''): sequence = sequence.replace('-', 'X') # create prism file with rosetta values @@ -53,6 +64,26 @@ def rosetta_to_prism(ddg_file, prism_file, sequence, rosetta_info=None, version= } dataframeset = pd.DataFrame(data) + if span_file!='': + TM_regions = [] + with open(span_file) as fp: + next(fp) + next(fp) + next(fp) + next(fp) + for line in fp: + line = line.strip().split() + TM_regions += range(int(line[0]), int(line[1])+1) + dataframeset['TMspan'] = dataframeset['resi'].apply(lambda x: span_multi(x, TM_regions)) + + if lipid_file!='': + with open(lipid_file, 'r') as fp: + data = json.load(fp) + lipid_df = pd.DataFrame.from_dict( data, orient='index', columns=['LAR']).reset_index(drop=False)#.T.set_index('index') + lipid_df = lipid_df.loc[lipid_df['LAR']=='true'].reset_index(drop=True) + lipid_df = lipid_df['index'].astype(int).unique() + dataframeset['LAR'] = dataframeset['resi'].apply(lambda x: span_multi(x, lipid_df)) + sha = sha_tag.split('tag')[0] tag = sha_tag.split('tag')[1] @@ -85,6 +116,11 @@ def rosetta_to_prism(ddg_file, prism_file, sequence, rosetta_info=None, version= "std_ddG": f"std Rosetta ddG values (std((MUT-mean(WT)){units})", }, } + if span_file!='': + metadata['columns']['TMspan'] = 'Residue within the TM region defined by the Rosetta span file' + if lipid_file!='': + metadata['columns']['LAR'] = 'Lipid accessible residue defined by Rosetta' + if first_residue_number != 1: metadata['protein']['first_residue_number'] = first_residue_number