-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcombineInterpretations.py
72 lines (60 loc) · 2.81 KB
/
combineInterpretations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import pandas as pd
def renameProteins(cols_to_rename,somadict):
# Rename proteins
new_cols = []
for s in cols_to_rename:
if 'seq' in s:
if 'ratio' in s:
s1 = s.split('_seq')[1]
new_s1 = '-'.join(s1.split('.')[1:])
try:
this_gene1 = somadict[somadict['SeqID'] == new_s1].loc[:, 'GeneID'].values[0]
except:
this_gene2 = s1
s2 = s.split('_seq')[2]
new_s2 = '-'.join(s2.split('.')[1:])
try:
this_gene2 = somadict[somadict['SeqID'] == new_s2].loc[:, 'GeneID'].values[0]
except:
this_gene2 = s2
new_cols.append(this_gene1 + '/' + this_gene2)
else:
try:
new_s = '-'.join(s.split('.')[1:])
this_gene = somadict[somadict['SeqID'] == new_s].loc[:, 'GeneID'].values[0]
new_cols.append(this_gene)
except:
new_cols.append(s)
else:
# clinical feature
new_cols.append(s)
return new_cols
# Which data should be evaluated
output = 'Prediction_output'
model = 'RF'
names = ['PACS_6M_woDys_from_6MProteomics_withHealthy_RF_withFCorr_mutualProteins']
# 'PACS_6M_woDys_from_1MClinicalProteomics_withHealthy_'+model+'_withFCorr',
# 'PACS_6M_woDys_from_6MClinicalProteomics_withHealthy_'+model+'_withFCorr',
# 'PACS_6M_woDys_from_1Mand6MClinicalProteomics_withHealthy_'+model+'_withFCorr',
# 'PACS_12M_from_6MClinicalProteomics_withHealthy_'+model+'_withFCorr']
n_splits = 5
somadict = pd.read_excel('Data/SomaScanDict.xlsx')
somadict.columns = somadict.iloc[0,:]
somadict.drop(0,inplace = True)
for name in names:
folder = os.path.join(output,name)
# Load shap analysis results
this_importance_all = pd.read_csv(os.path.join(folder, name +'_cv'+str(1) + '_importance_val_all.csv'), index_col=0)
this_importance_all.set_index('col_name', inplace=True)
df_importance = pd.DataFrame(index = this_importance_all.index, columns = [cv for cv in range(0,n_splits)])
for cv in range(0,n_splits):
this_importance = pd.read_csv(os.path.join(folder, name+'_cv'+str(cv) + '_importance_val_all.csv'), index_col=0)
this_importance.set_index('col_name', inplace=True)
df_importance.loc[this_importance.index,cv] = this_importance.loc[:,'feature_importance_vals'].rank(ascending = False)
df_importance['sum'] = df_importance.loc[:,[0,1,2,3,4]].sum(axis =1)
df_importance= df_importance.sort_values('sum',ascending = True)
# Column names
new_cols = renameProteins(df_importance.index,somadict)
df_importance.index = new_cols
df_importance.to_csv(os.path.join(output,'eval', name +'_importance_val_all.csv'))