@@ -1357,26 +1357,39 @@ def format_tcr_data(self, tcrs, epitopes, pairwise, **kwargs):
13571357 for col in self ._rename_columns .values ():
13581358 df_tcrs = df_tcrs [(~ df_tcrs [col ].isna ()) & (df_tcrs [col ] != 'nan' ) & (df_tcrs [col ] != "" )]
13591359 df_tcrs = df_tcrs [(~ df_tcrs ["organism" ].isna ()) & (df_tcrs ["organism" ] != 'nan' ) & (df_tcrs ["organism" ] != "" )]
1360+ df_tcrs = self .filter_epitopes (df_tcrs , ** kwargs )
13601361 df_tcrs = df_tcrs .drop_duplicates ()
13611362 df_tcrs ["binder" ] = 0
13621363 return df_tcrs
13631364
def filter_epitopes(self, df, repository=None, **kwargs):
    """
    Drop rows whose peptide has no entry in the pretrained NetTCR-2.2 model folder.

    The entries of ``<repository>/models/nettcr_2_2_pretrained`` (except
    ``negative_controls`` and ``cv_pred_df.csv``) are taken as the set of
    peptides for which a model is available.

    :param df: data frame with a ``peptide`` column
    :param repository: path to the NetTCR-2.2 repository; when None, ``self.repository_path`` is used
    :param kwargs: accepted for call compatibility, not used here
    :return: ``df`` itself if every row matches, otherwise a filtered copy
    :raises OSError: if the pretrained model folder cannot be listed
    """
    # NOTE(review): the filter is applied independent of any ``model_type`` in kwargs,
    # although the warning text refers to the categorical model - confirm this is intended.
    if repository is None:
        # os.path.join(None, ...) raises TypeError, so fall back to the configured repository
        repository = self.repository_path
    model_dir = os.path.join(repository, "models", "nettcr_2_2_pretrained")
    # these folder entries are not peptide models
    excluded = {"negative_controls", "cv_pred_df.csv"}
    available = [entry for entry in os.listdir(model_dir) if entry not in excluded]
    mask_epitope = df["peptide"].isin(available).values
    delta = len(df) - int(mask_epitope.sum())
    if delta > 0:
        warnings.warn(f"Filtering {delta} rows as Epitope not available for categorical model")
        df = df[mask_epitope].copy()
    return df
1374+
def save_tmp_files(self, data, **kwargs):
    """
    Write the raw input table to a temporary folder and assemble all file paths.

    :param data: object providing ``to_csv``; written to the raw input path
    :param kwargs: optional ``model`` (default ``"t.0.v.1"``) and ``model_type`` (default ``"pan"``),
        both only used to build the prediction output path
    :return: ``([raw_input, intermediate_input, input, prediction_output], tmp_folder)``
    """
    tmp_folder = self.get_tmp_folder_path()

    def tmp_csv(stage):
        # all input stages live side by side in the temporary folder
        return os.path.join(tmp_folder.name, f"{self.name}_{stage}.csv")

    path_in_raw = tmp_csv("raw_input")
    path_in_intermediate = tmp_csv("intermediate_input")
    path_in = tmp_csv("input")

    model = kwargs.get("model", "t.0.v.1")
    model_type = kwargs.get("model_type", "pan")
    # the prediction file is located inside the repository's model folder, not the temporary folder
    path_out = os.path.join(
        f"{self.repository_path}",
        f"models/nettcr_2_2_{model_type}",
        f"{model}_prediction.csv",
    )

    data.to_csv(path_in_raw)
    return [path_in_raw, path_in_intermediate, path_in, path_out], tmp_folder
13731385
def get_base_cmd(self, filenames, tmp_folder, interpreter=None, conda=None, cmd_prefix=None, **kwargs):
    """
    Build the two command strings of a NetTCR-2.2 run.

    :param filenames: sequence whose first three entries are the raw, intermediate and final input paths
    :param tmp_folder: not used here
    :param interpreter: not used here
    :param conda: not used here
    :param cmd_prefix: not used here
    :param kwargs: optional ``model`` (default ``"t.0.v.1"``) and ``model_type`` (default ``"pan"``)
    :return: ``[cmd_reconstruct, cmd_predict]``
    """
    model_type = kwargs.get("model_type", "pan")
    model = kwargs.get("model", "t.0.v.1")

    # first command: Utils.py located next to this module, called in "nettcr" mode
    utils_script = f"{os.path.dirname(__file__)}/Utils.py"
    cmd_reconstruct = f"{utils_script} nettcr {filenames[0]} {filenames[1]} {filenames[2]}"

    # second command: the repository's predict script, pointed at the folder of the chosen model type
    modeldir = f"{self.repository_path}/models/nettcr_2_2_{model_type}"
    predict_script = f"{self.repository_path}/src/predict.py"
    # trailing space retained so the command string stays as it was assembled before
    predict_args = (
        f"--test_data {filenames[2]} "
        f"--outdir {modeldir} "
        f"--model_name {model} "
        f"--model_type {model_type} "
    )
    cmd_predict = f"{predict_script} {predict_args}"
    return [cmd_reconstruct, cmd_predict]
13811394
13821395 def run_exec_cmd (self , cmd , filenames , interpreter = None , conda = None , cmd_prefix = None , ** kwargs ):
@@ -1565,119 +1578,6 @@ def format_results(self, filenames, tmp_folder, tcrs, epitopes, pairwise, **kwar
15651578 return df_out
15661579
15671580
class NetTCR22Categorical(ARepoTCRSpecificityPrediction):
    """
    Author: Fynbo Jensen, Nielsen
    Paper: https://www.biorxiv.org/content/10.1101/2023.10.12.562001v1.full
    Repo: https://github.com/mnielLab/NetTCR-2.2
    """
    __name = "NetTCR-categorical"
    __version = "2.2"
    __tcr_length = (1, 9999)
    __epitope_length = (1, 12)
    __organism = "H"
    __repo = "https://github.com/mnielLab/NetTCR-2.2.git"

    # column names of the TCR table -> column names used in the predictor input
    _rename_columns = {
        "VDJ_cdr3": "cdr3_beta_aa",
        "VDJ_v_gene": "TRBV_IMGT",
        "VDJ_j_gene": "TRBJ_IMGT",
        "VJ_cdr3": "cdr3_alpha_aa",
        "VJ_v_gene": "TRAV_IMGT",
        "VJ_j_gene": "TRAJ_IMGT",
    }

    @property
    def name(self):
        return self.__name

    @property
    def version(self):
        return self.__version

    @property
    def tcr_length(self):
        return self.__tcr_length

    @property
    def epitope_length(self):
        return self.__epitope_length

    @property
    def repo(self):
        return self.__repo

    @property
    def organism(self):
        return self.__organism

    def format_tcr_data(self, tcrs, epitopes, pairwise, **kwargs):
        """
        Combine TCRs and epitopes into the predictor input table.

        :param tcrs: object providing ``to_pandas(rename_columns=...)``
        :param epitopes: epitopes handed to the combine helpers
        :param pairwise: selects the pairwise or the list-wise combine helper
        :param kwargs: not used here
        :return: de-duplicated data frame with a ``peptide`` column and ``binder`` set to 0
        """
        df_tcrs = tcrs.to_pandas(rename_columns=self._rename_columns)
        if pairwise:
            df_tcrs = self.combine_tcrs_epitopes_pairwise(df_tcrs, epitopes)
        else:
            df_tcrs = self.combine_tcrs_epitopes_list(df_tcrs, epitopes)
        df_tcrs = df_tcrs.rename(columns={"Epitope": "peptide"})
        # keep only rows in which every chain column and the organism are filled
        for col in self._rename_columns.values():
            df_tcrs = df_tcrs[(~df_tcrs[col].isna()) & (df_tcrs[col] != 'nan') & (df_tcrs[col] != "")]
        df_tcrs = df_tcrs[(~df_tcrs["organism"].isna()) & (df_tcrs["organism"] != 'nan') & (df_tcrs["organism"] != "")]
        df_tcrs = df_tcrs.drop_duplicates()
        df_tcrs["binder"] = 0
        return df_tcrs

    def filter_epitopes(self, df, repository=None, **kwargs):
        """
        Drop rows whose epitope has no entry in the pretrained NetTCR-2.2 model folder.

        :param df: data frame with an ``Epitope`` column (a ``peptide`` column is used if
            ``Epitope`` is absent, which is the naming produced by ``format_tcr_data``)
        :param repository: path to the NetTCR-2.2 repository; when None, ``self.repository_path`` is used
        :param kwargs: not used here
        :return: ``df`` itself if every row matches, otherwise a filtered copy
        """
        if repository is None:
            # os.path.join(None, ...) raises TypeError, so fall back to the configured repository
            repository = self.repository_path
        model_dir = os.path.join(repository, "models", "nettcr_2_2_pretrained")
        # these folder entries are not epitope models
        excluded = {"negative_controls", "cv_pred_df.csv"}
        models = [el for el in os.listdir(model_dir) if el not in excluded]
        column = "Epitope" if "Epitope" in df.columns else "peptide"
        mask_epitope = df[column].isin(models).values
        delta = len(df) - int(mask_epitope.sum())
        if delta > 0:
            warnings.warn(f"Filtering {delta} rows as Epitope not available for categorical model")
            df = df[mask_epitope].copy()
        return df

    def save_tmp_files(self, data, **kwargs):
        """
        Write the raw input table to a temporary folder and assemble all file paths.

        :param data: object providing ``to_csv``; written to the raw input path
        :param kwargs: optional ``model`` (default ``"t.0.v.1"``), used for the output file name
        :return: ``([raw_input, intermediate_input, input, prediction_output], tmp_folder)``
        """
        # A former per-epitope loop returned during its first iteration; it only had the effect of
        # requiring an "Epitope" column and of returning None for empty data, so it was removed.
        tmp_folder = self.get_tmp_folder_path()
        path_in_raw = os.path.join(tmp_folder.name, f"{self.name}_raw_input.csv")
        path_in_intermediate = os.path.join(tmp_folder.name, f"{self.name}_intermediate_input.csv")
        path_in = os.path.join(tmp_folder.name, f"{self.name}_input.csv")
        model = kwargs.get("model", "t.0.v.1")
        path_out = os.path.join(tmp_folder.name, f"{model}_prediction.csv")
        data.to_csv(path_in_raw)
        return [path_in_raw, path_in_intermediate, path_in, path_out], tmp_folder

    def get_base_cmd(self, filenames, tmp_folder, interpreter=None, conda=None, cmd_prefix=None, **kwargs):
        """
        Build the two command strings of a NetTCR-2.2 run.

        :param filenames: sequence whose first three entries are the raw, intermediate and final input paths
        :param kwargs: optional ``model`` (default ``"t.0.v.1"``)
        :return: ``[cmd_reconstruct, cmd_predict]``
        """
        path_utils = os.path.dirname(__file__)
        model = kwargs.get("model", "t.0.v.1")
        # NOTE(review): the command still addresses the pan model folder and passes
        # "--model_type pan"; a "model_type" entry in kwargs has no effect - confirm intended.
        modeldir = f"{self.repository_path}/models/nettcr_2_2_pan"
        cmd_reconstruct = f"{path_utils}/Utils.py nettcr {filenames[0]} {filenames[1]} {filenames[2]}"
        cmd_predict = f"{self.repository_path}/src/predict.py --test_data {filenames[2]} --outdir {modeldir} --model_name {model} --model_type pan"
        return [cmd_reconstruct, cmd_predict]

    def run_exec_cmd(self, cmd, filenames, interpreter=None, conda=None, cmd_prefix=None, **kwargs):
        """
        Run the reconstruct command and then the predict command via the parent implementation.

        :param cmd: the two commands as returned by ``get_base_cmd``
        :param filenames: paths as returned by ``save_tmp_files``
        """
        super().run_exec_cmd(cmd[0], [None, filenames[2]], interpreter, conda, cmd_prefix, m_cmd=False, **kwargs)
        super().run_exec_cmd(cmd[1], [None, filenames[3]], interpreter, conda, cmd_prefix, m_cmd=False, **kwargs)

    def format_results(self, filenames, tmp_folder, tcrs, epitopes, pairwise, **kwargs):
        """
        Read the prediction file and convert it to the common output format.

        :param filenames: paths as returned by ``save_tmp_files``; entry 3 is read as CSV
        :return: result of ``transform_output`` on the renamed prediction table
        """
        results_predictor = pd.read_csv(filenames[3])
        results_predictor = results_predictor.fillna("")
        joining_list = ["VJ_cdr3", "VDJ_cdr3", "VDJ_v_gene", "VDJ_j_gene", "VJ_v_gene", "VJ_j_gene", "Epitope"]
        # undo the input renaming and map the predictor-specific columns
        column_map = {predictor: original for original, predictor in self._rename_columns.items()}
        column_map["peptide"] = "Epitope"
        column_map["prediction"] = "Score"
        results_predictor = results_predictor.rename(columns=column_map)
        required_columns = joining_list + ["Score"]
        results_predictor = results_predictor[required_columns]
        df_out = self.transform_output(results_predictor, tcrs, epitopes, pairwise, joining_list)
        return df_out
1680-
16811581class TCRGP (ARepoTCRSpecificityPrediction ):
16821582 """
16831583 Author: Jokinen et al.
0 commit comments