Skip to content

Commit 4d8f00f

Browse files
committed
NetTCR categorical models
1 parent f0f360e commit 4d8f00f

File tree

1 file changed

+17
-117
lines changed

1 file changed

+17
-117
lines changed

epytope/TCRSpecificityPrediction/External.py

Lines changed: 17 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -1357,26 +1357,39 @@ def format_tcr_data(self, tcrs, epitopes, pairwise, **kwargs):
13571357
for col in self._rename_columns.values():
13581358
df_tcrs = df_tcrs[(~df_tcrs[col].isna()) & (df_tcrs[col] != 'nan') & (df_tcrs[col] != "")]
13591359
df_tcrs = df_tcrs[(~df_tcrs["organism"].isna()) & (df_tcrs["organism"] != 'nan') & (df_tcrs["organism"] != "")]
1360+
df_tcrs = self.filter_epitopes(df_tcrs, **kwargs)
13601361
df_tcrs = df_tcrs.drop_duplicates()
13611362
df_tcrs["binder"] = 0
13621363
return df_tcrs
13631364

1365+
def filter_epitopes(self, df, repository=None, **kwargs):
    """
    Keep only the rows whose peptide has an entry in the pretrained NetTCR-2.2 model folder.

    The entries of ``<repository>/models/nettcr_2_2_pretrained`` are listed and every entry
    except ``negative_controls`` and ``cv_pred_df.csv`` is treated as an available peptide.
    A warning reporting the number of removed rows is issued when at least one row is dropped.

    :param df: data frame with a ``peptide`` column
    :param repository: path of the NetTCR-2.2 repository; when None, ``self.repository_path``
        is used instead
    :param kwargs: accepted for call compatibility and ignored
    :return: ``df`` itself when nothing is removed, otherwise a filtered copy
    :raises TypeError: if neither ``repository`` nor ``self.repository_path`` provides a path
    """
    if repository is None:
        # The sibling methods of this class build their paths from self.repository_path,
        # so use the same source when the caller did not forward ``repository``.
        repository = getattr(self, "repository_path", None)
    if repository is None:
        raise TypeError("filter_epitopes requires a repository path, but none was provided")

    # NOTE(review): the folder is always nettcr_2_2_pretrained, independent of any
    # ``model_type`` in kwargs - confirm this is intended when the pan model is used.
    model_dir = os.path.join(repository, "models", "nettcr_2_2_pretrained")
    # Entries of the model folder that are skipped when collecting the available peptides.
    excluded = {"negative_controls", "cv_pred_df.csv"}
    models = [entry for entry in os.listdir(model_dir) if entry not in excluded]

    mask_epitope = df["peptide"].isin(models).values
    delta = len(df) - int(mask_epitope.sum())
    if delta > 0:
        warnings.warn(f"Filtering {delta} rows as Epitope not available for categorical model")
        df = df[mask_epitope].copy()
    return df
1374+
13641375
def save_tmp_files(self, data, **kwargs):
    """
    Write the raw input table into a temporary folder and return all paths used by a run.

    :param data: object exposing ``to_csv``; it is written to the raw input path
    :param kwargs: ``model`` (default ``"t.0.v.1"``) names the prediction file and
        ``model_type`` (default ``"pan"``) selects the ``models/nettcr_2_2_<model_type>`` folder
    :return: tuple of ``[raw input path, intermediate input path, input path, output path]``
        and the temporary folder object
    """
    tmp_folder = self.get_tmp_folder_path()
    # Read both options the same way get_base_cmd does, so the defaults cannot drift apart.
    model = kwargs.get("model", "t.0.v.1")
    model_type = kwargs.get("model_type", "pan")

    path_in_raw = os.path.join(tmp_folder.name, f"{self.name}_raw_input.csv")
    path_in_intermediate = os.path.join(tmp_folder.name, f"{self.name}_intermediate_input.csv")
    path_in = os.path.join(tmp_folder.name, f"{self.name}_input.csv")
    # The prediction file is located inside the repository's model folder, not the temporary folder.
    path_out = os.path.join(f"{self.repository_path}", f"models/nettcr_2_2_{model_type}", f"{model}_prediction.csv")

    data.to_csv(path_in_raw)
    return [path_in_raw, path_in_intermediate, path_in, path_out], tmp_folder
13731385

13741386
def get_base_cmd(self, filenames, tmp_folder, interpreter=None, conda=None, cmd_prefix=None, **kwargs):
    """
    Assemble the two command strings of a NetTCR-2.2 run.

    The first command calls ``Utils.py nettcr`` (located next to this module) on the first
    three entries of ``filenames``; the second calls the repository's ``src/predict.py`` on
    the third entry.

    :param filenames: sequence whose entries 0, 1 and 2 are inserted into the commands
    :param tmp_folder: not used by this method
    :param interpreter: not used by this method
    :param conda: not used by this method
    :param cmd_prefix: not used by this method
    :param kwargs: ``model`` (default ``"t.0.v.1"``) and ``model_type`` (default ``"pan"``)
    :return: list ``[reconstruct command, predict command]``
    """
    utils_dir = os.path.dirname(__file__)
    model_name = kwargs.get("model", "t.0.v.1")
    model_type = kwargs.get("model_type", "pan")
    # Folder handed to predict.py as --outdir; it depends on the selected model type.
    out_dir = f"{self.repository_path}/models/nettcr_2_2_{model_type}"
    return [
        f"{utils_dir}/Utils.py nettcr {filenames[0]} {filenames[1]} {filenames[2]}",
        f"{self.repository_path}/src/predict.py --test_data {filenames[2]} --outdir {out_dir} --model_name {model_name} --model_type {model_type}",
    ]
13811394

13821395
def run_exec_cmd(self, cmd, filenames, interpreter=None, conda=None, cmd_prefix=None, **kwargs):
@@ -1565,119 +1578,6 @@ def format_results(self, filenames, tmp_folder, tcrs, epitopes, pairwise, **kwar
15651578
return df_out
15661579

15671580

1568-
class NetTCR22Categorical(ARepoTCRSpecificityPrediction):
1569-
"""
1570-
Author: Fynbo Jensen, Nielsen
1571-
Paper: https://www.biorxiv.org/content/10.1101/2023.10.12.562001v1.full
1572-
Repo: https://github.com/mnielLab/NetTCR-2.2
1573-
"""
1574-
__name = "NetTCR-categorical"
1575-
__version = "2.2"
1576-
__tcr_length = (1, 9999)
1577-
__epitope_length = (1, 12)
1578-
__organism = "H"
1579-
__repo = "https://github.com/mnielLab/NetTCR-2.2.git"
1580-
1581-
_rename_columns = {
1582-
"VDJ_cdr3": "cdr3_beta_aa",
1583-
"VDJ_v_gene": "TRBV_IMGT",
1584-
"VDJ_j_gene": "TRBJ_IMGT",
1585-
"VJ_cdr3": "cdr3_alpha_aa",
1586-
"VJ_v_gene": "TRAV_IMGT",
1587-
"VJ_j_gene": "TRAJ_IMGT",
1588-
}
1589-
1590-
@property
1591-
def name(self):
1592-
return self.__name
1593-
1594-
@property
1595-
def version(self):
1596-
return self.__version
1597-
1598-
@property
1599-
def tcr_length(self):
1600-
return self.__tcr_length
1601-
1602-
@property
1603-
def epitope_length(self):
1604-
return self.__epitope_length
1605-
1606-
@property
1607-
def repo(self):
1608-
return self.__repo
1609-
1610-
@property
1611-
def organism(self):
1612-
return self.__organism
1613-
1614-
def format_tcr_data(self, tcrs, epitopes, pairwise, **kwargs):
1615-
df_tcrs = tcrs.to_pandas(rename_columns=self._rename_columns)
1616-
if pairwise:
1617-
df_tcrs = self.combine_tcrs_epitopes_pairwise(df_tcrs, epitopes)
1618-
else:
1619-
df_tcrs = self.combine_tcrs_epitopes_list(df_tcrs, epitopes)
1620-
df_tcrs = df_tcrs.rename(columns={"Epitope": "peptide"})
1621-
for col in self._rename_columns.values():
1622-
df_tcrs = df_tcrs[(~df_tcrs[col].isna()) & (df_tcrs[col] != 'nan') & (df_tcrs[col] != "")]
1623-
df_tcrs = df_tcrs[(~df_tcrs["organism"].isna()) & (df_tcrs["organism"] != 'nan') & (df_tcrs["organism"] != "")]
1624-
df_tcrs = df_tcrs.drop_duplicates()
1625-
df_tcrs["binder"] = 0
1626-
return df_tcrs
1627-
1628-
def filter_epitopes(self, df, repository=None, **kwargs):
1629-
models = os.listdir(os.path.join(repository, "models", "nettcr_2_2_pretrained"))
1630-
models = [el for el in models if el not in ["negative_controls", "cv_pred_df.csv"]]
1631-
mask_epitope = df["Epitope"].isin(models).values
1632-
delta = len(df) - sum(mask_epitope)
1633-
if delta > 0:
1634-
warnings.warn(f"Filtering {delta} rows as Epitope not available for categorical model")
1635-
df = df[mask_epitope].copy()
1636-
return df
1637-
1638-
def save_tmp_files(self, data, **kwargs):
1639-
tmp_folder = self.get_tmp_folder_path()
1640-
1641-
for epitope in data["Epitope"].unique():
1642-
path_in_raw = os.path.join(tmp_folder.name, f"{self.name}_raw_input.csv")
1643-
path_in_intermediate = os.path.join(tmp_folder.name, f"{self.name}_intermediate_input.csv")
1644-
path_in = os.path.join(tmp_folder.name, f"{self.name}_input.csv")
1645-
model = "t.0.v.1" if "model" not in kwargs else kwargs["model"]
1646-
path_out = os.path.join(tmp_folder.name, f"{model}_prediction.csv")
1647-
data.to_csv(path_in_raw)
1648-
return [path_in_raw, path_in_intermediate, path_in, path_out], tmp_folder
1649-
1650-
def get_base_cmd(self, filenames, tmp_folder, interpreter=None, conda=None, cmd_prefix=None, **kwargs):
1651-
path_utils = os.path.dirname(__file__)
1652-
model = kwargs.get("model", "t.0.v.1")
1653-
model_type = kwargs.get("model_type", "pretrained")
1654-
modeldir = f"{self.repository_path}/models/nettcr_2_2_pan"
1655-
cmd_reconstruct = f"{path_utils}/Utils.py nettcr {filenames[0]} {filenames[1]} {filenames[2]}"
1656-
cmd_predict = f"{self.repository_path}/src/predict.py --test_data {filenames[2]} --outdir {modeldir} --model_name {model} --model_type pan"
1657-
return [cmd_reconstruct, cmd_predict]
1658-
1659-
def run_exec_cmd(self, cmd, filenames, interpreter=None, conda=None, cmd_prefix=None, **kwargs):
1660-
super().run_exec_cmd(cmd[0], [None, filenames[2]], interpreter, conda, cmd_prefix, m_cmd=False, **kwargs)
1661-
super().run_exec_cmd(cmd[1], [None, filenames[3]], interpreter, conda, cmd_prefix, m_cmd=False, **kwargs)
1662-
1663-
def format_results(self, filenames, tmp_folder, tcrs, epitopes, pairwise, **kwargs):
1664-
results_predictor = pd.read_csv(filenames[3])
1665-
results_predictor = results_predictor.fillna("")
1666-
joining_list = ["VJ_cdr3", "VDJ_cdr3", "VDJ_v_gene", "VDJ_j_gene", "VJ_v_gene", "VJ_j_gene", "Epitope"]
1667-
results_predictor = results_predictor.rename(columns={"cdr3_beta_aa": "VDJ_cdr3",
1668-
"TRBV_IMGT": "VDJ_v_gene",
1669-
"TRBJ_IMGT": "VDJ_j_gene",
1670-
"cdr3_alpha_aa": "VJ_cdr3",
1671-
"TRAV_IMGT": "VJ_v_gene",
1672-
"TRAJ_IMGT": "VJ_j_gene",
1673-
"peptide": "Epitope",
1674-
"prediction": "Score"})
1675-
required_columns = joining_list + ["Score"]
1676-
results_predictor = results_predictor[required_columns]
1677-
df_out = self.transform_output(results_predictor, tcrs, epitopes, pairwise, joining_list)
1678-
return df_out
1679-
1680-
16811581
class TCRGP(ARepoTCRSpecificityPrediction):
16821582
"""
16831583
Author: Jokinen et al.

0 commit comments

Comments
 (0)