NetTCR categorical models

drEast · drEast · commit 4d8f00f47a4e · 2025-03-19T16:49:07.000+01:00
diff --git a/epytope/TCRSpecificityPrediction/External.py b/epytope/TCRSpecificityPrediction/External.py
@@ -1357,26 +1357,39 @@ def format_tcr_data(self, tcrs, epitopes, pairwise, **kwargs):
         for col in self._rename_columns.values():
             df_tcrs = df_tcrs[(~df_tcrs[col].isna()) & (df_tcrs[col] != 'nan') & (df_tcrs[col] != "")]
         df_tcrs = df_tcrs[(~df_tcrs["organism"].isna()) & (df_tcrs["organism"] != 'nan') & (df_tcrs["organism"] != "")]
+        df_tcrs = self.filter_epitopes(df_tcrs, **kwargs)
         df_tcrs = df_tcrs.drop_duplicates()
         df_tcrs["binder"] = 0
         return df_tcrs
 
+    def filter_epitopes(self, df, repository=None, **kwargs):
+        models = os.listdir(os.path.join(repository, "models", "nettcr_2_2_pretrained"))
+        models = [el for el in models if el not in ["negative_controls", "cv_pred_df.csv"]]
+        mask_epitope = df["peptide"].isin(models).values
+        delta = len(df) - sum(mask_epitope)
+        if delta > 0:
+            warnings.warn(f"Filtering {delta} rows as Epitope not available for categorical model")
+        df = df[mask_epitope].copy()
+        return df
+
     def save_tmp_files(self, data, **kwargs):
         tmp_folder = self.get_tmp_folder_path()
         path_in_raw = os.path.join(tmp_folder.name, f"{self.name}_raw_input.csv")
         path_in_intermediate = os.path.join(tmp_folder.name, f"{self.name}_intermediate_input.csv")
         path_in = os.path.join(tmp_folder.name, f"{self.name}_input.csv")
         model = "t.0.v.1" if "model" not in kwargs else kwargs["model"]
-        path_out = os.path.join(f"{self.repository_path}", "models/nettcr_2_2_pan", f"{model}_prediction.csv")
+        model_type = kwargs.get("model_type", "pan")
+        path_out = os.path.join(f"{self.repository_path}", f"models/nettcr_2_2_{model_type}", f"{model}_prediction.csv")
         data.to_csv(path_in_raw)
         return [path_in_raw, path_in_intermediate, path_in, path_out], tmp_folder
 
     def get_base_cmd(self, filenames, tmp_folder, interpreter=None, conda=None, cmd_prefix=None, **kwargs):
         path_utils = os.path.dirname(__file__)
-        model = "t.0.v.1" if "model" not in kwargs else kwargs["model"]
-        modeldir = f"{self.repository_path}/models/nettcr_2_2_pan"
+        model = kwargs.get("model", "t.0.v.1")
+        model_type = kwargs.get("model_type", "pan")
+        modeldir = f"{self.repository_path}/models/nettcr_2_2_{model_type}"
         cmd_reconstruct = f"{path_utils}/Utils.py nettcr {filenames[0]} {filenames[1]} {filenames[2]}"
-        cmd_predict = f"{self.repository_path}/src/predict.py --test_data {filenames[2]} --outdir {modeldir} --model_name {model} --model_type pan"
+        cmd_predict = f"{self.repository_path}/src/predict.py --test_data {filenames[2]} --outdir {modeldir} --model_name {model} --model_type {model_type}"
         return [cmd_reconstruct, cmd_predict]
 
     def run_exec_cmd(self, cmd, filenames, interpreter=None, conda=None, cmd_prefix=None, **kwargs):
@@ -1565,119 +1578,6 @@ def format_results(self, filenames, tmp_folder, tcrs, epitopes, pairwise, **kwar
         return df_out
 
 
-class NetTCR22Categorical(ARepoTCRSpecificityPrediction):
-    """
-    Author: Fynbo Jensen, Nielsen
-    Paper: https://www.biorxiv.org/content/10.1101/2023.10.12.562001v1.full
-    Repo: https://github.com/mnielLab/NetTCR-2.2
-    """
-    __name = "NetTCR-categorical"
-    __version = "2.2"
-    __tcr_length = (1, 9999)
-    __epitope_length = (1, 12)
-    __organism = "H"
-    __repo = "https://github.com/mnielLab/NetTCR-2.2.git"
-
-    _rename_columns = {
-        "VDJ_cdr3": "cdr3_beta_aa",
-        "VDJ_v_gene": "TRBV_IMGT",
-        "VDJ_j_gene": "TRBJ_IMGT",
-        "VJ_cdr3": "cdr3_alpha_aa",
-        "VJ_v_gene": "TRAV_IMGT",
-        "VJ_j_gene": "TRAJ_IMGT",
-    }
-
-    @property
-    def name(self):
-        return self.__name
-
-    @property
-    def version(self):
-        return self.__version
-
-    @property
-    def tcr_length(self):
-        return self.__tcr_length
-
-    @property
-    def epitope_length(self):
-        return self.__epitope_length
-
-    @property
-    def repo(self):
-        return self.__repo
-
-    @property
-    def organism(self):
-        return self.__organism
-
-    def format_tcr_data(self, tcrs, epitopes, pairwise, **kwargs):
-        df_tcrs = tcrs.to_pandas(rename_columns=self._rename_columns)
-        if pairwise:
-            df_tcrs = self.combine_tcrs_epitopes_pairwise(df_tcrs, epitopes)
-        else:
-            df_tcrs = self.combine_tcrs_epitopes_list(df_tcrs, epitopes)
-        df_tcrs = df_tcrs.rename(columns={"Epitope": "peptide"})
-        for col in self._rename_columns.values():
-            df_tcrs = df_tcrs[(~df_tcrs[col].isna()) & (df_tcrs[col] != 'nan') & (df_tcrs[col] != "")]
-        df_tcrs = df_tcrs[(~df_tcrs["organism"].isna()) & (df_tcrs["organism"] != 'nan') & (df_tcrs["organism"] != "")]
-        df_tcrs = df_tcrs.drop_duplicates()
-        df_tcrs["binder"] = 0
-        return df_tcrs
-    
-    def filter_epitopes(self, df, repository=None, **kwargs):
-        models = os.listdir(os.path.join(repository, "models", "nettcr_2_2_pretrained"))
-        models = [el for el in models if el not in ["negative_controls", "cv_pred_df.csv"]]
-        mask_epitope = df["Epitope"].isin(models).values
-        delta = len(df) - sum(mask_epitope)
-        if delta > 0:
-            warnings.warn(f"Filtering {delta} rows as Epitope not available for categorical model")
-        df = df[mask_epitope].copy()
-        return df
-
-    def save_tmp_files(self, data, **kwargs):
-        tmp_folder = self.get_tmp_folder_path()
-        
-        for epitope in data["Epitope"].unique():
-            path_in_raw = os.path.join(tmp_folder.name, f"{self.name}_raw_input.csv")
-            path_in_intermediate = os.path.join(tmp_folder.name, f"{self.name}_intermediate_input.csv")
-            path_in = os.path.join(tmp_folder.name, f"{self.name}_input.csv")
-            model = "t.0.v.1" if "model" not in kwargs else kwargs["model"]
-            path_out = os.path.join(tmp_folder.name, f"{model}_prediction.csv")
-            data.to_csv(path_in_raw)
-        return [path_in_raw, path_in_intermediate, path_in, path_out], tmp_folder
-
-    def get_base_cmd(self, filenames, tmp_folder, interpreter=None, conda=None, cmd_prefix=None, **kwargs):
-        path_utils = os.path.dirname(__file__)
-        model = kwargs.get("model", "t.0.v.1")
-        model_type = kwargs.get("model_type", "pretrained")
-        modeldir = f"{self.repository_path}/models/nettcr_2_2_pan"
-        cmd_reconstruct = f"{path_utils}/Utils.py nettcr {filenames[0]} {filenames[1]} {filenames[2]}"
-        cmd_predict = f"{self.repository_path}/src/predict.py --test_data {filenames[2]} --outdir {modeldir} --model_name {model} --model_type pan"
-        return [cmd_reconstruct, cmd_predict]
-
-    def run_exec_cmd(self, cmd, filenames, interpreter=None, conda=None, cmd_prefix=None, **kwargs):
-        super().run_exec_cmd(cmd[0], [None, filenames[2]], interpreter, conda, cmd_prefix, m_cmd=False, **kwargs)
-        super().run_exec_cmd(cmd[1], [None, filenames[3]], interpreter, conda, cmd_prefix, m_cmd=False, **kwargs)
-
-    def format_results(self, filenames, tmp_folder, tcrs, epitopes, pairwise, **kwargs):
-        results_predictor = pd.read_csv(filenames[3])
-        results_predictor = results_predictor.fillna("")
-        joining_list = ["VJ_cdr3", "VDJ_cdr3", "VDJ_v_gene", "VDJ_j_gene", "VJ_v_gene", "VJ_j_gene", "Epitope"]
-        results_predictor = results_predictor.rename(columns={"cdr3_beta_aa": "VDJ_cdr3",
-                                                              "TRBV_IMGT": "VDJ_v_gene",
-                                                              "TRBJ_IMGT": "VDJ_j_gene",
-                                                              "cdr3_alpha_aa": "VJ_cdr3",
-                                                              "TRAV_IMGT": "VJ_v_gene",
-                                                              "TRAJ_IMGT": "VJ_j_gene",
-                                                              "peptide": "Epitope",
-                                                              "prediction": "Score"})
-        required_columns = joining_list + ["Score"]
-        results_predictor = results_predictor[required_columns]
-        df_out = self.transform_output(results_predictor, tcrs, epitopes, pairwise, joining_list)
-        return df_out
-
-
 class TCRGP(ARepoTCRSpecificityPrediction):
     """
     Author: Jokinen et al.