MERGEFIX

drEast · drEast · commit 2347825e165e · 2025-03-18T10:52:11.000+01:00
diff --git a/epytope/Core/Result.py b/epytope/Core/Result.py
@@ -18,10 +18,6 @@
 from copy import deepcopy
 from sys import exit
 import logging
-<<<<<<< HEAD
-import math
-=======
->>>>>>> repo-b2/main
 
 
 class AResult(pandas.DataFrame, metaclass=abc.ABCMeta):
@@ -95,42 +91,25 @@ def filter_result(self, expressions, scoretype='Score'):
         """
         if isinstance(expressions, tuple):
             expressions = [expressions]
-<<<<<<< HEAD
-
-=======
             
->>>>>>> repo-b2/main
         df = deepcopy(self)
         methods = list(set(df.columns.get_level_values(1)))
         scoretypes = list(set(df.columns.get_level_values(2)))
         if scoretype not in scoretypes:
-<<<<<<< HEAD
-            raise ValueError(
-                "Specified ScoreType {} does not match ScoreTypes of data frame {}.".format(scoretype, scoretypes))
-
-=======
             raise ValueError("Specified ScoreType {} does not match ScoreTypes of data frame {}.".format(scoretype, scoretypes))
         
->>>>>>> repo-b2/main
         for expr in expressions:
             method, comp, thr = expr
             if method not in methods:
                 raise ValueError("Specified method {} does not match methods of data frame {}.".format(method, methods))
             else:
-<<<<<<< HEAD
-                filt = comp(df.xs(method, axis=1, level=1).xs(scoretype, axis=1, level=1), thr).values
-=======
                 filt = comp(df.xs(method, axis = 1, level = 1).xs(scoretype, axis = 1, level = 1), thr).values
->>>>>>> repo-b2/main
                 # Only keep rows which contain values fulfilling the comparators logic in the specified method
                 keep_row = [bool.any() for bool in filt]
                 df = df.loc[keep_row]
 
         return EpitopePredictionResult(df)
-<<<<<<< HEAD
-=======
         
->>>>>>> repo-b2/main
 
     def merge_results(self, others):
         """
@@ -145,23 +124,14 @@ def merge_results(self, others):
 
         if type(others) == type(self):
             others = [others]
-<<<<<<< HEAD
-
-=======
         
->>>>>>> repo-b2/main
         # Concatenates self and to be merged dataframe(s)
         for other in others:
             df = pandas.concat([df, other], axis=1)
 
         # Merge result of multiple predictors in others per allele
-<<<<<<< HEAD
-        df_merged = pandas.concat([group[1] for group in df.groupby(level=[0, 1], axis=1)], axis=1)
-
-=======
         df_merged = pandas.concat([group[1] for group in df.groupby(level=[0,1], axis=1)], axis=1)
     
->>>>>>> repo-b2/main
         return EpitopePredictionResult(df_merged)
 
     def from_dict(d, peps, method):
@@ -176,28 +146,17 @@ def from_dict(d, peps, method):
         """
         scoreType = numpy.asarray([list(m.keys()) for m in [metrics for a, metrics in d.items()]]).flatten()
         alleles = numpy.asarray([numpy.repeat(a, len(set(scoreType))) for a in d]).flatten()
-<<<<<<< HEAD
-
-        meth = numpy.repeat(method, len(scoreType))
-        multi_cols = pandas.MultiIndex.from_arrays([alleles, meth, scoreType], names=["Allele", "Method", "ScoreType"])
-        df = pandas.DataFrame(float(0), index=pandas.Index(peps), columns=multi_cols)
-=======
         
         meth = numpy.repeat(method, len(scoreType))
         multi_cols = pandas.MultiIndex.from_arrays([alleles, meth, scoreType], names=["Allele", "Method", "ScoreType"])
         df = pandas.DataFrame(float(0),index=pandas.Index(peps), columns=multi_cols)
->>>>>>> repo-b2/main
         df.index.name = 'Peptides'
         # Fill DataFrame
         for allele, metrics in d.items():
             for metric, pep_scores in metrics.items():
                 for pep, score in pep_scores.items():
                     df[allele][method][metric][pep] = score
-<<<<<<< HEAD
-
-=======
         
->>>>>>> repo-b2/main
         return EpitopePredictionResult(df)
 
 
@@ -207,19 +166,11 @@ class Distance2SelfResult(AResult):
     """
 
     def filter_result(self, expressions):
-<<<<<<< HEAD
-        # TODO: has to be implemented
-        pass
-
-    def merge_results(self, others):
-        # TODO: has to be implemented
-=======
         #TODO: has to be implemented
         pass
 
     def merge_results(self, others):
         #TODO: has to be implemented
->>>>>>> repo-b2/main
         pass
 
 
@@ -261,22 +212,14 @@ def filter_result(self, expressions):
         if isinstance(expressions, tuple):
             expressions = [expressions]
 
-<<<<<<< HEAD
-        # builde logical expression
-=======
         #builde logical expression
->>>>>>> repo-b2/main
         masks = [list(comp(self.loc[:, method], thr)) for method, comp, thr in expressions]
 
         if len(masks) > 1:
             masks = numpy.logical_and(*masks)
         else:
             masks = masks[0]
-<<<<<<< HEAD
-        # apply to all rows
-=======
         #apply to all rows
->>>>>>> repo-b2/main
 
         return CleavageSitePredictionResult(self.loc[masks, :])
 
@@ -296,11 +239,7 @@ def merge_results(self, others):
 
         for i in range(len(others)):
             o = others[i]
-<<<<<<< HEAD
-            df1a, df2a = df.align(o, )
-=======
             df1a, df2a = df.align(o,)
->>>>>>> repo-b2/main
 
             o_diff = o.index.difference(df.index)
             d_diff = df.index.difference(o.index)
@@ -324,11 +263,7 @@ def merge_results(self, others):
             df1 = df1a.fillna(0)
             df2 = df2a.fillna(0)
 
-<<<<<<< HEAD
-            df_merged = df1 + df2
-=======
             df_merged = df1+df2
->>>>>>> repo-b2/main
             false_zero = df_merged == 0
             zero = true_zero & false_zero
 
@@ -377,11 +312,7 @@ def filter_result(self, expressions):
             masks = numpy.logical_and(*masks)
         else:
             masks = masks[0]
-<<<<<<< HEAD
-        # apply to all rows
-=======
         #apply to all rows
->>>>>>> repo-b2/main
         return CleavageFragmentPredictionResult(self.loc[masks, :])
 
     def merge_results(self, others):
@@ -398,11 +329,7 @@ def merge_results(self, others):
         if type(others) == type(self):
             others = [others]
 
-<<<<<<< HEAD
-        return CleavageFragmentPredictionResult(pandas.concat([self] + others, axis=1))
-=======
         return CleavageFragmentPredictionResult(pandas.concat([self]+others, axis=1))
->>>>>>> repo-b2/main
 
 
 class TAPPredictionResult(AResult):
@@ -442,11 +369,7 @@ def filter_result(self, expressions):
             masks = numpy.logical_and(*masks)
         else:
             masks = masks[0]
-<<<<<<< HEAD
-        # apply to all rows
-=======
         #apply to all rows
->>>>>>> repo-b2/main
 
         return TAPPredictionResult(self.loc[masks, :])
 
@@ -463,7 +386,6 @@ def merge_results(self, others):
         if type(others) == type(self):
             others = [others]
 
-<<<<<<< HEAD
         return TAPPredictionResult(pandas.concat([self] + others, axis=1))
 
 
@@ -551,6 +473,3 @@ def merge_results(self, others):
             tcr.columns = pandas.MultiIndex.from_tuples(tuples)
             result = pandas.concat([tcr, result], axis=1)
         return result
-=======
-        return TAPPredictionResult(pandas.concat([self]+others, axis=1))
->>>>>>> repo-b2/main
diff --git a/epytope/IO/UniProtAdapter.py b/epytope/IO/UniProtAdapter.py
@@ -12,16 +12,9 @@
 import bisect
 
 from Bio import SeqIO
-<<<<<<< HEAD
-from epytope.Core.Base import deprecated
 
 
 class UniProtDB:
-    @deprecated  # TODO: refactor ... function based on old code
-=======
-
-class UniProtDB:
->>>>>>> repo-b2/main
     def __init__(self, name='fdb'):
         """
         UniProtDB class to give quick access to entries (fast exact match searches) and convenient ways to produce
@@ -42,11 +35,7 @@ def __init__(self, name='fdb'):
         """
         self.name = name
         self.collection = {}  # all the biopython seq records in a dict keyed by the id of the record
-<<<<<<< HEAD
-        self.searchstring = ''  # all sequences concatenated with a '#'
-=======
         self.search_string = ''  # all sequences concatenated with a '#'
->>>>>>> repo-b2/main
         self.accs = list()  # all accessions in respective order to searchstring
         self.idx = list()  # all indices of starting strings in the searchstring in respective order
 
@@ -61,11 +50,7 @@ def read_seqs(self, sequence_file):
         recs = sequence_file
         if not isinstance(sequence_file, dict) and not isinstance(sequence_file, list):
             try:
-<<<<<<< HEAD
-                with open(sequence_file, 'rb') as f:
-=======
                 with open(sequence_file, 'r') as f:
->>>>>>> repo-b2/main
                     if sequence_file.endswith('.fa') or sequence_file.endswith('.fasta'):
                         recs = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
                     else:  # assume it is a dat file
@@ -77,11 +62,7 @@ def read_seqs(self, sequence_file):
             recs = SeqIO.to_dict(sequence_file)
         if recs:
             self.collection.update(recs)
-<<<<<<< HEAD
-            self.searchstring = '#'.join([str(x.seq) for x in self.collection.values()]).decode('ascii')
-=======
             self.search_string = '#'.join([str(x.seq) for x in self.collection.values()])#.decode('ascii')
->>>>>>> repo-b2/main
             self.accs = list(self.collection.keys())
             self.idx = list()
             self.idx.append(0)
@@ -106,11 +87,7 @@ def exists(self, seq):
         :return: True, if it is found somewhere, False otherwise
         """
         if isinstance(seq, str):
-<<<<<<< HEAD
-            index = self.searchstring.find(seq)
-=======
             index = self.search_string.find(seq)
->>>>>>> repo-b2/main
             if index >= 0:
                 return True
             else:
@@ -127,11 +104,7 @@ def search(self, seq):
         """
         if isinstance(seq, str):
             ids = 'null'
-<<<<<<< HEAD
-            index = self.searchstring.find(seq)
-=======
             index = self.search_string.find(seq)
->>>>>>> repo-b2/main
             if index >= 0:
                 j = bisect.bisect(self.idx, index) - 1
                 ids = self.accs[j]
@@ -141,11 +114,7 @@ def search(self, seq):
             for i in seq:
                 ids.append('null')
             for i, v in enumerate(seq):
-<<<<<<< HEAD
-                index = self.searchstring.find(v)
-=======
                 index = self.search_string.find(v)
->>>>>>> repo-b2/main
                 if index >= 0:
                     j = bisect.bisect(self.idx, index) - 1
                     ids[i] = self.accs[j]
@@ -164,13 +133,8 @@ def search_all(self, seq):
             ids = 'null'
             index = 0
             searchstring_length = len(seq)
-<<<<<<< HEAD
-            while index < len(self.searchstring):
-                index = self.searchstring.find(seq, index)
-=======
             while index < len(self.search_string):
                 index = self.search_string.find(seq, index)
->>>>>>> repo-b2/main
                 if index == -1:
                     break
                 j = bisect.bisect(self.idx, index) - 1
@@ -187,13 +151,8 @@ def search_all(self, seq):
             for i, v in enumerate(seq):
                 index = 0
                 searchstring_length = len(v)
-<<<<<<< HEAD
-                while index < len(self.searchstring):
-                    index = self.searchstring.find(v, index)
-=======
                 while index < len(self.search_string):
                     index = self.search_string.find(v, index)
->>>>>>> repo-b2/main
                     if index == -1:
                         break
                     j = bisect.bisect(self.idx, index) - 1
@@ -203,4 +162,4 @@ def search_all(self, seq):
                         ids[i] = ids[i] + ',' + self.accs[j]
                     index += searchstring_length
             return dict(zip(seq, ids))
-        return None
+        return None
diff --git a/epytope/TCRSpecificityPrediction/External.py b/epytope/TCRSpecificityPrediction/External.py
@@ -790,6 +790,11 @@ def format_tcr_data(self, tcrs, epitopes, pairwise, **kwargs):
         df_tcrs = self.filter_by_length(df_tcrs, None, "CDR3b", "epitope")
         df_tcrs = df_tcrs.drop_duplicates()
         df_tcrs = df_tcrs[(~df_tcrs["CDR3b"].isna()) & (df_tcrs["CDR3b"] != "")]
+
+        if len(df_tcrs) % 50 == 0:
+            # If the length of samples is divisible by the batch size, something weird happens
+            df_tcrs.loc[df_tcrs.index.max()+1] = df_tcrs.iloc[-1]
+
         return df_tcrs
 
     def save_tmp_files(self, data, **kwargs):
@@ -811,6 +816,7 @@ def get_base_cmd(self, filenames, tmp_folder, interpreter=None, conda=None, cmd_
 
     def format_results(self, filenames, tmp_folder, tcrs, epitopes, pairwise, **kwargs):
         results_predictor = pd.read_csv(filenames[1], sep='\t', names=["VDJ_cdr3", "Epitope", "Score"], header=None)
+        results_predictor = results_predictor.drop_duplicates()
         joining_list = ["Epitope", "VDJ_cdr3"]
         results_predictor = results_predictor[joining_list + ["Score"]]
         df_out = self.transform_output(results_predictor, tcrs, epitopes, pairwise, joining_list)
diff --git a/epytope/TCRSpecificityPrediction/ML.py b/epytope/TCRSpecificityPrediction/ML.py
@@ -341,9 +341,15 @@ def organism(self):
 
     def format_tcr_data(self, tcrs, epitopes, pairwise, **kwargs):
         df_tcrs = tcrs.to_pandas()
+        if df_tcrs["organism"].unique()[0] == "MusMusculus" and df_tcrs["organism"].nunique() == 1:
+            self.organism_in = "mouse"
+        else:
+            self.organism_in = "human"
+
         df_tcrs["VDJ_v_gene"] = df_tcrs["VDJ_v_gene"].apply(lambda x: x if re.search(r"\*\d+$", x) else x + "*01")
         df_tcrs["VDJ_j_gene"] = df_tcrs["VDJ_j_gene"].apply(lambda x: x if re.search(r"\*\d+$", x) else x + "*01")
-        df_tcrs = df_tcrs[df_tcrs["VDJ_v_gene"].isin(self._v_regions) & df_tcrs["VDJ_j_gene"].isin(self._j_regions)]
+        if self.organism_in == "human":
+            df_tcrs = df_tcrs[df_tcrs["VDJ_v_gene"].isin(self._v_regions) & df_tcrs["VDJ_j_gene"].isin(self._j_regions)]
         df_tcrs = df_tcrs[(~df_tcrs["VDJ_cdr3"].isna()) & (df_tcrs["VDJ_cdr3"] != "")]
 
         df_tcrs_unique = df_tcrs.drop_duplicates().copy()
@@ -398,7 +404,11 @@ def save_tmp_files(self, data, **kwargs):
     def get_base_cmd(self, filenames, tmp_folder, interpreter=None, conda=None, cmd_prefix=None, **kwargs):
         path_module = self.get_package_dir("paccmann_tcr", interpreter, conda, cmd_prefix).split(os.sep)[:-1] + [".."]
 
-        path_imgt = os.sep.join(path_module + ["datasets", "imgt"])
+        if self.organism_in == "mouse":
+            path_imgt = os.sep.join(path_module + ["datasets", "imgt_mouse"])
+        else:
+            path_imgt = os.sep.join(path_module + ["datasets", "imgt"])
+
         if not os.path.exists(os.sep.join([path_imgt, "V_segment_sequences.fasta"])) or \
                 not os.path.exists(os.sep.join([path_imgt, "J_segment_sequences.fasta"])):
             raise NotADirectoryError(f"Please download the V and J segment files from "
diff --git a/epytope/TCRSpecificityPrediction/Utils.py b/epytope/TCRSpecificityPrediction/Utils.py