Include top-k recall metrics in training pipeline, and bump version

amorehead · amorehead · commit 3d8c55aee270 · 2021-12-21T16:38:45.000-06:00
diff --git a/project/utils/deepinteract_modules.py b/project/utils/deepinteract_modules.py
@@ -19,7 +19,7 @@
 from project.utils.deepinteract_constants import FEATURE_INDICES, RESIDUE_COUNT_LIMIT, NODE_COUNT_LIMIT
 from project.utils.deepinteract_utils import construct_interact_tensor, glorot_orthogonal, get_geo_feats_from_edges, \
     construct_subsequenced_interact_tensors, insert_interact_tensor_logits, \
-    remove_padding, remove_subsequenced_input_padding, calculate_top_k_prec, extract_object
+    remove_padding, remove_subsequenced_input_padding, calculate_top_k_prec, calculate_top_k_recall, extract_object
 from project.utils.graph_utils import src_dot_dst, scaling, imp_exp_attn, out_edge_features, exp
 from project.utils.vision_modules import DeepLabV3Plus
 
@@ -1926,17 +1926,17 @@ def validation_step(self, batch, batch_idx):
 
         # Calculate top-k metrics
         calculating_l_by_n_metrics = True
-        # Log only first 50 validation top-k precisions to limit algorithmic complexity due to sorting (if requested)
+        # Log only first 50 validation top-k metrics to limit algorithmic complexity due to sorting (if requested)
         # calculating_l_by_n_metrics = batch_idx in [i for i in range(50)]
         if calculating_l_by_n_metrics:
             l = graph1.num_nodes() + graph2.num_nodes()
             sorted_pred_indices = torch.argsort(preds[:, 1], descending=True)
             top_10_prec = calculate_top_k_prec(sorted_pred_indices, labels, k=10)
-            top_25_prec = calculate_top_k_prec(sorted_pred_indices, labels, k=25)
-            top_50_prec = calculate_top_k_prec(sorted_pred_indices, labels, k=50) if l > 50 else 0.0  # Catch short seq.
             top_l_by_10_prec = calculate_top_k_prec(sorted_pred_indices, labels, k=(l // 10))
             top_l_by_5_prec = calculate_top_k_prec(sorted_pred_indices, labels, k=(l // 5))
-            top_l_prec = calculate_top_k_prec(sorted_pred_indices, labels, k=l)
+            top_l_recall = calculate_top_k_recall(sorted_pred_indices, labels, k=l)
+            top_l_by_2_recall = calculate_top_k_recall(sorted_pred_indices, labels, k=(l // 2))
+            top_l_by_5_recall = calculate_top_k_recall(sorted_pred_indices, labels, k=(l // 5))
 
         # Calculate the protein interface prediction (PICP) loss along with additional PIP metrics
         loss = self.loss_fn(sampled_logits, labels)  # Calculate loss of a single complex
@@ -1951,11 +1951,11 @@ def validation_step(self, batch, batch_idx):
         self.log(f'val_ce', loss, sync_dist=True)
         if calculating_l_by_n_metrics:
             self.log('val_top_10_prec', top_10_prec, sync_dist=True)
-            self.log('val_top_25_prec', top_25_prec, sync_dist=True)
-            self.log('val_top_50_prec', top_50_prec, sync_dist=True)
             self.log('val_top_l_by_10_prec', top_l_by_10_prec, sync_dist=True)
             self.log('val_top_l_by_5_prec', top_l_by_5_prec, sync_dist=True)
-            self.log('val_top_l_prec', top_l_prec, sync_dist=True)
+            self.log('val_top_l_recall', top_l_recall, sync_dist=True)
+            self.log('val_top_l_by_2_recall', top_l_by_2_recall, sync_dist=True)
+            self.log('val_top_l_by_5_recall', top_l_by_5_recall, sync_dist=True)
 
         return {
             'loss': loss,
@@ -2033,11 +2033,11 @@ def test_step(self, batch, batch_idx):
         l = min(graph1.num_nodes(), graph2.num_nodes())  # Use the smallest length of the two chains as our denominator
         sorted_pred_indices = torch.argsort(preds[:, 1], descending=True)
         top_10_prec = calculate_top_k_prec(sorted_pred_indices, labels, k=10)
-        top_25_prec = calculate_top_k_prec(sorted_pred_indices, labels, k=25)
-        top_50_prec = calculate_top_k_prec(sorted_pred_indices, labels, k=50) if l > 50 else 0.0  # Catch short seq.
         top_l_by_10_prec = calculate_top_k_prec(sorted_pred_indices, labels, k=(l // 10))
         top_l_by_5_prec = calculate_top_k_prec(sorted_pred_indices, labels, k=(l // 5))
-        top_l_prec = calculate_top_k_prec(sorted_pred_indices, labels, k=l)
+        top_l_recall = calculate_top_k_recall(sorted_pred_indices, labels, k=l)
+        top_l_by_2_recall = calculate_top_k_recall(sorted_pred_indices, labels, k=(l // 2))
+        top_l_by_5_recall = calculate_top_k_recall(sorted_pred_indices, labels, k=(l // 5))
 
         # Calculate the protein interface prediction (PICP) loss along with additional PIP metrics
         loss = self.loss_fn(sampled_logits, labels)  # Calculate loss of a single complex
@@ -2062,11 +2062,11 @@ def test_step(self, batch, batch_idx):
         # Log test step metric(s)
         self.log(f'test_ce', loss, sync_dist=True)
         self.log('test_top_10_prec', top_10_prec, sync_dist=True)
-        self.log('test_top_25_prec', top_25_prec, sync_dist=True)
-        self.log('test_top_50_prec', top_50_prec, sync_dist=True)
         self.log('test_top_l_by_10_prec', top_l_by_10_prec, sync_dist=True)
         self.log('test_top_l_by_5_prec', top_l_by_5_prec, sync_dist=True)
-        self.log('test_top_l_prec', top_l_prec, sync_dist=True)
+        self.log('test_top_l_recall', top_l_recall, sync_dist=True)
+        self.log('test_top_l_by_2_recall', top_l_by_2_recall, sync_dist=True)
+        self.log('test_top_l_by_5_recall', top_l_by_5_recall, sync_dist=True)
 
         return {
             'loss': loss,
@@ -2082,6 +2082,9 @@ def test_step(self, batch, batch_idx):
             'top_10_prec': top_10_prec,
             'top_l_by_10_prec': top_l_by_10_prec,
             'top_l_by_5_prec': top_l_by_5_prec,
+            'top_l_recall': top_l_recall,
+            'top_l_by_2_recall': top_l_by_2_recall,
+            'top_l_by_5_recall': top_l_by_5_recall,
             'target': filepaths[0].split(os.sep)[-1][:4]
         }
 
@@ -2112,17 +2115,20 @@ def test_epoch_end(self, outputs: pl.utilities.types.EPOCH_OUTPUT):
             test_preds_rounded = [(output_dict['test_preds_rounded']) for output_dict in outputs]
             test_labels = [output_dict['test_labels'] for output_dict in outputs]
 
-        # Write out test top-k precision results to CSV
-        prec_data = {
+        # Write out test top-k metric results to CSV
+        metrics_data = {
             'top_10_prec': [extract_object(output_dict['top_10_prec']) for output_dict in outputs],
             'top_l_by_10_prec': [extract_object(output_dict['top_l_by_10_prec']) for output_dict in outputs],
             'top_l_by_5_prec': [extract_object(output_dict['top_l_by_5_prec']) for output_dict in outputs],
+            'top_l_recall': [extract_object(output_dict['top_l_recall']) for output_dict in outputs],
+            'top_l_by_2_recall': [extract_object(output_dict['top_l_by_2_recall']) for output_dict in outputs],
+            'top_l_by_5_recall': [extract_object(output_dict['top_l_by_5_recall']) for output_dict in outputs],
             'target': [extract_object(output_dict['target']) for output_dict in outputs],
         }
-        prec_df = pd.DataFrame(data=prec_data)
-        prec_df_name_prefix = 'casp_capri' if self.testing_with_casp_capri else 'dips_plus_test'
-        prec_df_name = prec_df_name_prefix + '_top_prec.csv'
-        prec_df.to_csv(prec_df_name)
+        metrics_df = pd.DataFrame(data=metrics_data)
+        metrics_df_name_prefix = 'casp_capri' if self.testing_with_casp_capri else 'dips_plus_test'
+        metrics_df_name = metrics_df_name_prefix + '_top_metrics.csv'
+        metrics_df.to_csv(metrics_df_name)
 
         if not self.testing_with_casp_capri:  # Testing with DIPS-Plus
             # Filter out all but the first 55 test predictions and labels to reduce storage requirements
diff --git a/project/utils/deepinteract_utils.py b/project/utils/deepinteract_utils.py
@@ -29,12 +29,11 @@
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 from biopandas.pdb import PandasPdb
-from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
-
 from project.utils.deepinteract_constants import FEAT_COLS, ALLOWABLE_FEATS, D3TO1
 from project.utils.dips_plus_utils import postprocess_pruned_pairs, impute_postprocessed_missing_feature_values
 from project.utils.graph_utils import prot_df_to_dgl_graph_feats
 from project.utils.protein_feature_utils import GeometricProteinFeatures
+from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
 
 try:
     from types import SliceType
@@ -328,7 +327,7 @@ def substitute_missing_atoms(struct_df: pd.DataFrame, all_atom_struct_df: pd.Dat
                     raise NotImplementedError('Error: A missing atom was found, and it is not possible to process it.')
 
                 # Choose a replacement for the missing atom
-                available_atom_keys = set(atom_names) - {missing_atom_key}
+                available_atom_keys = set(atom_names) - {missing_atom_key, 'CA'}  # Disallow CA atoms from being a sub
                 replacement_atom_name = available_atom_keys.pop()  # Choose the first available atom as the substitute
                 replacement_atom = ca_atom_support_atoms[ca_atom_support_atoms['atom_name'] == replacement_atom_name]
                 logging.info(f'Found a missing {missing_atom_key} atom for row number {ca_atom_idx} -'
@@ -973,6 +972,17 @@ def calculate_top_k_prec(sorted_pred_indices: torch.Tensor, labels: torch.Tensor
     return prec
 
 
+def calculate_top_k_recall(sorted_pred_indices: torch.Tensor, labels: torch.Tensor, k: int):
+    """Calculate the top-k interaction recall (i.e., positives in the top-k predictions / all positives)."""
+    num_pos_labels = torch.sum(labels).item()
+    if num_pos_labels == 0:
+        return 0.0  # No positive labels exist, so define recall as zero rather than dividing by zero
+    selected_pred_indices = sorted_pred_indices[:k]  # Slicing tolerates k exceeding the number of predictions
+    num_correct = torch.sum(labels[selected_pred_indices]).item()
+    recall = num_correct / num_pos_labels
+    return recall
+
+
 def extract_object(obj: any):
     """If incoming object is of type torch.Tensor, convert it to a NumPy array. If it is a scalar, simply return it."""
     return obj.cpu().numpy() if type(obj) == torch.Tensor else obj
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 
 setup(
     name='DeepInteract',
-    version='1.0.7',
+    version='1.0.8',
     description='A geometric deep learning pipeline for predicting protein interface contacts.',
     author='Alex Morehead',
     author_email='acmwhb@umsystem.edu',