add caching

terryyylim · terryyylim · commit b40a0a8aa349 · 2019-06-18T16:31:12.000-04:00
diff --git a/recommendations/create_prediction.py b/recommendations/create_prediction.py
@@ -1,7 +1,6 @@
 from typing import List
 from typing import Optional
 
-import os
 import click
 import logging
 import pandas as pd
@@ -26,7 +25,11 @@ def __init__(
         self.config = config
         self.persistence_path = self.config.PATHS.predictions
         self.load_model(input_file, input_model)
-    
+        self.user_df = self.model.dataset.user_df
+        self.item_df = self.model.dataset.item_df
+        self.interaction_df = self.model.dataset.interaction_df
+        self.item_no_interactions_list = self.model.dataset.item_no_interactions_list
+
     def load_model(self, input_file: Optional[str] = None, input_model: Optional[Model] = None) -> None:
         logging.info(f'Loading model...')
 
@@ -41,36 +44,91 @@ def load_model(self, input_file: Optional[str] = None, input_model: Optional[Mod
             self.input_file = click.format_filename(input_file)
             self.model = helpers.load_input_file(self.input_file)  # type: ignore
 
+    def create_scores_matrix(
+        self,
+        is_cab: bool
+    ) -> None:
+        '''
+        Formula: r^ui = f(q^u . p^i + b^u + b^i)
+        -------
+        r^ui = Prediction for user u and item i
+        q^u = User representations
+        p^i = Item representations
+        b^u = User feature bias
+        b^i = Item feature bias
+        '''
+        if is_cab:
+            model = self.model.cab_model
+        else:
+            model = self.model.model
+        user_bias, user_latent_repr = model.get_user_representations()
+        item_bias, item_latent_repr = model.get_item_representations()
+        logging.info(
+            f'Logging user latent features before broadcasting\n'
+            f'Type: {type(user_latent_repr)}\n'
+            f'Shape: {user_latent_repr.shape}'
+        )
+        logging.info(
+            f'Logging item latent features before broadcasting\n'
+            f'Type: {type(item_latent_repr)}\n'
+            f'Shape: {item_latent_repr.shape}'
+        )
+        logging.info(
+            f'Logging user bias features before broadcasting\n'
+            f'Type: {type(user_bias)}\n'
+            f'Shape: {user_bias.shape}'
+        )
+        logging.info(
+            f'Logging item bias features before broadcasting\n'
+            f'Type: {type(item_bias)}\n'
+            f'Shape: {item_bias.shape}'
+        )
+
+        user_bias = user_bias[:, np.newaxis]
+        item_bias = item_bias[:, np.newaxis]
+
+        self.dot_product = user_latent_repr @ item_latent_repr.T + user_bias + item_bias.T
+
     def get_lightfm_recommendation(
         self,
         user_index: int,
+        use_precomputed_scores: bool
     ) -> List[int]:
         '''
+        Top-picks
+        ---------
         Main function that creates user-variant recommendation lists.
         '''
-        model = self.model
-        interaction_df = self.model.dataset.interaction_df
-        item_df = self.model.dataset.item_df
-        user_df = self.model.dataset.user_df
-        item_no_interactions_list = self.model.dataset.item_no_interactions_list
         n_users, n_items = self.model.dataset.interactions.shape
 
-        is_new_user = (user_index not in list(user_df['user_id']))
+        is_new_user = (user_index not in list(self.user_df['user_id']))
+        logging.info('logging in create_prediction file')
+        logging.info('logging all users')
+        logging.info(self.user_df)
+        logging.info(self.user_df['user_id'])
+        logging.info(is_new_user)
+        logging.info(self.model.dataset)
+        logging.info(type(self.model.dataset.item_features))
+        logging.info(self.model.dataset.item_features)
+        logging.info(self.model.dataset.item_features.shape)
 
         if is_new_user:
             # TODO: Cold-start recommendation
             logging.info('Getting prediction for new user ~')
         else:
-            scores = self.model.model.predict(
-                user_index,
-                item_ids=np.arange(n_items),
-                item_features=self.model.dataset.item_features
-            )
-            item_df['scores'] = scores
-            top_items = item_df['product_id'][np.argsort(-scores)]
+            if use_precomputed_scores:
+                scores = self.dot_product[user_index]
+            else:
+                scores = self.model.model.predict(
+                    user_index,
+                    item_ids=np.arange(n_items),
+                    item_features=self.model.dataset.item_features
+                )
+            self.item_df['scores'] = scores
+            top_items = self.item_df['product_id'][np.argsort(-scores)]
             top_items_idx = top_items.index
-        
-        top_reccs_df = item_df.iloc[top_items_idx, :].head(10)
+
+        top_reccs_df = self.item_df.iloc[top_items_idx, :].head(10)
         topReccsTable = PrettyTable(['product_id', 'product_name', 'aisle', 'department', 'num', 'scores'])
         topReccsTable.add_row([
            top_reccs_df['product_id'],
@@ -87,24 +145,36 @@ def get_lightfm_recommendation(
     def get_similar_items(
         self,
         product_id: int,
+        rec_type: int
     ) -> pd.DataFrame:
         '''
-        Main function that creates similar variant recommendation lists.
+        Function that creates recommendation lists.
+
+        The intuition behind using less components is reducing the number of latent factors
+        that can be inferred. And, by excluding item features for the CAB model, recommendations
+        will be less based off explicit features such as `aisle` and `department`.
+        -------------------
+        type:
+        1 - Similar Items [DEFAULT_PARAMS]
+        2 - Complement Items [CAB_PARAMS]
         '''
-        item_df = self.model.dataset.item_df
-
-        annoy_model = AnnoyIndex(self.model.config.ANNOY_PARAMS['emb_dim'])
-        annoy_model.load(self.config.PATHS.models + '/item.ann')
+        logging.info(f'Logging recommendations for {self.model.config.ANNOY_PARAMS[rec_type]}')
+        if rec_type == 1:
+            annoy_model = AnnoyIndex(self.model.config.LIGHTFM_PARAMS['no_components'])
+            annoy_model.load(self.config.PATHS.models + '/item.ann')
+        elif rec_type == 2:
+            annoy_model = AnnoyIndex(self.model.config.LIGHTFM_CAB_PARAMS['no_components'])
+            annoy_model.load(self.config.PATHS.models + '/item_cab.ann')
         similar_variants = annoy_model.get_nns_by_item(
             product_id,
             self.model.config.ANNOY_PARAMS['nn_count'],
             search_k=-1,
             include_distances=False
         )
-        logging.info('inside sv')
+
         logging.info(type(similar_variants))
         logging.info(similar_variants)
-        similar_variants_df = item_df.iloc[similar_variants, :]
+        similar_variants_df = self.item_df.iloc[similar_variants, :]
 
         similarVariantsTable = PrettyTable(['product_id', 'product_name', 'aisle', 'department', 'num'])
         similarVariantsTable.add_row([
@@ -114,26 +184,40 @@ def get_similar_items(
            similar_variants_df['department'],
            similar_variants_df['num']
         ])
-        logging.info(f'Similar Variants Data: \n{similarVariantsTable}')
+        logging.info(f'{self.model.config.ANNOY_PARAMS[rec_type]} Data: \n{similarVariantsTable}')
 
         return similar_variants_df
 
+    def cache_top_picks(self) -> None:
+        '''
+        Function to store top recommendations for each user into a dictionary.
+        '''
+        logging.info('Getting Top-Picks for each User')
+        user_toppicks_cache = {}
+        for uid in range(len(self.user_df)):
+            logging.info(f'Caching Top-Picks Recommendation for User {uid}')
+            user_toppicks_cache[uid] = self.get_lightfm_recommendation(user_index=uid, use_precomputed_scores=False)
+
+        self.user_toppicks_cache = user_toppicks_cache
+
 
 @click.command()
 @click.option('--input_file', default=None, type=click.Path(exists=True, dir_okay=False))
-@click.option('--user', default=None, type=click.Path(exists=True, dir_okay=False))
 @click.option('--config', default='production')
-def main(input_file: str, config: str, user: int) -> None:
+def main(input_file: str, config: str) -> None:
 
     logging.info("Let's make a prediction!")
     configuration = helpers.get_configuration(config, prediction_configurations)
 
     predictor = UserItemPrediction(config=configuration, input_file=None)
-    predictor.get_similar_items(configuration.DEFAULT_ITEM_EG)
-    predictor.get_lightfm_recommendation(configuration.DEFAULT_USER_EG)
+    # predictor.create_scores_matrix(is_cab=False)
+    # predictor.create_scores_matrix(is_cab=True)
+    predictor.get_similar_items(product_id=configuration.DEFAULT_ITEM_EG, rec_type=1)
+    predictor.get_similar_items(product_id=configuration.DEFAULT_ITEM_EG, rec_type=2)
+    predictor.get_lightfm_recommendation(user_index=configuration.DEFAULT_USER_EG, use_precomputed_scores=False)
 
 
 if __name__ == "__main__":
     logger = helpers.get_logger()
 
-    main()
+    main()