Commit b9da68f

Merge pull request #70 from ulissigroup/master

merge ktran updates

zulissi authored Nov 28, 2018
2 parents 8daa5dd + 49964db commit b9da68f
Showing 11 changed files with 139 additions and 97 deletions.
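In short: the two prediction getters, `get_catalog_docs_with_adsorption_energies` and `get_catalog_docs_with_orr_potentials`, are folded into a single `get_catalog_docs_with_predictions` function whose projection-building is split into two private helpers. A before/after sketch of the call sites (the argument values here are illustrative, not from this commit):

# Before: callers had to enumerate adsorbates and models themselves
# (illustrative argument values).
docs = get_catalog_docs_with_adsorption_energies(adsorbates=['CO'], models=['model0'])
orr_docs = get_catalog_docs_with_orr_potentials(models=['model0'])

# After: one call; adsorbates and models are discovered by sampling the catalog.
docs = get_catalog_docs_with_predictions(latest_predictions=True)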
107 changes: 57 additions & 50 deletions gaspy/gasdb.py
@@ -212,17 +212,12 @@ def _pull_catalog_from_mongo(pipeline):
     return docs


-def get_catalog_docs_with_adsorption_energies(adsorbates=None, models=['model0'], latest_predictions=True):
+def get_catalog_docs_with_predictions(latest_predictions=True):
     '''
     Nearly identical to `get_catalog_docs`, except it also pulls our surrogate
     modeling predictions for adsorption energies.
     Args:
-        adsorbates          A list of strings indicating which sets of adsorbates
-                            you want to get adsorption energy predictions for,
-                            e.g., ['CO', 'H'] or ['O', 'OH', 'OOH'].
-        models              A list of strings indicating which models whose
-                            predictions you want to get.
         latest_predictions  Boolean indicating whether or not you want either
                             the latest predictions or all of them.
     Returns:
@@ -231,78 +226,90 @@ def get_catalog_docs_with_adsorption_energies(adsorbates=None, models=['model0']
                 with a 'predictions' key that has the surrogate modeling
                 predictions of adsorption energy.
     '''
-    if isinstance(models, str):
-        raise SyntaxError('The models argument must be a sequence of strings where each '
-                          'element is a model name. Do not pass a single string.')
-
-    # Get the prediction data
+    # Get the default catalog fingerprints, then append the fingerprints we
+    # need to get the predictions.
     fingerprints = defaults.catalog_fingerprints()
-    for adsorbate in adsorbates:
-        for model in models:
-            data_location = 'predictions.adsorption_energy.%s.%s' % (adsorbate, model)
-            if latest_predictions:
-                fingerprints[data_location] = {'$arrayElemAt': ['$'+data_location, -1]}
-            else:
-                fingerprints[data_location] = '$'+data_location
+    fingerprints = _add_adsorption_energy_predictions_to_fingerprints(fingerprints, latest_predictions)
+    fingerprints = _add_orr_predictions_to_fingerprints(fingerprints, latest_predictions)

     # Get the documents
     project = {'$project': fingerprints}
     pipeline = [project]
     docs = _pull_catalog_from_mongo(pipeline)

     # Clean the documents up
-    expected_keys = set(fingerprints.keys())
-    for adsorbate in adsorbates:
-        for model in models:
-            expected_keys.remove('predictions.adsorption_energy.%s.%s' % (adsorbate, model))
+    expected_keys = set(defaults.catalog_fingerprints())
     expected_keys.add('predictions')
     cleaned_docs = _clean_up_aggregated_docs(docs, expected_keys=expected_keys)

     return cleaned_docs


-def get_catalog_docs_with_orr_potentials(models=['model0'], latest_predictions=True):
+def _add_adsorption_energy_predictions_to_fingerprints(fingerprints, latest_predictions):
     '''
-    Nearly identical to `get_catalog_docs`, except it also pulls our surrogate
-    modeling predictions for adsorption energy.
+    This function will add particular keys to a `fingerprints` dictionary that
+    can be used in a Mongo projection to get the adsorption energy predictions
+    from our catalog.
     Args:
-        models              A list of strings indicating which models whose
-                            predictions you want to get.
+        fingerprints        A dictionary that you plan to pass as a projection
+                            command to a pymongo collection aggregation.
         latest_predictions  Boolean indicating whether or not you want either
                             the latest predictions or all of them.
-    Returns:
-        docs    A list of dictionaries whose key/value pairings are the
-                ones given by `gaspy.defaults.catalog_fingerprints`, along
-                with a 'predictions' key that has the surrogate modeling
-                predictions of adsorption energy.
     '''
-    if isinstance(models, str):
-        raise SyntaxError('The models argument must be a sequence of strings where each '
-                          'element is a model name. Do not pass a single string.')
+    # Figure out what type of JSON structure our adsorption energy predictions
+    # have. We do that by looking at the structure of one random document. Note
+    # that this assumes that all documents are structured identically.
+    with get_mongo_collection('catalog') as collection:
+        cursor = collection.aggregate([{"$sample": {"size": 1}}])
+        example_doc = list(cursor)[0]
+    predictions = example_doc['predictions']['adsorption_energy']
+    adsorbates = set(predictions.keys())
+    models = set(model for adsorbate in adsorbates for model in predictions[adsorbate])

-    # Get the prediction data
-    fingerprints = defaults.catalog_fingerprints()
+    # Make a projection query that targets predictions for each combination of
+    # adsorbate and model.
+    for adsorbate in adsorbates:
+        for model in models:
+            data_location = 'predictions.adsorption_energy.%s.%s' % (adsorbate, model)
+            if latest_predictions:
+                fingerprints[data_location] = {'$arrayElemAt': ['$'+data_location, -1]}
+            else:
+                fingerprints[data_location] = '$'+data_location
+
+    return fingerprints
+
+
+def _add_orr_predictions_to_fingerprints(fingerprints, latest_predictions):
+    '''
+    This function will add particular keys to a `fingerprints` dictionary that
+    can be used in a Mongo projection to get the ORR chemistry predictions from
+    our catalog.
+    Args:
+        fingerprints        A dictionary that you plan to pass as a projection
+                            command to a pymongo collection aggregation.
+        latest_predictions  Boolean indicating whether or not you want either
+                            the latest predictions or all of them.
+    '''
+    # Figure out what type of JSON structure our ORR predictions have. We do
+    # that by looking at the structure of one random document. Note that this
+    # assumes that all documents are structured identically.
+    with get_mongo_collection('catalog') as collection:
+        cursor = collection.aggregate([{"$sample": {"size": 1}}])
+        example_doc = list(cursor)[0]
+    predictions = example_doc['predictions']['orr_onset_potential_4e']
+    models = set(predictions.keys())

     # Make a projection query that targets predictions for each model.
     for model in models:
         data_location = 'predictions.orr_onset_potential_4e.%s' % model
         if latest_predictions:
             fingerprints[data_location] = {'$arrayElemAt': ['$'+data_location, -1]}
         else:
             fingerprints[data_location] = '$'+data_location

-    # Get the documents
-    project = {'$project': fingerprints}
-    pipeline = [project]
-    docs = _pull_catalog_from_mongo(pipeline)
-
-    # Clean the documents up
-    expected_keys = set(fingerprints.keys())
-    for model in models:
-        expected_keys.remove('predictions.orr_onset_potential_4e.%s' % model)
-    expected_keys.add('predictions')
-    cleaned_docs = _clean_up_aggregated_docs(docs, expected_keys=expected_keys)
-
-    return cleaned_docs
+    return fingerprints


 def get_surface_docs(extra_fingerprints=None, filters=None):
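For orientation, a minimal usage sketch of the refactored function. It assumes a populated 'catalog' Mongo collection; the 'CO'/'model0' keys are taken from the test parametrizations elsewhere in this commit, and the (date, energy) pair layout is inferred from the `[model_tag][1]` indexing in the tests below, so treat both as assumptions rather than guaranteed structure:

from gaspy.gasdb import get_catalog_docs_with_predictions

# Pull every catalog document along with only the most recent prediction for
# each adsorbate/model pair (selected by the '$arrayElemAt: [..., -1]'
# projection built above).
docs = get_catalog_docs_with_predictions(latest_predictions=True)

for doc in docs:
    # Each leaf is assumed to be a (date, energy) pair, per the tests' indexing.
    date, energy = doc['predictions']['adsorption_energy']['CO']['model0']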
129 changes: 82 additions & 47 deletions gaspy/tests/gasdb_test.py
@@ -20,8 +20,10 @@
                     get_adsorption_docs,
                     _clean_up_aggregated_docs,
                     get_catalog_docs,
-                    get_catalog_docs_with_adsorption_energies,
-                    get_catalog_docs_with_orr_potentials,
+                    _pull_catalog_from_mongo,
+                    get_catalog_docs_with_predictions,
+                    _add_adsorption_energy_predictions_to_fingerprints,
+                    _add_orr_predictions_to_fingerprints,
                     get_surface_docs,
                     get_unsimulated_catalog_docs,
                     _get_attempted_adsorption_docs,
@@ -45,6 +47,7 @@
 from pymongo.collection import Collection
 from pymongo.errors import OperationFailure
 from ..utils import read_rc
+from ..defaults import catalog_fingerprints

 REGRESSION_BASELINES_LOCATION = '/home/GASpy/gaspy/tests/regression_baselines/gasdb/'

@@ -241,60 +244,41 @@ def test_get_catalog_docs():


 @pytest.mark.baseline
-@pytest.mark.parametrize('adsorbates, models, latest_predictions',
-                         [(['CO'], ['model0'], True),
-                          (['CO', 'H'], ['model0'], True),
-                          (['CO', 'H'], ['model0'], False)])
-def test_to_create_catalog_docs_with_adsorption_energies(adsorbates, models, latest_predictions):
-    docs = get_catalog_docs_with_adsorption_energies(adsorbates, models, latest_predictions)
-
-    arg_hash = hashlib.sha224((str(adsorbates) + str(models) + str(latest_predictions)).encode()).hexdigest()
-    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_adsorption_energies_%s' % arg_hash + '.pkl'
-    with open(file_name, 'wb') as file_handle:
+def test_to_create_unprocessed_catalog_docs():
+    fingerprints = catalog_fingerprints()
+    project = {'$project': fingerprints}
+    pipeline = [project]
+    docs = _pull_catalog_from_mongo(pipeline)
+
+    with open(REGRESSION_BASELINES_LOCATION + 'unprocessed_catalog_documents' + '.pkl', 'wb') as file_handle:
         pickle.dump(docs, file_handle)
     assert True


-@pytest.mark.parametrize('adsorbates, models, latest_predictions',
-                         [(['CO'], ['model0'], True),
-                          (['CO', 'H'], ['model0'], True),
-                          (['CO', 'H'], ['model0'], False)])
-def test_get_catalog_docs_with_adsorption_energies(adsorbates, models, latest_predictions):
-    '''
-    This could be a "real" test, but I am really busy and don't have time to design one.
-    So I'm turning this into a regression test to let someone else (probably me)
-    deal with this later.
-
-    If you do fix this, you should probably add more than one day's worth of predictions
-    to the unit testing catalog.
-    '''
-    docs = get_catalog_docs_with_adsorption_energies(adsorbates, models, latest_predictions)
-
-    arg_hash = hashlib.sha224((str(adsorbates) + str(models) + str(latest_predictions)).encode()).hexdigest()
-    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_adsorption_energies_%s' % arg_hash + '.pkl'
-    with open(file_name, 'rb') as file_handle:
+def test__pull_catalog_from_mongo():
+    fingerprints = catalog_fingerprints()
+    project = {'$project': fingerprints}
+    pipeline = [project]
+    docs = _pull_catalog_from_mongo(pipeline)
+
+    with open(REGRESSION_BASELINES_LOCATION + 'unprocessed_catalog_documents' + '.pkl', 'rb') as file_handle:
         expected_docs = pickle.load(file_handle)
     assert docs == expected_docs


 @pytest.mark.baseline
-@pytest.mark.parametrize('models, latest_predictions',
-                         [(['model0'], True),
-                          (['model0'], False)])
-def test_to_create_catalog_docs_with_orr_potentials(models, latest_predictions):
-    docs = get_catalog_docs_with_orr_potentials(models, latest_predictions)
-
-    arg_hash = hashlib.sha224((str(models) + str(latest_predictions)).encode()).hexdigest()
-    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_orr_potentials_%s' % arg_hash + '.pkl'
+@pytest.mark.parametrize('latest_predictions', [True, False])
+def test_to_create_catalog_docs_with_predictions(latest_predictions):
+    docs = get_catalog_docs_with_predictions(latest_predictions)
+
+    arg_hash = hashlib.sha224(str(latest_predictions).encode()).hexdigest()
+    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_predictions_%s' % arg_hash + '.pkl'
     with open(file_name, 'wb') as file_handle:
         pickle.dump(docs, file_handle)
     assert True


-@pytest.mark.parametrize('models, latest_predictions',
-                         [(['model0'], True),
-                          (['model0'], False)])
-def test_get_catalog_docs_with_orr_potentials(models, latest_predictions):
+@pytest.mark.parametrize('latest_predictions', [True, False])
+def test_get_catalog_docs_with_predictions(latest_predictions):
     '''
     This could be a "real" test, but I am really busy and don't have time to design one.
     So I'm turning this into a regression test to let someone else (probably me)
@@ -303,15 +287,67 @@ def test_get_catalog_docs_with_orr_potentials(models, latest_predictions):
     If you do fix this, you should probably add more than one day's worth of predictions
     to the unit testing catalog.
     '''
-    docs = get_catalog_docs_with_orr_potentials(models, latest_predictions)
+    docs = get_catalog_docs_with_predictions(latest_predictions)

-    arg_hash = hashlib.sha224((str(models) + str(latest_predictions)).encode()).hexdigest()
-    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_orr_potentials_%s' % arg_hash + '.pkl'
+    arg_hash = hashlib.sha224(str(latest_predictions).encode()).hexdigest()
+    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_predictions_%s' % arg_hash + '.pkl'
     with open(file_name, 'rb') as file_handle:
         expected_docs = pickle.load(file_handle)
     assert docs == expected_docs


+@pytest.mark.parametrize('latest_predictions', [True, False])
+def test__add_adsorption_energy_predictions_to_fingerprints(latest_predictions):
+    default_fingerprints = catalog_fingerprints()
+    fingerprints = _add_adsorption_energy_predictions_to_fingerprints(default_fingerprints, latest_predictions)
+
+    # Get ALL of the adsorbates and models in the unit testing collection
+    with get_mongo_collection('catalog') as collection:
+        cursor = collection.aggregate([{"$sample": {"size": 1}}])
+        docs = list(cursor)
+    adsorbates = set()
+    models = set()
+    for doc in docs:
+        predictions = doc['predictions']['adsorption_energy']
+        new_adsorbates = set(predictions.keys())
+        new_models = set(model for adsorbate in new_adsorbates for model in predictions[adsorbate])
+        adsorbates.update(new_adsorbates)
+        models.update(new_models)
+
+    # Make sure that every single query is there
+    for adsorbate in adsorbates:
+        for model in models:
+            data_location = 'predictions.adsorption_energy.%s.%s' % (adsorbate, model)
+            if latest_predictions:
+                assert fingerprints[data_location] == {'$arrayElemAt': ['$'+data_location, -1]}
+            else:
+                assert fingerprints[data_location] == '$'+data_location
+
+
+@pytest.mark.parametrize('latest_predictions', [True, False])
+def test__add_orr_predictions_to_fingerprints(latest_predictions):
+    default_fingerprints = catalog_fingerprints()
+    fingerprints = _add_orr_predictions_to_fingerprints(default_fingerprints, latest_predictions)
+
+    # Get ALL of the models in the unit testing collection
+    with get_mongo_collection('catalog') as collection:
+        cursor = collection.aggregate([{"$sample": {"size": 1}}])
+        docs = list(cursor)
+    models = set()
+    for doc in docs:
+        predictions = doc['predictions']['orr_onset_potential_4e']
+        new_models = set(predictions.keys())
+        models.update(new_models)
+
+    # Make sure that every single query is there
+    for model in models:
+        data_location = 'predictions.orr_onset_potential_4e.%s' % model
+        if latest_predictions:
+            assert fingerprints[data_location] == {'$arrayElemAt': ['$'+data_location, -1]}
+        else:
+            assert fingerprints[data_location] == '$'+data_location
+
+
 @pytest.mark.baseline
 @pytest.mark.parametrize('extra_fingerprints', [None, {'user': 'user'}])
 def test_to_create_aggregated_surface_documents(extra_fingerprints):
@@ -585,9 +621,8 @@ def test_get_low_coverage_ml_docs(adsorbates, model_tag):
     our adsorption collection has a higher (or equal) adsorption
     energy than the one reported by `get_low_coverage_ml_docs`
     '''
-    models = ['model0']
     low_coverage_docs = get_low_coverage_ml_docs(adsorbates)
-    all_docs = get_catalog_docs_with_adsorption_energies(adsorbates, models)
+    all_docs = get_catalog_docs_with_predictions()

     for doc in all_docs:
         energy = doc['predictions']['adsorption_energy'][adsorbates[0]][model_tag][1]
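The tests above all follow the same regression-baseline pattern: a `@pytest.mark.baseline` twin pickles the current output, and the paired test asserts against that pickle, with one baseline file per parametrized argument combination. A self-contained sketch of the pattern, where `make_docs`, `_baseline_file`, and the 'example_' file prefix are hypothetical stand-ins, not names from this commit:

import hashlib
import pickle

REGRESSION_BASELINES_LOCATION = '/home/GASpy/gaspy/tests/regression_baselines/gasdb/'

def _baseline_file(*args):
    # One baseline pickle per argument combination, keyed by a hash of the args
    arg_hash = hashlib.sha224(''.join(str(arg) for arg in args).encode()).hexdigest()
    return REGRESSION_BASELINES_LOCATION + 'example_%s.pkl' % arg_hash

def create_baseline(make_docs, *args):
    # Run once (the @pytest.mark.baseline twin) to freeze the current output
    with open(_baseline_file(*args), 'wb') as file_handle:
        pickle.dump(make_docs(*args), file_handle)

def check_against_baseline(make_docs, *args):
    # The paired test then asserts that the output has not drifted since
    with open(_baseline_file(*args), 'rb') as file_handle:
        expected_docs = pickle.load(file_handle)
    assert make_docs(*args) == expected_docs

This guards against unintended behavior changes without hand-writing expected values, at the cost of silently blessing whatever output existed when the baseline was created, which is exactly the caveat the docstrings above admit.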
Binary file modified gaspy/tests/mongo_test_collections/unit_testing_catalog_docs.pkl
