Commit b9da68f

Merge pull request #70 from ulissigroup/master

merge ktran updates

zulissi authored Nov 28, 2018
2 parents 8daa5dd + 49964db commit b9da68f
Showing 11 changed files with 139 additions and 97 deletions.
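In short: the two prediction getters, `get_catalog_docs_with_adsorption_energies` and `get_catalog_docs_with_orr_potentials`, are folded into a single `get_catalog_docs_with_predictions` function whose projection-building is split into two private helpers. A before/after sketch of the call sites (the argument values here are illustrative, not from this commit):

# Before: callers had to enumerate adsorbates and models themselves
# (illustrative argument values).
docs = get_catalog_docs_with_adsorption_energies(adsorbates=['CO'], models=['model0'])
orr_docs = get_catalog_docs_with_orr_potentials(models=['model0'])

# After: one call; adsorbates and models are discovered by sampling the catalog.
docs = get_catalog_docs_with_predictions(latest_predictions=True)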
107 changes: 57 additions & 50 deletions gaspy/gasdb.py
@@ -212,17 +212,12 @@ def _pull_catalog_from_mongo(pipeline):
     return docs


-def get_catalog_docs_with_adsorption_energies(adsorbates=None, models=['model0'], latest_predictions=True):
+def get_catalog_docs_with_predictions(latest_predictions=True):
     '''
     Nearly identical to `get_catalog_docs`, except it also pulls our surrogate
     modeling predictions for adsorption energies.
     Args:
-        adsorbates          A list of strings indicating which sets of adsorbates
-                            you want to get adsorption energy predictions for,
-                            e.g., ['CO', 'H'] or ['O', 'OH', 'OOH'].
-        models              A list of strings indicating which models whose
-                            predictions you want to get.
         latest_predictions  Boolean indicating whether or not you want either
                             the latest predictions or all of them.
     Returns:
@@ -231,78 +226,90 @@ def get_catalog_docs_with_adsorption_energies(adsorbates=None, models=['model0']
                 with a 'predictions' key that has the surrogate modeling
                 predictions of adsorption energy.
     '''
-    if isinstance(models, str):
-        raise SyntaxError('The models argument must be a sequence of strings where each '
-                          'element is a model name. Do not pass a single string.')
-
-    # Get the prediction data
+    # Get the default catalog fingerprints, then append the fingerprints we
+    # need to get the predictions.
     fingerprints = defaults.catalog_fingerprints()
-    for adsorbate in adsorbates:
-        for model in models:
-            data_location = 'predictions.adsorption_energy.%s.%s' % (adsorbate, model)
-            if latest_predictions:
-                fingerprints[data_location] = {'$arrayElemAt': ['$'+data_location, -1]}
-            else:
-                fingerprints[data_location] = '$'+data_location
+    fingerprints = _add_adsorption_energy_predictions_to_fingerprints(fingerprints, latest_predictions)
+    fingerprints = _add_orr_predictions_to_fingerprints(fingerprints, latest_predictions)

     # Get the documents
     project = {'$project': fingerprints}
     pipeline = [project]
     docs = _pull_catalog_from_mongo(pipeline)

     # Clean the documents up
-    expected_keys = set(fingerprints.keys())
-    for adsorbate in adsorbates:
-        for model in models:
-            expected_keys.remove('predictions.adsorption_energy.%s.%s' % (adsorbate, model))
+    expected_keys = set(defaults.catalog_fingerprints())
     expected_keys.add('predictions')
     cleaned_docs = _clean_up_aggregated_docs(docs, expected_keys=expected_keys)

     return cleaned_docs


-def get_catalog_docs_with_orr_potentials(models=['model0'], latest_predictions=True):
+def _add_adsorption_energy_predictions_to_fingerprints(fingerprints, latest_predictions):
     '''
-    Nearly identical to `get_catalog_docs`, except it also pulls our surrogate
-    modeling predictions for adsorption energy.
+    This function will add particular keys to a `fingerprints` dictionary that
+    can be used in a Mongo projection to get the adsorption energy predictions
+    from our catalog.
     Args:
-        models              A list of strings indicating which models whose
-                            predictions you want to get.
+        fingerprints        A dictionary that you plan to pass as a projection
+                            command to a pymongo collection aggregation.
         latest_predictions  Boolean indicating whether or not you want either
                             the latest predictions or all of them.
-    Returns:
-        docs    A list of dictionaries whose key/value pairings are the
-                ones given by `gaspy.defaults.catalog_fingerprints`, along
-                with a 'predictions' key that has the surrogate modeling
-                predictions of adsorption energy.
     '''
-    if isinstance(models, str):
-        raise SyntaxError('The models argument must be a sequence of strings where each '
-                          'element is a model name. Do not pass a single string.')
+    # Figure out what type of JSON structure our adsorption energy predictions
+    # have. We do that by looking at the structure of one random document. Note
+    # that this assumes that all documents are structured identically.
+    with get_mongo_collection('catalog') as collection:
+        cursor = collection.aggregate([{"$sample": {"size": 1}}])
+        example_doc = list(cursor)[0]
+    predictions = example_doc['predictions']['adsorption_energy']
+    adsorbates = set(predictions.keys())
+    models = set(model for adsorbate in adsorbates for model in predictions[adsorbate])

-    # Get the prediction data
-    fingerprints = defaults.catalog_fingerprints()
+    # Make a projection query that targets predictions for each combination of
+    # adsorbate and model.
+    for adsorbate in adsorbates:
+        for model in models:
+            data_location = 'predictions.adsorption_energy.%s.%s' % (adsorbate, model)
+            if latest_predictions:
+                fingerprints[data_location] = {'$arrayElemAt': ['$'+data_location, -1]}
+            else:
+                fingerprints[data_location] = '$'+data_location
+
+    return fingerprints
+
+
+def _add_orr_predictions_to_fingerprints(fingerprints, latest_predictions):
+    '''
+    This function will add particular keys to a `fingerprints` dictionary that
+    can be used in a Mongo projection to get the ORR chemistry predictions from
+    our catalog.
+    Args:
+        fingerprints        A dictionary that you plan to pass as a projection
+                            command to a pymongo collection aggregation.
+        latest_predictions  Boolean indicating whether or not you want either
+                            the latest predictions or all of them.
+    '''
+    # Figure out what type of JSON structure our ORR predictions have. We do
+    # that by looking at the structure of one random document. Note that this
+    # assumes that all documents are structured identically.
+    with get_mongo_collection('catalog') as collection:
+        cursor = collection.aggregate([{"$sample": {"size": 1}}])
+        example_doc = list(cursor)[0]
+    predictions = example_doc['predictions']['orr_onset_potential_4e']
+    models = set(predictions.keys())

     # Make a projection query that targets predictions for each model.
     for model in models:
         data_location = 'predictions.orr_onset_potential_4e.%s' % model
         if latest_predictions:
             fingerprints[data_location] = {'$arrayElemAt': ['$'+data_location, -1]}
         else:
             fingerprints[data_location] = '$'+data_location

-    # Get the documents
-    project = {'$project': fingerprints}
-    pipeline = [project]
-    docs = _pull_catalog_from_mongo(pipeline)
-
-    # Clean the documents up
-    expected_keys = set(fingerprints.keys())
-    for model in models:
-        expected_keys.remove('predictions.orr_onset_potential_4e.%s' % model)
-    expected_keys.add('predictions')
-    cleaned_docs = _clean_up_aggregated_docs(docs, expected_keys=expected_keys)
-
-    return cleaned_docs
+    return fingerprints


 def get_surface_docs(extra_fingerprints=None, filters=None):
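For orientation, a minimal usage sketch of the refactored function. It assumes a populated 'catalog' Mongo collection; the 'CO'/'model0' keys are taken from the test parametrizations elsewhere in this commit, and the (date, energy) pair layout is inferred from the `[model_tag][1]` indexing in the tests below, so treat both as assumptions rather than guaranteed structure:

from gaspy.gasdb import get_catalog_docs_with_predictions

# Pull every catalog document along with only the most recent prediction for
# each adsorbate/model pair (selected by the '$arrayElemAt: [..., -1]'
# projection built above).
docs = get_catalog_docs_with_predictions(latest_predictions=True)

for doc in docs:
    # Each leaf is assumed to be a (date, energy) pair, per the tests' indexing.
    date, energy = doc['predictions']['adsorption_energy']['CO']['model0']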
129 changes: 82 additions & 47 deletions gaspy/tests/gasdb_test.py
@@ -20,8 +20,10 @@
                     get_adsorption_docs,
                     _clean_up_aggregated_docs,
                     get_catalog_docs,
-                    get_catalog_docs_with_adsorption_energies,
-                    get_catalog_docs_with_orr_potentials,
+                    _pull_catalog_from_mongo,
+                    get_catalog_docs_with_predictions,
+                    _add_adsorption_energy_predictions_to_fingerprints,
+                    _add_orr_predictions_to_fingerprints,
                     get_surface_docs,
                     get_unsimulated_catalog_docs,
                     _get_attempted_adsorption_docs,
@@ -45,6 +47,7 @@
 from pymongo.collection import Collection
 from pymongo.errors import OperationFailure
 from ..utils import read_rc
+from ..defaults import catalog_fingerprints

 REGRESSION_BASELINES_LOCATION = '/home/GASpy/gaspy/tests/regression_baselines/gasdb/'

@@ -241,60 +244,41 @@ def test_get_catalog_docs():


 @pytest.mark.baseline
-@pytest.mark.parametrize('adsorbates, models, latest_predictions',
-                         [(['CO'], ['model0'], True),
-                          (['CO', 'H'], ['model0'], True),
-                          (['CO', 'H'], ['model0'], False)])
-def test_to_create_catalog_docs_with_adsorption_energies(adsorbates, models, latest_predictions):
-    docs = get_catalog_docs_with_adsorption_energies(adsorbates, models, latest_predictions)
-
-    arg_hash = hashlib.sha224((str(adsorbates) + str(models) + str(latest_predictions)).encode()).hexdigest()
-    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_adsorption_energies_%s' % arg_hash + '.pkl'
-    with open(file_name, 'wb') as file_handle:
+def test_to_create_unprocessed_catalog_docs():
+    fingerprints = catalog_fingerprints()
+    project = {'$project': fingerprints}
+    pipeline = [project]
+    docs = _pull_catalog_from_mongo(pipeline)
+
+    with open(REGRESSION_BASELINES_LOCATION + 'unprocessed_catalog_documents' + '.pkl', 'wb') as file_handle:
         pickle.dump(docs, file_handle)
     assert True


-@pytest.mark.parametrize('adsorbates, models, latest_predictions',
-                         [(['CO'], ['model0'], True),
-                          (['CO', 'H'], ['model0'], True),
-                          (['CO', 'H'], ['model0'], False)])
-def test_get_catalog_docs_with_adsorption_energies(adsorbates, models, latest_predictions):
-    '''
-    This could be a "real" test, but I am really busy and don't have time to design one.
-    So I'm turning this into a regression test to let someone else (probably me)
-    deal with this later.
-
-    If you do fix this, you should probably add more than one day's worth of predictions
-    to the unit testing catalog.
-    '''
-    docs = get_catalog_docs_with_adsorption_energies(adsorbates, models, latest_predictions)
-
-    arg_hash = hashlib.sha224((str(adsorbates) + str(models) + str(latest_predictions)).encode()).hexdigest()
-    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_adsorption_energies_%s' % arg_hash + '.pkl'
-    with open(file_name, 'rb') as file_handle:
+def test__pull_catalog_from_mongo():
+    fingerprints = catalog_fingerprints()
+    project = {'$project': fingerprints}
+    pipeline = [project]
+    docs = _pull_catalog_from_mongo(pipeline)
+
+    with open(REGRESSION_BASELINES_LOCATION + 'unprocessed_catalog_documents' + '.pkl', 'rb') as file_handle:
         expected_docs = pickle.load(file_handle)
     assert docs == expected_docs


 @pytest.mark.baseline
-@pytest.mark.parametrize('models, latest_predictions',
-                         [(['model0'], True),
-                          (['model0'], False)])
-def test_to_create_catalog_docs_with_orr_potentials(models, latest_predictions):
-    docs = get_catalog_docs_with_orr_potentials(models, latest_predictions)
-
-    arg_hash = hashlib.sha224((str(models) + str(latest_predictions)).encode()).hexdigest()
-    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_orr_potentials_%s' % arg_hash + '.pkl'
+@pytest.mark.parametrize('latest_predictions', [True, False])
+def test_to_create_catalog_docs_with_predictions(latest_predictions):
+    docs = get_catalog_docs_with_predictions(latest_predictions)
+
+    arg_hash = hashlib.sha224(str(latest_predictions).encode()).hexdigest()
+    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_predictions_%s' % arg_hash + '.pkl'
     with open(file_name, 'wb') as file_handle:
         pickle.dump(docs, file_handle)
     assert True


-@pytest.mark.parametrize('models, latest_predictions',
-                         [(['model0'], True),
-                          (['model0'], False)])
-def test_get_catalog_docs_with_orr_potentials(models, latest_predictions):
+@pytest.mark.parametrize('latest_predictions', [True, False])
+def test_get_catalog_docs_with_predictions(latest_predictions):
     '''
     This could be a "real" test, but I am really busy and don't have time to design one.
     So I'm turning this into a regression test to let someone else (probably me)
@@ -303,15 +287,67 @@ def test_get_catalog_docs_with_orr_potentials(models, latest_predictions):
     If you do fix this, you should probably add more than one day's worth of predictions
     to the unit testing catalog.
     '''
-    docs = get_catalog_docs_with_orr_potentials(models, latest_predictions)
+    docs = get_catalog_docs_with_predictions(latest_predictions)

-    arg_hash = hashlib.sha224((str(models) + str(latest_predictions)).encode()).hexdigest()
-    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_orr_potentials_%s' % arg_hash + '.pkl'
+    arg_hash = hashlib.sha224(str(latest_predictions).encode()).hexdigest()
+    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_predictions_%s' % arg_hash + '.pkl'
     with open(file_name, 'rb') as file_handle:
         expected_docs = pickle.load(file_handle)
     assert docs == expected_docs


+@pytest.mark.parametrize('latest_predictions', [True, False])
+def test__add_adsorption_energy_predictions_to_fingerprints(latest_predictions):
+    default_fingerprints = catalog_fingerprints()
+    fingerprints = _add_adsorption_energy_predictions_to_fingerprints(default_fingerprints, latest_predictions)
+
+    # Get ALL of the adsorbates and models in the unit testing collection
+    with get_mongo_collection('catalog') as collection:
+        cursor = collection.aggregate([{"$sample": {"size": 1}}])
+        docs = list(cursor)
+    adsorbates = set()
+    models = set()
+    for doc in docs:
+        predictions = doc['predictions']['adsorption_energy']
+        new_adsorbates = set(predictions.keys())
+        new_models = set(model for adsorbate in new_adsorbates for model in predictions[adsorbate])
+        adsorbates.update(new_adsorbates)
+        models.update(new_models)
+
+    # Make sure that every single query is there
+    for adsorbate in adsorbates:
+        for model in models:
+            data_location = 'predictions.adsorption_energy.%s.%s' % (adsorbate, model)
+            if latest_predictions:
+                assert fingerprints[data_location] == {'$arrayElemAt': ['$'+data_location, -1]}
+            else:
+                assert fingerprints[data_location] == '$'+data_location
+
+
+@pytest.mark.parametrize('latest_predictions', [True, False])
+def test__add_orr_predictions_to_fingerprints(latest_predictions):
+    default_fingerprints = catalog_fingerprints()
+    fingerprints = _add_orr_predictions_to_fingerprints(default_fingerprints, latest_predictions)
+
+    # Get ALL of the models in the unit testing collection
+    with get_mongo_collection('catalog') as collection:
+        cursor = collection.aggregate([{"$sample": {"size": 1}}])
+        docs = list(cursor)
+    models = set()
+    for doc in docs:
+        predictions = doc['predictions']['orr_onset_potential_4e']
+        new_models = set(predictions.keys())
+        models.update(new_models)
+
+    # Make sure that every single query is there
+    for model in models:
+        data_location = 'predictions.orr_onset_potential_4e.%s' % model
+        if latest_predictions:
+            assert fingerprints[data_location] == {'$arrayElemAt': ['$'+data_location, -1]}
+        else:
+            assert fingerprints[data_location] == '$'+data_location
+
+
 @pytest.mark.baseline
 @pytest.mark.parametrize('extra_fingerprints', [None, {'user': 'user'}])
 def test_to_create_aggregated_surface_documents(extra_fingerprints):
@@ -585,9 +621,8 @@ def test_get_low_coverage_ml_docs(adsorbates, model_tag):
     our adsorption collection has a higher (or equal) adsorption
     energy than the one reported by `get_low_coverage_ml_docs`
     '''
-    models = ['model0']
     low_coverage_docs = get_low_coverage_ml_docs(adsorbates)
-    all_docs = get_catalog_docs_with_adsorption_energies(adsorbates, models)
+    all_docs = get_catalog_docs_with_predictions()

     for doc in all_docs:
         energy = doc['predictions']['adsorption_energy'][adsorbates[0]][model_tag][1]
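The tests above all follow the same regression-baseline pattern: a `@pytest.mark.baseline` twin pickles the current output, and the paired test asserts against that pickle, with one baseline file per parametrized argument combination. A self-contained sketch of the pattern, where `make_docs`, `_baseline_file`, and the 'example_' file prefix are hypothetical stand-ins, not names from this commit:

import hashlib
import pickle

REGRESSION_BASELINES_LOCATION = '/home/GASpy/gaspy/tests/regression_baselines/gasdb/'

def _baseline_file(*args):
    # One baseline pickle per argument combination, keyed by a hash of the args
    arg_hash = hashlib.sha224(''.join(str(arg) for arg in args).encode()).hexdigest()
    return REGRESSION_BASELINES_LOCATION + 'example_%s.pkl' % arg_hash

def create_baseline(make_docs, *args):
    # Run once (the @pytest.mark.baseline twin) to freeze the current output
    with open(_baseline_file(*args), 'wb') as file_handle:
        pickle.dump(make_docs(*args), file_handle)

def check_against_baseline(make_docs, *args):
    # The paired test then asserts that the output has not drifted since
    with open(_baseline_file(*args), 'rb') as file_handle:
        expected_docs = pickle.load(file_handle)
    assert make_docs(*args) == expected_docs

This guards against unintended behavior changes without hand-writing expected values, at the cost of silently blessing whatever output existed when the baseline was created, which is exactly the caveat the docstrings above admit.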
Binary file modified gaspy/tests/mongo_test_collections/unit_testing_catalog_docs.pkl
