Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge ktran updates #70

Merged
merged 4 commits into from
Nov 28, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 57 additions & 50 deletions gaspy/gasdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,17 +212,12 @@ def _pull_catalog_from_mongo(pipeline):
return docs


def get_catalog_docs_with_predictions(latest_predictions=True):
    '''
    Nearly identical to `get_catalog_docs`, except it also pulls our surrogate
    modeling predictions for adsorption energies and ORR chemistry.

    Args:
        latest_predictions  Boolean indicating whether you want only the
                            latest predictions or all of them.
    Returns:
        cleaned_docs    A list of dictionaries whose key/value pairings are
                        the ones given by `gaspy.defaults.catalog_fingerprints`,
                        along with a 'predictions' key that has the surrogate
                        modeling predictions of adsorption energy and ORR
                        onset potential.
    '''
    # Get the default catalog fingerprints, then append the fingerprints we
    # need to get the predictions.
    fingerprints = defaults.catalog_fingerprints()
    fingerprints = _add_adsorption_energy_predictions_to_fingerprints(fingerprints, latest_predictions)
    fingerprints = _add_orr_predictions_to_fingerprints(fingerprints, latest_predictions)

    # Get the documents
    project = {'$project': fingerprints}
    pipeline = [project]
    docs = _pull_catalog_from_mongo(pipeline)

    # Clean the documents up. The helpers above nest everything they add
    # under the 'predictions' key, so that is the only key we expect on top
    # of the default fingerprints.
    expected_keys = set(defaults.catalog_fingerprints())
    expected_keys.add('predictions')
    cleaned_docs = _clean_up_aggregated_docs(docs, expected_keys=expected_keys)

    return cleaned_docs


def get_catalog_docs_with_orr_potentials(models=['model0'], latest_predictions=True):
def _add_adsorption_energy_predictions_to_fingerprints(fingerprints, latest_predictions):
    '''
    This function will add particular keys to a `fingerprints` dictionary that
    can be used in a Mongo projection to get the adsorption energy predictions
    from our catalog.

    Args:
        fingerprints        A dictionary that you plan to pass as a projection
                            command to a pymongo collection aggregation.
        latest_predictions  Boolean indicating whether you want only the
                            latest predictions or all of them.
    Returns:
        fingerprints    The input dictionary with one projection key added per
                        adsorbate/model combination found in the catalog.
    '''
    # Figure out what type of json structure our adsorption energy predictions
    # have. We do that by looking at the structure of one random document.
    # Note that this assumes that all documents are structured identically.
    with get_mongo_collection('catalog') as collection:
        cursor = collection.aggregate([{"$sample": {"size": 1}}])
        example_doc = list(cursor)[0]
    predictions = example_doc['predictions']['adsorption_energy']
    adsorbates = set(predictions.keys())
    models = set(model for adsorbate in adsorbates for model in predictions[adsorbate])

    # Make a projection query that targets predictions for each combination of
    # adsorbate and model.
    for adsorbate in adsorbates:
        for model in models:
            data_location = 'predictions.adsorption_energy.%s.%s' % (adsorbate, model)
            if latest_predictions:
                # '$arrayElemAt' with index -1 projects only the most recent
                # prediction out of the stored history
                fingerprints[data_location] = {'$arrayElemAt': ['$'+data_location, -1]}
            else:
                fingerprints[data_location] = '$'+data_location

    return fingerprints


def _add_orr_predictions_to_fingerprints(fingerprints, latest_predictions):
    '''
    This function will add particular keys to a `fingerprints` dictionary that
    can be used in a Mongo projection to get the ORR chemistry predictions from
    our catalog.

    Args:
        fingerprints        A dictionary that you plan to pass as a projection
                            command to a pymongo collection aggregation.
        latest_predictions  Boolean indicating whether you want only the
                            latest predictions or all of them.
    Returns:
        fingerprints    The input dictionary with one projection key added per
                        ORR model found in the catalog.
    '''
    # Figure out what type of json structure our ORR predictions have. We do
    # that by looking at the structure of one random document. Note that this
    # assumes that all documents are structured identically.
    with get_mongo_collection('catalog') as collection:
        cursor = collection.aggregate([{"$sample": {"size": 1}}])
        example_doc = list(cursor)[0]
    predictions = example_doc['predictions']['orr_onset_potential_4e']
    models = set(predictions.keys())

    # Make a projection query that targets predictions for each model.
    for model in models:
        data_location = 'predictions.orr_onset_potential_4e.%s' % model
        if latest_predictions:
            # '$arrayElemAt' with index -1 projects only the most recent
            # prediction out of the stored history
            fingerprints[data_location] = {'$arrayElemAt': ['$'+data_location, -1]}
        else:
            fingerprints[data_location] = '$'+data_location

    return fingerprints


def get_surface_docs(extra_fingerprints=None, filters=None):
Expand Down
129 changes: 82 additions & 47 deletions gaspy/tests/gasdb_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
get_adsorption_docs,
_clean_up_aggregated_docs,
get_catalog_docs,
get_catalog_docs_with_adsorption_energies,
get_catalog_docs_with_orr_potentials,
_pull_catalog_from_mongo,
get_catalog_docs_with_predictions,
_add_adsorption_energy_predictions_to_fingerprints,
_add_orr_predictions_to_fingerprints,
get_surface_docs,
get_unsimulated_catalog_docs,
_get_attempted_adsorption_docs,
Expand All @@ -45,6 +47,7 @@
from pymongo.collection import Collection
from pymongo.errors import OperationFailure
from ..utils import read_rc
from ..defaults import catalog_fingerprints

REGRESSION_BASELINES_LOCATION = '/home/GASpy/gaspy/tests/regression_baselines/gasdb/'

Expand Down Expand Up @@ -241,60 +244,41 @@ def test_get_catalog_docs():


@pytest.mark.baseline
def test_to_create_unprocessed_catalog_docs():
    '''
    Baseline creator: dump the raw (unprocessed) catalog documents to a pickle
    so that `test__pull_catalog_from_mongo` can regress against them.
    '''
    fingerprints = catalog_fingerprints()
    project = {'$project': fingerprints}
    pipeline = [project]
    docs = _pull_catalog_from_mongo(pipeline)

    with open(REGRESSION_BASELINES_LOCATION + 'unprocessed_catalog_documents' + '.pkl', 'wb') as file_handle:
        pickle.dump(docs, file_handle)
    assert True


def test__pull_catalog_from_mongo():
    '''
    Regression test: pulling the catalog with the default fingerprints should
    match the pickled baseline created by
    `test_to_create_unprocessed_catalog_docs`.
    '''
    fingerprints = catalog_fingerprints()
    project = {'$project': fingerprints}
    pipeline = [project]
    docs = _pull_catalog_from_mongo(pipeline)

    with open(REGRESSION_BASELINES_LOCATION + 'unprocessed_catalog_documents' + '.pkl', 'rb') as file_handle:
        expected_docs = pickle.load(file_handle)
    assert docs == expected_docs


@pytest.mark.baseline
@pytest.mark.parametrize('latest_predictions', [True, False])
def test_to_create_catalog_docs_with_predictions(latest_predictions):
    '''
    Baseline creator: dump the catalog documents (with predictions) to a
    pickle, keyed by a hash of the arguments, so that
    `test_get_catalog_docs_with_predictions` can regress against them.
    '''
    docs = get_catalog_docs_with_predictions(latest_predictions)

    arg_hash = hashlib.sha224(str(latest_predictions).encode()).hexdigest()
    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_predictions_%s' % arg_hash + '.pkl'
    with open(file_name, 'wb') as file_handle:
        pickle.dump(docs, file_handle)
    assert True


@pytest.mark.parametrize('latest_predictions', [True, False])
def test_get_catalog_docs_with_predictions(latest_predictions):
    '''
    This could be a "real" test, but I am really busy and don't have time to
    design one. So I'm turning this into a regression test to let someone else
    (probably me) deal with this later.

    If you do fix this, you should probably add more than one day's worth of
    predictions to the unit testing catalog.
    '''
    docs = get_catalog_docs_with_predictions(latest_predictions)

    arg_hash = hashlib.sha224(str(latest_predictions).encode()).hexdigest()
    file_name = REGRESSION_BASELINES_LOCATION + 'catalog_with_predictions_%s' % arg_hash + '.pkl'
    with open(file_name, 'rb') as file_handle:
        expected_docs = pickle.load(file_handle)
    assert docs == expected_docs


@pytest.mark.parametrize('latest_predictions', [True, False])
def test__add_adsorption_energy_predictions_to_fingerprints(latest_predictions):
    '''
    Verify that the helper adds one projection key per adsorbate/model
    combination found in the unit testing catalog collection.
    '''
    default_fingerprints = catalog_fingerprints()
    fingerprints = _add_adsorption_energy_predictions_to_fingerprints(default_fingerprints, latest_predictions)

    # Get ALL of the adsorbates and models in the unit testing collection
    with get_mongo_collection('catalog') as collection:
        cursor = collection.aggregate([{"$sample": {"size": 1}}])
        docs = list(cursor)
    adsorbates = set()
    models = set()
    for doc in docs:
        predictions = doc['predictions']['adsorption_energy']
        new_adsorbates = set(predictions.keys())
        # Bug fix: iterate over THIS document's adsorbates (`new_adsorbates`),
        # not the accumulator (`adsorbates`). The accumulator is still empty
        # on the first document, which left `models` empty and made the
        # assertion loops below a vacuous no-op.
        new_models = set(model for adsorbate in new_adsorbates for model in predictions[adsorbate])
        adsorbates.update(new_adsorbates)
        models.update(new_models)

    # Make sure that every single query is there
    for adsorbate in adsorbates:
        for model in models:
            data_location = 'predictions.adsorption_energy.%s.%s' % (adsorbate, model)
            if latest_predictions:
                assert fingerprints[data_location] == {'$arrayElemAt': ['$'+data_location, -1]}
            else:
                assert fingerprints[data_location] == '$'+data_location


@pytest.mark.parametrize('latest_predictions', [True, False])
def test__add_orr_predictions_to_fingerprints(latest_predictions):
    '''
    Verify that the helper adds one projection key per ORR model found in the
    unit testing catalog collection.
    '''
    fingerprints = _add_orr_predictions_to_fingerprints(catalog_fingerprints(),
                                                        latest_predictions)

    # Gather every model name present in the sampled catalog documents
    with get_mongo_collection('catalog') as collection:
        sampled_docs = list(collection.aggregate([{"$sample": {"size": 1}}]))
    model_names = set()
    for document in sampled_docs:
        model_names |= set(document['predictions']['orr_onset_potential_4e'].keys())

    # Each model should have its own projection entry in the fingerprints
    for model_name in model_names:
        key = 'predictions.orr_onset_potential_4e.%s' % model_name
        if latest_predictions:
            assert fingerprints[key] == {'$arrayElemAt': ['$' + key, -1]}
        else:
            assert fingerprints[key] == '$' + key


@pytest.mark.baseline
@pytest.mark.parametrize('extra_fingerprints', [None, {'user': 'user'}])
def test_to_create_aggregated_surface_documents(extra_fingerprints):
Expand Down Expand Up @@ -585,9 +621,8 @@ def test_get_low_coverage_ml_docs(adsorbates, model_tag):
our adsorption collection has a higher (or equal) adsorption
energy than the one reported by `get_low_coverage_ml_docs`
'''
models = ['model0']
low_coverage_docs = get_low_coverage_ml_docs(adsorbates)
all_docs = get_catalog_docs_with_adsorption_energies(adsorbates, models)
all_docs = get_catalog_docs_with_predictions()

for doc in all_docs:
energy = doc['predictions']['adsorption_energy'][adsorbates[0]][model_tag][1]
Expand Down
Binary file modified gaspy/tests/mongo_test_collections/unit_testing_catalog_docs.pkl
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.