Skip to content
This repository has been archived by the owner on Sep 11, 2023. It is now read-only.

Commit

Permalink
Merge pull request #967 from marscher/optional_output_checking
Browse files Browse the repository at this point in the history
[coordinates] introduced new config var "coordinates_check_output"
marscher authored Oct 21, 2016
2 parents d60d137 + 88c35a4 commit 4308906
Showing 6 changed files with 73 additions and 29 deletions.
36 changes: 21 additions & 15 deletions doc/source/CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -6,9 +6,15 @@ Changelog

**New features**:

- coordinates: for lag < chunksize improved speed (50%) for TICA. #960
- coordinates
- for lag < chunksize improved speed (50%) for TICA. #960
- new config variable "coordinates_check_output" to test for "NaN" and "inf" values in
iterator output for every chunk. The option is disabled by default. When enabled, it helps
to locate, during debugging, where faulty values are introduced into the pipeline. #967


**Fixes**:

- msm: low-level api removed (use msmtools for now, if you really need it). #550,

2.2.6 (9-23-16)
@@ -136,6 +142,7 @@ Service release. Fixes some
considerable high chunk size as well.

**Fixes**:

- In parallel environments (clusters with shared filesystem) there will be no
crashes due to the config module, which tried to write files in users home
directory. Config files are optional by now.
@@ -190,19 +197,18 @@ Service release. Fixes some
(reported as Warnings).

- coordinates:
- Completly re-designed class hierachy (user-code/API unaffected).
- Added trajectory info cache to avoid re-computing lengths, dimensions and
byte offsets of data sets.
- Random access strategies supported (eg. via slices).
- FeatureReader supports random access for XTC and TRR (in conjunction with mdtraj-1.6).
- Re-design API to support scikit-learn interface (fit, transform).
- Pipeline elements (former Transformer class) now uses iterator pattern to
obtain data and therefore supports now pipeline trees.
- pipeline elements support writing their output to csv files.
- TICA/PCA uses covartools to estimate covariance matrices.
- This now saves one pass over the data set.
- Supports sparsification data on the fly.

- Completely re-designed class hierarchy (user-code/API unaffected).
- Added trajectory info cache to avoid re-computing lengths, dimensions and
byte offsets of data sets.
- Random access strategies supported (e.g. via slices).
- FeatureReader supports random access for XTC and TRR (in conjunction with mdtraj-1.6).
- Re-design API to support scikit-learn interface (fit, transform).
- Pipeline elements (former Transformer class) now use the iterator pattern to
obtain data and therefore now support pipeline trees.
- pipeline elements support writing their output to csv files.
- TICA/PCA uses covartools to estimate covariance matrices:
+ This now saves one pass over the data set.
+ Supports sparsification of data on the fly.

**Fixes**:

@@ -342,7 +348,7 @@ reorganization of the code.
- coordinates package: allow metrics to be passed to cluster algorithms.
- coordinates package: cache trajectory lengths by default
(uncached led to 1 pass of reading for non indexed (XTC) formats).
This avoids re-reading e.g XTC files to determine their lengths.
This avoids re-reading e.g. XTC files to determine their lengths.
- coordinates package: enable passing chunk size to readers and pipelines in API.
- coordinates package: assign_to_centers now allows all supported file formats as centers input.
- coordinates package: save_traj(s) now handles stride parameter.
11 changes: 11 additions & 0 deletions pyemma/coordinates/data/_base/datasource.py
Original file line number Diff line number Diff line change
@@ -671,6 +671,14 @@ def next(self):
(not self.return_traj_index and len(X) == 0) or (self.return_traj_index and len(X[1]) == 0)
):
X = self._it_next()
if config.coordinates_check_output:
array = X if not self.return_traj_index else X[1]
if not np.all(np.isfinite(array)):
# determine position
start = self.pos
msg = "Found invalid values in chunk in trajectory index {itraj} at chunk [{start}, {stop}]" \
.format(itraj=self.current_trajindex, start=start, stop=start+len(array))
raise InvalidDataInStreamException(msg)
return X

def __iter__(self):
@@ -683,3 +691,6 @@ def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
return False


class InvalidDataInStreamException(Exception):
    """Data stream contained NaN or (+/-) infinity.

    Raised from the iterator's ``next()`` when ``config.coordinates_check_output``
    is enabled and the current chunk fails the ``np.all(np.isfinite(...))`` check.
    The message names the trajectory index and the chunk range.
    """
21 changes: 21 additions & 0 deletions pyemma/coordinates/tests/test_coordinates_iterator.py
Original file line number Diff line number Diff line change
@@ -2,6 +2,7 @@
import numpy as np

from pyemma.coordinates.data import DataInMemory
from pyemma.util.contexts import settings
from pyemma.util.files import TemporaryDirectory
import os
from glob import glob
@@ -153,5 +154,25 @@ def test_write_to_csv_propagate_filenames(self):
for a, e in zip(actual, expected):
np.testing.assert_allclose(a, e)

def test_invalid_data_in_input_nan(self):
    """Iterating data that contains NaN must raise when output checking is on."""
    from pyemma.coordinates.data._base.datasource import InvalidDataInStreamException

    # Poison the last entry of the first element of self.d.
    self.d[0][-1] = np.nan
    reader = DataInMemory(self.d)
    chunks = reader.iterator()
    # The check is evaluated on every next() call, so enabling the setting
    # only around the iteration is sufficient.
    with settings(coordinates_check_output=True), \
            self.assertRaises(InvalidDataInStreamException):
        for _itraj, _chunk in chunks:
            pass

def test_invalid_data_in_input_inf(self):
    """Iterating data that contains +inf must raise when output checking is on.

    Uses an explicit chunksize of 5 when constructing the reader, in contrast
    to the NaN variant of this test, which uses the reader's default.
    """
    # Poison the last entry of the second element of self.d.
    self.d[1][-1] = np.inf
    r = DataInMemory(self.d, chunksize=5)
    it = r.iterator()
    from pyemma.coordinates.data._base.datasource import InvalidDataInStreamException
    with settings(coordinates_check_output=True):
        # No "as cm" binding: the captured context was never inspected.
        with self.assertRaises(InvalidDataInStreamException):
            for itraj, X in it:
                pass

if __name__ == '__main__':
unittest.main()
20 changes: 7 additions & 13 deletions pyemma/msm/api.py
Original file line number Diff line number Diff line change
@@ -1198,7 +1198,7 @@ def bayesian_hidden_markov_model(dtrajs, nstates, lag, nsamples=100, reversible=
def tpt(msmobj, A, B):
r""" A->B reactive flux from transition path theory (TPT)
The returned :class:`ReactiveFlux <msmtools.flux.ReactiveFlux>` object
The returned :class:`ReactiveFlux <pyemma.msm.models.ReactiveFlux>` object
can be used to extract various quantities of the flux, as well as to
compute A -> B transition pathways, their weights, and to coarse-grain
the flux onto sets of states.
@@ -1214,29 +1214,29 @@ def tpt(msmobj, A, B):
Returns
-------
tptobj : :class:`ReactiveFlux <pyemma.msm.reactive_flux.ReactiveFlux>` object
tptobj : :class:`ReactiveFlux <pyemma.msm.models.ReactiveFlux>` object
An object containing the reactive A->B flux network
and several additional quantities, such as the stationary probability,
committors and set definitions.
See also
--------
:class:`ReactiveFlux <pyemma.msm.reactive_flux.ReactiveFlux>`
:class:`ReactiveFlux <pyemma.msm.models.ReactiveFlux>`
Reactive Flux model
.. autoclass:: pyemma.msm.reactive_flux.ReactiveFlux
.. autoclass:: pyemma.msm.models.ReactiveFlux
:members:
:undoc-members:
.. rubric:: Methods
.. autoautosummary:: pyemma.msm.reactive_flux.ReactiveFlux
.. autoautosummary:: pyemma.msm.models.ReactiveFlux
:methods:
.. rubric:: Attributes
.. autoautosummary:: pyemma.msm.reactive_flux.ReactiveFlux
.. autoautosummary:: pyemma.msm.models.ReactiveFlux
:attributes:
References
@@ -1282,13 +1282,6 @@ def tpt(msmobj, A, B):
By default (False), T is a transition matrix.
If set to True, T is a rate matrix.
Returns
-------
tpt: msmtools.flux.ReactiveFlux object
A python object containing the reactive A->B flux network
and several additional quantities, such as stationary probability,
committors and set definitions.
Notes
-----
The central object used in transition path theory is
@@ -1330,6 +1323,7 @@ def tpt(msmobj, A, B):
raise ValueError('set A or B defines more states, than given transition matrix.')

# forward committor
#msmobj.
qplus = msmana.committor(T, A, B, forward=True)
# backward committor
if msmana.is_reversible(T, mu=mu):
5 changes: 4 additions & 1 deletion pyemma/pyemma.cfg
Original file line number Diff line number Diff line change
@@ -24,4 +24,7 @@ use_trajectory_lengths_cache = True
# maximum entries in database
traj_info_max_entries = 50000
# max size in MB
traj_info_max_size = 500
traj_info_max_size = 500

# check output of iterators in pyemma.coordinates for infinity and NaN, useful for debug purposes.
coordinates_check_output = False
9 changes: 9 additions & 0 deletions pyemma/util/config.py
Original file line number Diff line number Diff line change
@@ -45,6 +45,7 @@
'use_trajectory_lengths_cache',
'traj_info_max_entries',
'traj_info_max_size',
'coordinates_check_output',
)

if six.PY2:
@@ -364,6 +365,14 @@ def show_config_notification(self):
def show_config_notification(self, val):
self._conf_values.set('pyemma', 'show_config_notification', str(val))

@property
def coordinates_check_output(self):
    """Boolean flag: check iterator output in pyemma.coordinates for NaN/inf."""
    section, option = 'pyemma', 'coordinates_check_output'
    return self._conf_values.getboolean(section, option)

@coordinates_check_output.setter
def coordinates_check_output(self, val):
    """Store ``val`` in its ``str()`` form under the 'pyemma' section."""
    as_text = str(val)
    self._conf_values.set('pyemma', 'coordinates_check_output', as_text)

### FILE HANDLING

def __copy_default_files_to_cfg_dir(self, target_dir):

0 comments on commit 4308906

Please sign in to comment.