diff --git a/doc/source/CHANGELOG.rst b/doc/source/CHANGELOG.rst index 43ec54646..dddb91763 100644 --- a/doc/source/CHANGELOG.rst +++ b/doc/source/CHANGELOG.rst @@ -6,9 +6,15 @@ Changelog **New features**: -- coordinates: for lag < chunksize improved speed (50%) for TICA. #960 +- coordinates + - for lag < chunksize improved speed (50%) for TICA. #960 + - new config variable "coordinates_check_output" to test for "NaN" and "inf" values in + iterator output for every chunk. The option is disabled by default. It gives insight + during debugging where faulty values are introduced into the pipeline. #967 + **Fixes**: + - msm: low-level api removed (use msmtools for now, if you really need it). #550, 2.2.6 (9-23-16) @@ -136,6 +142,7 @@ Service release. Fixes some considerable high chunk size as well. **Fixes**: + - In parallel environments (clusters with shared filesystem) there will be no crashes due to the config module, which tried to write files in users home directory. Config files are optional by now. @@ -190,19 +197,18 @@ Service release. Fixes some (reported as Warnings). - coordinates: - - Completly re-designed class hierachy (user-code/API unaffected). - - Added trajectory info cache to avoid re-computing lengths, dimensions and - byte offsets of data sets. - - Random access strategies supported (eg. via slices). - - FeatureReader supports random access for XTC and TRR (in conjunction with mdtraj-1.6). - - Re-design API to support scikit-learn interface (fit, transform). - - Pipeline elements (former Transformer class) now uses iterator pattern to - obtain data and therefore supports now pipeline trees. - - pipeline elements support writing their output to csv files. - - TICA/PCA uses covartools to estimate covariance matrices. - - This now saves one pass over the data set. - - Supports sparsification data on the fly. - + - Completely re-designed class hierarchy (user-code/API unaffected). 
+ - Added trajectory info cache to avoid re-computing lengths, dimensions and + byte offsets of data sets. + - Random access strategies supported (e.g. via slices). + - FeatureReader supports random access for XTC and TRR (in conjunction with mdtraj-1.6). + - Re-design API to support scikit-learn interface (fit, transform). + - Pipeline elements (former Transformer class) now use the iterator pattern to + obtain data and therefore now support pipeline trees. + - pipeline elements support writing their output to csv files. + - TICA/PCA uses covartools to estimate covariance matrices: + + This now saves one pass over the data set. + + Supports sparsification of data on the fly. **Fixes**: @@ -342,7 +348,7 @@ reorganization of the code. - coordinates package: allow metrics to be passed to cluster algorithms. - coordinates package: cache trajectory lengths by default (uncached led to 1 pass of reading for non indexed (XTC) formats). This avoids re-reading e.g XTC files to determine their lengths. + This avoids re-reading e.g. XTC files to determine their lengths. - coordinates package: enable passing chunk size to readers and pipelines in API. - coordinates package: assign_to_centers now allows all supported file formats as centers input. - coordinates package: save_traj(s) now handles stride parameter. 
diff --git a/pyemma/coordinates/data/_base/datasource.py b/pyemma/coordinates/data/_base/datasource.py index b314c1921..6328e18dd 100644 --- a/pyemma/coordinates/data/_base/datasource.py +++ b/pyemma/coordinates/data/_base/datasource.py @@ -671,6 +671,14 @@ def next(self): (not self.return_traj_index and len(X) == 0) or (self.return_traj_index and len(X[1]) == 0) ): X = self._it_next() + if config.coordinates_check_output: + array = X if not self.return_traj_index else X[1] + if not np.all(np.isfinite(array)): + # determine position + start = self.pos + msg = "Found invalid values in chunk in trajectory index {itraj} at chunk [{start}, {stop}]" \ + .format(itraj=self.current_trajindex, start=start, stop=start+len(array)) + raise InvalidDataInStreamException(msg) return X def __iter__(self): @@ -683,3 +691,6 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.close() return False + +class InvalidDataInStreamException(Exception): + """Data stream contained NaN or (+/-) infinity""" diff --git a/pyemma/coordinates/tests/test_coordinates_iterator.py b/pyemma/coordinates/tests/test_coordinates_iterator.py index f9da47ac6..f7916a6f0 100644 --- a/pyemma/coordinates/tests/test_coordinates_iterator.py +++ b/pyemma/coordinates/tests/test_coordinates_iterator.py @@ -2,6 +2,7 @@ import numpy as np from pyemma.coordinates.data import DataInMemory +from pyemma.util.contexts import settings from pyemma.util.files import TemporaryDirectory import os from glob import glob @@ -153,5 +154,25 @@ def test_write_to_csv_propagate_filenames(self): for a, e in zip(actual, expected): np.testing.assert_allclose(a, e) + def test_invalid_data_in_input_nan(self): + self.d[0][-1] = np.nan + r = DataInMemory(self.d) + it = r.iterator() + from pyemma.coordinates.data._base.datasource import InvalidDataInStreamException + with settings(coordinates_check_output=True): + with self.assertRaises(InvalidDataInStreamException): + for itraj, X in it: + pass + + def test_invalid_data_in_input_inf(self): 
+ self.d[1][-1] = np.inf + r = DataInMemory(self.d, chunksize=5) + it = r.iterator() + from pyemma.coordinates.data._base.datasource import InvalidDataInStreamException + with settings(coordinates_check_output=True): + with self.assertRaises(InvalidDataInStreamException) as cm: + for itraj, X in it: + pass + if __name__ == '__main__': unittest.main() diff --git a/pyemma/msm/api.py b/pyemma/msm/api.py index 139d7daf6..18706a0a1 100644 --- a/pyemma/msm/api.py +++ b/pyemma/msm/api.py @@ -1198,7 +1198,7 @@ def bayesian_hidden_markov_model(dtrajs, nstates, lag, nsamples=100, reversible= def tpt(msmobj, A, B): r""" A->B reactive flux from transition path theory (TPT) - The returned :class:`ReactiveFlux ` object + The returned :class:`ReactiveFlux ` object can be used to extract various quantities of the flux, as well as to compute A -> B transition pathways, their weights, and to coarse-grain the flux onto sets of states. @@ -1214,29 +1214,29 @@ def tpt(msmobj, A, B): Returns ------- - tptobj : :class:`ReactiveFlux ` object + tptobj : :class:`ReactiveFlux ` object An object containing the reactive A->B flux network and several additional quantities, such as the stationary probability, committors and set definitions. See also -------- - :class:`ReactiveFlux ` + :class:`ReactiveFlux ` Reactive Flux model - .. autoclass:: pyemma.msm.reactive_flux.ReactiveFlux + .. autoclass:: pyemma.msm.models.ReactiveFlux :members: :undoc-members: .. rubric:: Methods - .. autoautosummary:: pyemma.msm.reactive_flux.ReactiveFlux + .. autoautosummary:: pyemma.msm.models.ReactiveFlux :methods: .. rubric:: Attributes - .. autoautosummary:: pyemma.msm.reactive_flux.ReactiveFlux + .. autoautosummary:: pyemma.msm.models.ReactiveFlux :attributes: References @@ -1282,13 +1282,6 @@ def tpt(msmobj, A, B): By default (False), T is a transition matrix. If set to True, T is a rate matrix. 
- Returns - ------- - tpt: msmtools.flux.ReactiveFlux object - A python object containing the reactive A->B flux network - and several additional quantities, such as stationary probability, - committors and set definitions. - Notes ----- The central object used in transition path theory is @@ -1330,6 +1323,7 @@ def tpt(msmobj, A, B): raise ValueError('set A or B defines more states, than given transition matrix.') # forward committor + #msmobj. qplus = msmana.committor(T, A, B, forward=True) # backward committor if msmana.is_reversible(T, mu=mu): diff --git a/pyemma/pyemma.cfg b/pyemma/pyemma.cfg index 78370d411..1fad6b535 100644 --- a/pyemma/pyemma.cfg +++ b/pyemma/pyemma.cfg @@ -24,4 +24,7 @@ use_trajectory_lengths_cache = True # maximum entries in database traj_info_max_entries = 50000 # max size in MB -traj_info_max_size = 500 \ No newline at end of file +traj_info_max_size = 500 + +# check output of iterators in pyemma.coordinates for infinity and NaN, useful for debug purposes. +coordinates_check_output = False diff --git a/pyemma/util/config.py b/pyemma/util/config.py index e331693b4..e3b9062f3 100644 --- a/pyemma/util/config.py +++ b/pyemma/util/config.py @@ -45,6 +45,7 @@ 'use_trajectory_lengths_cache', 'traj_info_max_entries', 'traj_info_max_size', + 'coordinates_check_output', ) if six.PY2: @@ -364,6 +365,14 @@ def show_config_notification(self): def show_config_notification(self, val): self._conf_values.set('pyemma', 'show_config_notification', str(val)) + @property + def coordinates_check_output(self): + return self._conf_values.getboolean('pyemma', 'coordinates_check_output') + + @coordinates_check_output.setter + def coordinates_check_output(self, val): + self._conf_values.set('pyemma', 'coordinates_check_output', str(val)) + ### FIlE HANDLING def __copy_default_files_to_cfg_dir(self, target_dir):