Merge pull request #967 from marscher/optional_output_checking

[coordinates] introduced new config var "coordinates_check_output"
markovmodel · Oct 21, 2016 · 4308906 · 4308906
2 parents d60d137 + 88c35a4
commit 4308906
Showing 6 changed files with 73 additions and 29 deletions.
diff --git a/doc/source/CHANGELOG.rst b/doc/source/CHANGELOG.rst
@@ -6,9 +6,15 @@ Changelog
 
 **New features**:
 
-- coordinates: for lag < chunksize improved speed (50%) for TICA. #960
+- coordinates
+   - for lag < chunksize improved speed (50%) for TICA. #960
+   - new config variable "coordinates_check_output" to test for "NaN" and "inf" values in
+     iterator output for every chunk. The option is disabled by default. It gives insight
+     during debugging into where faulty values are introduced into the pipeline. #967
+
 
 **Fixes**:
+
 - msm: low-level api removed (use msmtools for now, if you really need it). #550, 
 
 2.2.6 (9-23-16)
@@ -136,6 +142,7 @@ Service release. Fixes some
   considerable high chunk size as well.
 
 **Fixes**:
+
 - In parallel environments (clusters with shared filesystem) there will be no
   crashes due to the config module, which tried to write files in users home
   directory. Config files are optional by now.
@@ -190,19 +197,18 @@ Service release. Fixes some
     (reported as Warnings).
 
 - coordinates:
-  - Completly re-designed class hierachy (user-code/API unaffected).
-  - Added trajectory info cache to avoid re-computing lengths, dimensions and
-    byte offsets of data sets.
-  - Random access strategies supported (eg. via slices).
-  - FeatureReader supports random access for XTC and TRR (in conjunction with mdtraj-1.6).
-  - Re-design API to support scikit-learn interface (fit, transform).
-  - Pipeline elements (former Transformer class) now uses iterator pattern to
-    obtain data and therefore supports now pipeline trees.
-  - pipeline elements support writing their output to csv files.
-  - TICA/PCA uses covartools to estimate covariance matrices.
-    - This now saves one pass over the data set.
-    - Supports sparsification data on the fly.
-
+    - Completely re-designed class hierarchy (user-code/API unaffected).
+    - Added trajectory info cache to avoid re-computing lengths, dimensions and
+      byte offsets of data sets.
+    - Random access strategies supported (e.g. via slices).
+    - FeatureReader supports random access for XTC and TRR (in conjunction with mdtraj-1.6).
+    - Re-design API to support scikit-learn interface (fit, transform).
+    - Pipeline elements (former Transformer class) now use the iterator pattern to
+      obtain data and therefore now support pipeline trees.
+    - pipeline elements support writing their output to csv files.
+    - TICA/PCA uses covartools to estimate covariance matrices:
+        + This now saves one pass over the data set.
+        + Supports sparsification of data on the fly.
 
 **Fixes**:
 
@@ -342,7 +348,7 @@ reorganization of the code.
 - coordinates package: allow metrics to be passed to cluster algorithms.
 - coordinates package: cache trajectory lengths by default
                        (uncached led to 1 pass of reading for non indexed (XTC) formats).
-  This avoids re-reading e.g XTC files to determine their lengths.
+                       This avoids re-reading e.g. XTC files to determine their lengths.
 - coordinates package: enable passing chunk size to readers and pipelines in API.
 - coordinates package: assign_to_centers now allows all supported file formats as centers input.
 - coordinates package: save_traj(s) now handles stride parameter.

diff --git a/pyemma/coordinates/data/_base/datasource.py b/pyemma/coordinates/data/_base/datasource.py
@@ -671,6 +671,14 @@ def next(self):
                 (not self.return_traj_index and len(X) == 0) or (self.return_traj_index and len(X[1]) == 0)
         ):
             X = self._it_next()
+        if config.coordinates_check_output:
+            array = X if not self.return_traj_index else X[1]
+            if not np.all(np.isfinite(array)):
+                # determine position
+                start = self.pos
+                msg = "Found invalid values in chunk in trajectory index {itraj} at chunk [{start}, {stop}]" \
+                    .format(itraj=self.current_trajindex, start=start, stop=start+len(array))
+                raise InvalidDataInStreamException(msg)
         return X
 
     def __iter__(self):
@@ -683,3 +691,6 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
         return False
 
+
+class InvalidDataInStreamException(Exception):
+    """Data stream contained NaN or (+/-) infinity"""
diff --git a/pyemma/coordinates/tests/test_coordinates_iterator.py b/pyemma/coordinates/tests/test_coordinates_iterator.py
@@ -2,6 +2,7 @@
 import numpy as np
 
 from pyemma.coordinates.data import DataInMemory
+from pyemma.util.contexts import settings
 from pyemma.util.files import TemporaryDirectory
 import os
 from glob import glob
@@ -153,5 +154,25 @@ def test_write_to_csv_propagate_filenames(self):
             for a, e in zip(actual, expected):
                 np.testing.assert_allclose(a, e)
 
+    def test_invalid_data_in_input_nan(self):
+        self.d[0][-1] = np.nan
+        r = DataInMemory(self.d)
+        it = r.iterator()
+        from pyemma.coordinates.data._base.datasource import InvalidDataInStreamException
+        with settings(coordinates_check_output=True):
+            with self.assertRaises(InvalidDataInStreamException):
+                for itraj, X in it:
+                    pass
+
+    def test_invalid_data_in_input_inf(self):
+        self.d[1][-1] = np.inf
+        r = DataInMemory(self.d, chunksize=5)
+        it = r.iterator()
+        from pyemma.coordinates.data._base.datasource import InvalidDataInStreamException
+        with settings(coordinates_check_output=True):
+            with self.assertRaises(InvalidDataInStreamException) as cm:
+                for itraj, X in it:
+                    pass
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/pyemma/msm/api.py b/pyemma/msm/api.py
@@ -1198,7 +1198,7 @@ def bayesian_hidden_markov_model(dtrajs, nstates, lag, nsamples=100, reversible=
 def tpt(msmobj, A, B):
     r""" A->B reactive flux from transition path theory (TPT)
 
-    The returned :class:`ReactiveFlux <msmtools.flux.ReactiveFlux>` object
+    The returned :class:`ReactiveFlux <pyemma.msm.models.ReactiveFlux>` object
     can be used to extract various quantities of the flux, as well as to
     compute A -> B transition pathways, their weights, and to coarse-grain
     the flux onto sets of states.
@@ -1214,29 +1214,29 @@ def tpt(msmobj, A, B):
 
     Returns
     -------
-    tptobj : :class:`ReactiveFlux <pyemma.msm.reactive_flux.ReactiveFlux>` object
+    tptobj : :class:`ReactiveFlux <pyemma.msm.models.ReactiveFlux>` object
         An object containing the reactive A->B flux network
         and several additional quantities, such as the stationary probability,
         committors and set definitions.
 
     See also
     --------
-    :class:`ReactiveFlux <pyemma.msm.reactive_flux.ReactiveFlux>`
+    :class:`ReactiveFlux <pyemma.msm.models.ReactiveFlux>`
         Reactive Flux model
 
 
-    .. autoclass:: pyemma.msm.reactive_flux.ReactiveFlux
+    .. autoclass:: pyemma.msm.models.ReactiveFlux
         :members:
         :undoc-members:
 
         .. rubric:: Methods
 
-        .. autoautosummary:: pyemma.msm.reactive_flux.ReactiveFlux
+        .. autoautosummary:: pyemma.msm.models.ReactiveFlux
            :methods:
 
         .. rubric:: Attributes
 
-        .. autoautosummary:: pyemma.msm.reactive_flux.ReactiveFlux
+        .. autoautosummary:: pyemma.msm.models.ReactiveFlux
             :attributes:
 
     References
@@ -1282,13 +1282,6 @@ def tpt(msmobj, A, B):
         By default (False), T is a transition matrix.
         If set to True, T is a rate matrix.
 
-    Returns
-    -------
-    tpt: msmtools.flux.ReactiveFlux object
-        A python object containing the reactive A->B flux network
-        and several additional quantities, such as stationary probability,
-        committors and set definitions.
-
     Notes
     -----
     The central object used in transition path theory is
@@ -1330,6 +1323,7 @@ def tpt(msmobj, A, B):
         raise ValueError('set A or B defines more states, than given transition matrix.')
 
     # forward committor
+    #msmobj.
     qplus = msmana.committor(T, A, B, forward=True)
     # backward committor
     if msmana.is_reversible(T, mu=mu):

diff --git a/pyemma/pyemma.cfg b/pyemma/pyemma.cfg
@@ -24,4 +24,7 @@ use_trajectory_lengths_cache = True
 # maximum entries in database
 traj_info_max_entries = 50000
 # max size in MB
-traj_info_max_size = 500
+traj_info_max_size = 500
+
+# check output of iterators in pyemma.coordinates for infinity and NaN, useful for debug purposes.
+coordinates_check_output = False
diff --git a/pyemma/util/config.py b/pyemma/util/config.py
@@ -45,6 +45,7 @@
            'use_trajectory_lengths_cache',
            'traj_info_max_entries',
            'traj_info_max_size',
+           'coordinates_check_output',
            )
 
 if six.PY2:
@@ -364,6 +365,14 @@ def show_config_notification(self):
     def show_config_notification(self, val):
         self._conf_values.set('pyemma', 'show_config_notification', str(val))
 
+    @property
+    def coordinates_check_output(self):
+        return self._conf_values.getboolean('pyemma', 'coordinates_check_output')
+
+    @coordinates_check_output.setter
+    def coordinates_check_output(self, val):
+        self._conf_values.set('pyemma', 'coordinates_check_output', str(val))
+
     ### FIlE HANDLING
 
     def __copy_default_files_to_cfg_dir(self, target_dir):