[msm.tests.test_its]: Added integration tests for ImpliedTimescales w…

…ith various estimators an debugged the used estimator classes.
markovmodel · Jul 9, 2015 · 1f58f8e · 1f58f8e
1 parent b5ef7b8
commit 1f58f8e
Show file tree

Hide file tree

Showing 7 changed files with 193 additions and 125 deletions.
diff --git a/pyemma/msm/api.py b/pyemma/msm/api.py
@@ -62,7 +62,8 @@
 
 
 @shortcut('its')
-def timescales_msm(dtrajs, lags=None, nits=10, reversible=True, connected=True, errors=None, nsamples=50):
+def timescales_msm(dtrajs, lags=None, nits=None, reversible=True, connected=True, errors=None, nsamples=50):
+    # format data
     r""" Calculate implied timescales from Markov state models estimated at a series of lag times.
 
     Parameters
@@ -71,29 +72,34 @@ def timescales_msm(dtrajs, lags=None, nits=10, reversible=True, connected=True,
         discrete trajectories
 
     lags : array-like of integers (optional)
-        integer lag times at which the implied timescales will be
-        calculated
+        integer lag times at which the implied timescales will be calculated
 
     nits : int (optional)
         number of implied timescales to be computed. Will compute less
-        if the number of states are smaller
+        if the number of states are smaller. If None, the number of timescales
+        will be automatically determined.
 
     connected : boolean (optional)
-        If true compute the connected set before transition matrix
-        estimation at each lag separately
+        If true compute the connected set before transition matrix estimation
+        at each lag separately
+
     reversible : boolean (optional)
-        Estimate the transition matrix reversibly (True) or
-        nonreversibly (False)
+        Estimate transition matrix reversibly (True) or nonreversibly (False)
+
     errors : None | 'bayes' | 'bootstrap'
         Specifies whether to compute statistical uncertainties (by default
-        not), an which algorithm to use if yes.
-        Options are 'bayes' for Bayesian sampling of the posterior and
-        'bootstrap' for bootstrapping of the discrete trajectories.
+        not), an which algorithm to use if yes. Options are:
+
+        * 'bayes' for Bayesian sampling of the posterior
+
+        * 'bootstrap' for bootstrapping of the discrete trajectories.
+
         Attention: Computing errors can be *very* slow if the MSM has many
         states. Moreover there are still unsolved theoretical problems, and
-        therefore the uncertainty interval and the maximum likelihood
-        estimator can be inconsistent. Use this as a rough guess for
-        statistical uncertainties.
+        therefore the uncertainty interval and the maximum likelihood estimator
+        can be inconsistent. Use this as a rough guess for statistical
+        uncertainties.
+
     nsamples : int
         The number of approximately independent transition matrix samples
         generated for each lag time for uncertainty quantification.
@@ -161,7 +167,6 @@ def timescales_msm(dtrajs, lags=None, nits=10, reversible=True, connected=True,
      [ 5.13829397  2.59477703]]
 
     """
-    # format data
     dtrajs = _types.ensure_dtraj_list(dtrajs)
 
     if connected:
@@ -173,7 +178,7 @@ def timescales_msm(dtrajs, lags=None, nits=10, reversible=True, connected=True,
     if errors is None:
         estimator = _ML_MSM(reversible=reversible, connectivity=connectivity)
     elif errors == 'bayes':
-        estimator = _Bayes_MSM(reversible=reversible, connectivity=connectivity)
+        estimator = _Bayes_MSM(reversible=reversible, connectivity=connectivity, nsamples=nsamples)
     else:
         raise NotImplementedError('Error estimation method'+errors+'currently not implemented')
 
@@ -464,7 +469,7 @@ def estimate_markov_model(dtrajs, lag, reversible=True, sparse=False, connectivi
     return mlmsm.estimate(dtrajs)
 
 
-def timescales_hmsm(dtrajs, nstates, lags=None, nits=10, reversible=True, connected=True, errors=None, nsamples=100):
+def timescales_hmsm(dtrajs, nstates, lags=None, nits=None, reversible=True, connected=True, errors=None, nsamples=100):
     r""" Calculate implied timescales from Hidden Markov state models estimated at a series of lag times.
 
     Warning: this can be slow!
@@ -478,20 +483,19 @@ def timescales_hmsm(dtrajs, nstates, lags=None, nits=10, reversible=True, connec
         number of hidden states
 
     lags : array-like of integers (optional)
-        integer lag times at which the implied timescales will be
-        calculated
+        integer lag times at which the implied timescales will be calculated
 
     nits : int (optional)
-        number of implied timescales to be computed. Will compute less
-        if the number of states are smaller
+        number of implied timescales to be computed. Will compute less if the
+        number of states are smaller. None means the number of timescales will
+        be determined automatically.
 
     connected : boolean (optional)
         If true compute the connected set before transition matrix
         estimation at each lag separately
 
     reversible : boolean (optional)
-        Estimate the transition matrix reversibly (True) or
-        nonreversibly (False)
+        Estimate transition matrix reversibly (True) or nonreversibly (False)
 
     errors : None | 'bayes'
         Specifies whether to compute statistical uncertainties (by default not),
@@ -500,12 +504,12 @@ def timescales_hmsm(dtrajs, nstates, lags=None, nits=10, reversible=True, connec
         the involved matrices are much smaller.
 
     nsamples : int
-        The number of approximately independent HMSM samples generated for each lag time for uncertainty
-        quantification. Only used if errors is not None.
+        Number of approximately independent HMSM samples generated for each lag
+        time for uncertainty quantification. Only used if errors is not None.
 
     Returns
     -------
-    itsobj : :class:`ImpliedTimescales <pyemma.msm.ui.ImpliedTimescales>` object
+    itsobj : :class:`ImpliedTimescales <pyemma.msm.ImpliedTimescales>` object
 
     See also
     --------

diff --git a/pyemma/msm/estimators/bayesian_hmsm.py b/pyemma/msm/estimators/bayesian_hmsm.py
@@ -3,33 +3,19 @@
 import numpy as _np
 
 from pyemma.util.types import ensure_dtraj_list
-from pyemma.msm.estimators.maximum_likelihood_hmsm import MaximumLikelihoodHMSM as _HMSMEstimator
+from pyemma.msm.estimators.maximum_likelihood_hmsm import MaximumLikelihoodHMSM as _MaximumLikelihoodHMSM
 from pyemma.msm.models.hmsm import HMSM as _HMSM
 from pyemma.msm.estimators.estimated_hmsm import EstimatedHMSM as _EstimatedHMSM
 from pyemma.msm.models.hmsm_sampled import SampledHMSM as _SampledHMSM
 from pyemma.util.units import TimeUnit
 
-def _lag_observations(observations, lag):
-    """ Create new trajectories that are subsampled at lag but shifted
 
-    Given a trajectory (s0, s1, s2, s3, s4, ...) and lag 3, this function will generate 3 trajectories
-    (s0, s3, s6, ...), (s1, s4, s7, ...) and (s2, s5, s8, ...). Use this function in order to parametrize a MLE
-    at lag times larger than 1 without discarding data. Do not use this function for Bayesian estimators, where
-    data must be given such that subsequent transitions are uncorrelated.
-
-    """
-    obsnew = []
-    for obs in observations:
-        for shift in range(0, lag):
-            obsnew.append(obs[shift:][::lag])
-    return obsnew
-
-class BayesianHMSM(_HMSMEstimator, _SampledHMSM):
+class BayesianHMSM(_MaximumLikelihoodHMSM, _SampledHMSM):
     """Estimator for a Bayesian HMSM
 
     """
-    def __init__(self, nstates=2, lag=1, nsamples=100, init_hmsm=None, reversible=True, connectivity='largest',
-                 observe_active=True, dt_traj='1 step', conf=0.95):
+    def __init__(self, nstates=2, lag=1, stride='effective', nsamples=100, init_hmsm=None, reversible=True,
+                 connectivity='largest', observe_active=True, dt_traj='1 step', conf=0.95):
         """
         Parameters
         ----------
@@ -39,6 +25,18 @@ def __init__(self, nstates=2, lag=1, nsamples=100, init_hmsm=None, reversible=Tr
         lag : int, optional, default=1
             lagtime to estimate the HMSM at
 
+        stride : str or int, default=1
+            stride between two lagged trajectories extracted from the input
+            trajectories. Given trajectory s[t], stride and lag will result
+            in trajectories
+                s[0], s[tau], s[2 tau], ...
+                s[stride], s[stride + tau], s[stride + 2 tau], ...
+            Setting stride = 1 will result in using all data (useful for maximum
+            likelihood estimator), while a Bayesian estimator requires a longer
+            stride in order to have statistically uncorrelated trajectories.
+            Setting stride = None 'effective' uses the largest neglected timescale as
+            an estimate for the correlation time and sets the stride accordingly
+
         hmsm : :class:`HMSM <pyemma.msm.ui.hmsm.HMSM>`
             Single-point estimate of HMSM object around which errors will be evaluated
 
@@ -48,6 +46,7 @@ def __init__(self, nstates=2, lag=1, nsamples=100, init_hmsm=None, reversible=Tr
 
         """
         self.lag = lag
+        self.stride = stride
         self.nstates = nstates
         self.nsamples = nsamples
         self.init_hmsm = init_hmsm
@@ -69,13 +68,14 @@ def _estimate(self, dtrajs):
         """
         # ensure right format
         dtrajs = ensure_dtraj_list(dtrajs)
+
         # if no initial MSM is given, estimate it now
         if self.init_hmsm is None:
             # estimate with store_data=True, because we need an EstimatedHMSM
-            hmsm_estimator = _HMSMEstimator(lag=self.lag, nstates=self.nstates, reversible=self.reversible,
-                                            connectivity=self.connectivity, observe_active=self.observe_active,
-                                            dt_traj=self.dt_traj)
-            init_hmsm = hmsm_estimator.estimate(dtrajs)
+            hmsm_estimator = _MaximumLikelihoodHMSM(lag=self.lag, stride=self.stride, nstates=self.nstates,
+                                            reversible=self.reversible, connectivity=self.connectivity,
+                                            observe_active=self.observe_active, dt_traj=self.dt_traj)
+            init_hmsm = hmsm_estimator.estimate(dtrajs)  # estimate with lagged trajectories
         else:
             # check input
             assert isinstance(self.init_hmsm, _EstimatedHMSM), 'hmsm must be of type EstimatedHMSM'
@@ -96,8 +96,8 @@ def _estimate(self, dtrajs):
         # HMM sampler
         from bhmm import discrete_hmm, bayesian_hmm
         hmm_mle = discrete_hmm(init_hmsm.transition_matrix, pobs, stationary=True, reversible=self.reversible)
-        dtrajs_lagged = _lag_observations(dtrajs, self.lag)  # rewrite discrete traj to lagged observations
-        sampled_hmm = bayesian_hmm(dtrajs_lagged, hmm_mle, nsample=self.nsamples,
+        # using the lagged discrete trajectories that have been found in the MLHMM
+        sampled_hmm = bayesian_hmm(init_hmsm.discrete_trajectories_lagged, hmm_mle, nsample=self.nsamples,
                                    transition_matrix_prior='init-connect')
 
         # Samples

diff --git a/pyemma/msm/estimators/estimated_hmsm.py b/pyemma/msm/estimators/estimated_hmsm.py
@@ -41,13 +41,14 @@
 
 class EstimatedHMSM(_HMSM):
 
-    def __init__(self, dtrajs_full, dt_model, lagtime, nstates_obs, observable_set, dtrajs_obs,
+    def __init__(self, dtrajs_full, dtrajs_lagged, dt_model, lagtime, nstates_obs, observable_set, dtrajs_obs,
                  transition_matrix, observation_probabilities):
         _HMSM.__init__(self, transition_matrix, observation_probabilities, dt_model=dt_model)
         self.lag = lagtime
         self._nstates_obs = nstates_obs
         self._observable_set = observable_set
         self._dtrajs_full = dtrajs_full
+        self._dtrajs_lagged = dtrajs_lagged
         self._dtrajs_obs = dtrajs_obs
 
     @property
@@ -77,6 +78,15 @@ def discrete_trajectories_full(self):
         """
         return self._dtrajs_full
 
+    @property
+    @shortcut('dtrajs_lagged')
+    def discrete_trajectories_lagged(self):
+        """
+        Transformed original trajectories that are used as an input into the HMM estimation
+
+        """
+        return self._dtrajs_lagged
+
     @property
     @shortcut('dtrajs_obs')
     def discrete_trajectories_obs(self):

diff --git a/pyemma/msm/estimators/implied_timescales.py b/pyemma/msm/estimators/implied_timescales.py
@@ -75,17 +75,18 @@ class ImpliedTimescales(Estimator):
     lags = None : array-like with integers
         integer lag times at which the implied timescales will be calculated
 
-    nits = 10 : int
+    nits = None : int
         maximum number of implied timescales to be computed and stored. If less
         timescales are available, nits will be set to a smaller value during
-        estimation
+        estimation. None means the number of timescales will be automatically
+        determined.
 
     failfast = False : boolean
         if True, will raise an error as soon as not all requested timescales can be computed at all requested
         lagtimes. If False, will continue with a warning and compute the timescales/lagtimes that are possible.
 
     """
-    def __init__(self, estimator, lags=None, nits=10, failfast=False):
+    def __init__(self, estimator, lags=None, nits=None, failfast=False):
         # initialize
         self.estimator = get_estimator(estimator)
         self.nits = nits
@@ -143,6 +144,8 @@ def _estimate(self, data):
 
         # how many finity timescales do we really have?
         maxnts = max([len(ts[np.isfinite(ts)]) for ts in timescales])
+        if self.nits is None:
+            self.nits = maxnts
         if maxnts < self.nits:
             self.nits = maxnts
             self.logger.warn('Changed user setting nits to the number of available timescales nits=' + str(self.nits))
@@ -286,9 +289,9 @@ def get_sample_mean(self, process=None):
                                ' try calling bootstrap() before')
         # OK, go:
         if process is None:
-            return np.mean(self._its_samples[self._successful_lag_indexes, :, :], axis=0)
+            return np.mean(self._its_samples[:, self._successful_lag_indexes, :], axis=0)
         else:
-            return np.mean(self._its_samples[self._successful_lag_indexes, :, process], axis=0)
+            return np.mean(self._its_samples[:, self._successful_lag_indexes, process], axis=0)
 
     @property
     def sample_std(self):
@@ -330,9 +333,9 @@ def get_sample_std(self, process=None):
                                ' try calling bootstrap() before')
         # OK, go:
         if process is None:
-            return np.std(self._its_samples[self._successful_lag_indexes, :, :], axis=0)
+            return np.std(self._its_samples[:, self._successful_lag_indexes, :], axis=0)
         else:
-            return np.std(self._its_samples[self._successful_lag_indexes, :, process], axis=0)
+            return np.std(self._its_samples[:, self._successful_lag_indexes, process], axis=0)
 
     def get_sample_conf(self, conf=0.95, process=None):
         r"""Returns the confidence interval that contains alpha % of the sample data
@@ -358,9 +361,9 @@ def get_sample_conf(self, conf=0.95, process=None):
                                ' try calling bootstrap() before')
         # OK, go:
         if process is None:
-            return confidence_interval(self._its_samples[self._successful_lag_indexes, :, :], conf=conf)
+            return confidence_interval(self._its_samples[:, self._successful_lag_indexes, :], conf=conf)
         else:
-            return confidence_interval(self._its_samples[self._successful_lag_indexes, :, process], conf=conf)
+            return confidence_interval(self._its_samples[:, self._successful_lag_indexes, process], conf=conf)
 
     @property
     def estimators(self):
@@ -390,9 +393,7 @@ def fraction_of_frames(self):
         set. Hence, the output is not necessarily the **active** fraction. For that, use the
         :py:func:`EstimatedMSM.active_count_fraction` function of the :py:class:`EstimatedMSM` class object.
         """
-
         # TODO : implement fraction_of_active_frames
-
         # Are we computing this for the first time?
         if not hasattr(self,'_fraction'):
             self._fraction = np.zeros_like(self.lagtimes, dtype='float32')