Skip to content
This repository has been archived by the owner on Sep 11, 2023. It is now read-only.

Commit

Permalink
[msm.tests.test_its]: Added integration tests for ImpliedTimescales w…
Browse files Browse the repository at this point in the history
…ith various estimators an debugged the used estimator classes.
  • Loading branch information
franknoe committed Jul 9, 2015
1 parent b5ef7b8 commit 1f58f8e
Show file tree
Hide file tree
Showing 7 changed files with 193 additions and 125 deletions.
56 changes: 30 additions & 26 deletions pyemma/msm/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@


@shortcut('its')
def timescales_msm(dtrajs, lags=None, nits=10, reversible=True, connected=True, errors=None, nsamples=50):
def timescales_msm(dtrajs, lags=None, nits=None, reversible=True, connected=True, errors=None, nsamples=50):
# format data
r""" Calculate implied timescales from Markov state models estimated at a series of lag times.
Parameters
Expand All @@ -71,29 +72,34 @@ def timescales_msm(dtrajs, lags=None, nits=10, reversible=True, connected=True,
discrete trajectories
lags : array-like of integers (optional)
integer lag times at which the implied timescales will be
calculated
integer lag times at which the implied timescales will be calculated
nits : int (optional)
number of implied timescales to be computed. Will compute less
if the number of states are smaller
if the number of states are smaller. If None, the number of timescales
will be automatically determined.
connected : boolean (optional)
If true compute the connected set before transition matrix
estimation at each lag separately
If true compute the connected set before transition matrix estimation
at each lag separately
reversible : boolean (optional)
Estimate the transition matrix reversibly (True) or
nonreversibly (False)
Estimate transition matrix reversibly (True) or nonreversibly (False)
errors : None | 'bayes' | 'bootstrap'
Specifies whether to compute statistical uncertainties (by default
not), an which algorithm to use if yes.
Options are 'bayes' for Bayesian sampling of the posterior and
'bootstrap' for bootstrapping of the discrete trajectories.
not), an which algorithm to use if yes. Options are:
* 'bayes' for Bayesian sampling of the posterior
* 'bootstrap' for bootstrapping of the discrete trajectories.
Attention: Computing errors can be *very* slow if the MSM has many
states. Moreover there are still unsolved theoretical problems, and
therefore the uncertainty interval and the maximum likelihood
estimator can be inconsistent. Use this as a rough guess for
statistical uncertainties.
therefore the uncertainty interval and the maximum likelihood estimator
can be inconsistent. Use this as a rough guess for statistical
uncertainties.
nsamples : int
The number of approximately independent transition matrix samples
generated for each lag time for uncertainty quantification.
Expand Down Expand Up @@ -161,7 +167,6 @@ def timescales_msm(dtrajs, lags=None, nits=10, reversible=True, connected=True,
[ 5.13829397 2.59477703]]
"""
# format data
dtrajs = _types.ensure_dtraj_list(dtrajs)

if connected:
Expand All @@ -173,7 +178,7 @@ def timescales_msm(dtrajs, lags=None, nits=10, reversible=True, connected=True,
if errors is None:
estimator = _ML_MSM(reversible=reversible, connectivity=connectivity)
elif errors == 'bayes':
estimator = _Bayes_MSM(reversible=reversible, connectivity=connectivity)
estimator = _Bayes_MSM(reversible=reversible, connectivity=connectivity, nsamples=nsamples)
else:
raise NotImplementedError('Error estimation method'+errors+'currently not implemented')

Expand Down Expand Up @@ -464,7 +469,7 @@ def estimate_markov_model(dtrajs, lag, reversible=True, sparse=False, connectivi
return mlmsm.estimate(dtrajs)


def timescales_hmsm(dtrajs, nstates, lags=None, nits=10, reversible=True, connected=True, errors=None, nsamples=100):
def timescales_hmsm(dtrajs, nstates, lags=None, nits=None, reversible=True, connected=True, errors=None, nsamples=100):
r""" Calculate implied timescales from Hidden Markov state models estimated at a series of lag times.
Warning: this can be slow!
Expand All @@ -478,20 +483,19 @@ def timescales_hmsm(dtrajs, nstates, lags=None, nits=10, reversible=True, connec
number of hidden states
lags : array-like of integers (optional)
integer lag times at which the implied timescales will be
calculated
integer lag times at which the implied timescales will be calculated
nits : int (optional)
number of implied timescales to be computed. Will compute less
if the number of states are smaller
number of implied timescales to be computed. Will compute less if the
number of states are smaller. None means the number of timescales will
be determined automatically.
connected : boolean (optional)
If true compute the connected set before transition matrix
estimation at each lag separately
reversible : boolean (optional)
Estimate the transition matrix reversibly (True) or
nonreversibly (False)
Estimate transition matrix reversibly (True) or nonreversibly (False)
errors : None | 'bayes'
Specifies whether to compute statistical uncertainties (by default not),
Expand All @@ -500,12 +504,12 @@ def timescales_hmsm(dtrajs, nstates, lags=None, nits=10, reversible=True, connec
the involved matrices are much smaller.
nsamples : int
The number of approximately independent HMSM samples generated for each lag time for uncertainty
quantification. Only used if errors is not None.
Number of approximately independent HMSM samples generated for each lag
time for uncertainty quantification. Only used if errors is not None.
Returns
-------
itsobj : :class:`ImpliedTimescales <pyemma.msm.ui.ImpliedTimescales>` object
itsobj : :class:`ImpliedTimescales <pyemma.msm.ImpliedTimescales>` object
See also
--------
Expand Down
48 changes: 24 additions & 24 deletions pyemma/msm/estimators/bayesian_hmsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,19 @@
import numpy as _np

from pyemma.util.types import ensure_dtraj_list
from pyemma.msm.estimators.maximum_likelihood_hmsm import MaximumLikelihoodHMSM as _HMSMEstimator
from pyemma.msm.estimators.maximum_likelihood_hmsm import MaximumLikelihoodHMSM as _MaximumLikelihoodHMSM
from pyemma.msm.models.hmsm import HMSM as _HMSM
from pyemma.msm.estimators.estimated_hmsm import EstimatedHMSM as _EstimatedHMSM
from pyemma.msm.models.hmsm_sampled import SampledHMSM as _SampledHMSM
from pyemma.util.units import TimeUnit

def _lag_observations(observations, lag):
""" Create new trajectories that are subsampled at lag but shifted

Given a trajectory (s0, s1, s2, s3, s4, ...) and lag 3, this function will generate 3 trajectories
(s0, s3, s6, ...), (s1, s4, s7, ...) and (s2, s5, s8, ...). Use this function in order to parametrize a MLE
at lag times larger than 1 without discarding data. Do not use this function for Bayesian estimators, where
data must be given such that subsequent transitions are uncorrelated.
"""
obsnew = []
for obs in observations:
for shift in range(0, lag):
obsnew.append(obs[shift:][::lag])
return obsnew

class BayesianHMSM(_HMSMEstimator, _SampledHMSM):
class BayesianHMSM(_MaximumLikelihoodHMSM, _SampledHMSM):
"""Estimator for a Bayesian HMSM
"""
def __init__(self, nstates=2, lag=1, nsamples=100, init_hmsm=None, reversible=True, connectivity='largest',
observe_active=True, dt_traj='1 step', conf=0.95):
def __init__(self, nstates=2, lag=1, stride='effective', nsamples=100, init_hmsm=None, reversible=True,
connectivity='largest', observe_active=True, dt_traj='1 step', conf=0.95):
"""
Parameters
----------
Expand All @@ -39,6 +25,18 @@ def __init__(self, nstates=2, lag=1, nsamples=100, init_hmsm=None, reversible=Tr
lag : int, optional, default=1
lagtime to estimate the HMSM at
stride : str or int, default=1
stride between two lagged trajectories extracted from the input
trajectories. Given trajectory s[t], stride and lag will result
in trajectories
s[0], s[tau], s[2 tau], ...
s[stride], s[stride + tau], s[stride + 2 tau], ...
Setting stride = 1 will result in using all data (useful for maximum
likelihood estimator), while a Bayesian estimator requires a longer
stride in order to have statistically uncorrelated trajectories.
Setting stride = None 'effective' uses the largest neglected timescale as
an estimate for the correlation time and sets the stride accordingly
hmsm : :class:`HMSM <pyemma.msm.ui.hmsm.HMSM>`
Single-point estimate of HMSM object around which errors will be evaluated
Expand All @@ -48,6 +46,7 @@ def __init__(self, nstates=2, lag=1, nsamples=100, init_hmsm=None, reversible=Tr
"""
self.lag = lag
self.stride = stride
self.nstates = nstates
self.nsamples = nsamples
self.init_hmsm = init_hmsm
Expand All @@ -69,13 +68,14 @@ def _estimate(self, dtrajs):
"""
# ensure right format
dtrajs = ensure_dtraj_list(dtrajs)

# if no initial MSM is given, estimate it now
if self.init_hmsm is None:
# estimate with store_data=True, because we need an EstimatedHMSM
hmsm_estimator = _HMSMEstimator(lag=self.lag, nstates=self.nstates, reversible=self.reversible,
connectivity=self.connectivity, observe_active=self.observe_active,
dt_traj=self.dt_traj)
init_hmsm = hmsm_estimator.estimate(dtrajs)
hmsm_estimator = _MaximumLikelihoodHMSM(lag=self.lag, stride=self.stride, nstates=self.nstates,
reversible=self.reversible, connectivity=self.connectivity,
observe_active=self.observe_active, dt_traj=self.dt_traj)
init_hmsm = hmsm_estimator.estimate(dtrajs) # estimate with lagged trajectories
else:
# check input
assert isinstance(self.init_hmsm, _EstimatedHMSM), 'hmsm must be of type EstimatedHMSM'
Expand All @@ -96,8 +96,8 @@ def _estimate(self, dtrajs):
# HMM sampler
from bhmm import discrete_hmm, bayesian_hmm
hmm_mle = discrete_hmm(init_hmsm.transition_matrix, pobs, stationary=True, reversible=self.reversible)
dtrajs_lagged = _lag_observations(dtrajs, self.lag) # rewrite discrete traj to lagged observations
sampled_hmm = bayesian_hmm(dtrajs_lagged, hmm_mle, nsample=self.nsamples,
# using the lagged discrete trajectories that have been found in the MLHMM
sampled_hmm = bayesian_hmm(init_hmsm.discrete_trajectories_lagged, hmm_mle, nsample=self.nsamples,
transition_matrix_prior='init-connect')

# Samples
Expand Down
12 changes: 11 additions & 1 deletion pyemma/msm/estimators/estimated_hmsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,14 @@

class EstimatedHMSM(_HMSM):

def __init__(self, dtrajs_full, dt_model, lagtime, nstates_obs, observable_set, dtrajs_obs,
def __init__(self, dtrajs_full, dtrajs_lagged, dt_model, lagtime, nstates_obs, observable_set, dtrajs_obs,
transition_matrix, observation_probabilities):
_HMSM.__init__(self, transition_matrix, observation_probabilities, dt_model=dt_model)
self.lag = lagtime
self._nstates_obs = nstates_obs
self._observable_set = observable_set
self._dtrajs_full = dtrajs_full
self._dtrajs_lagged = dtrajs_lagged
self._dtrajs_obs = dtrajs_obs

@property
Expand Down Expand Up @@ -77,6 +78,15 @@ def discrete_trajectories_full(self):
"""
return self._dtrajs_full

@property
@shortcut('dtrajs_lagged')
def discrete_trajectories_lagged(self):
"""
Transformed original trajectories that are used as an input into the HMM estimation
"""
return self._dtrajs_lagged

@property
@shortcut('dtrajs_obs')
def discrete_trajectories_obs(self):
Expand Down
23 changes: 12 additions & 11 deletions pyemma/msm/estimators/implied_timescales.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,17 +75,18 @@ class ImpliedTimescales(Estimator):
lags = None : array-like with integers
integer lag times at which the implied timescales will be calculated
nits = 10 : int
nits = None : int
maximum number of implied timescales to be computed and stored. If less
timescales are available, nits will be set to a smaller value during
estimation
estimation. None means the number of timescales will be automatically
determined.
failfast = False : boolean
if True, will raise an error as soon as not all requested timescales can be computed at all requested
lagtimes. If False, will continue with a warning and compute the timescales/lagtimes that are possible.
"""
def __init__(self, estimator, lags=None, nits=10, failfast=False):
def __init__(self, estimator, lags=None, nits=None, failfast=False):
# initialize
self.estimator = get_estimator(estimator)
self.nits = nits
Expand Down Expand Up @@ -143,6 +144,8 @@ def _estimate(self, data):

# how many finity timescales do we really have?
maxnts = max([len(ts[np.isfinite(ts)]) for ts in timescales])
if self.nits is None:
self.nits = maxnts
if maxnts < self.nits:
self.nits = maxnts
self.logger.warn('Changed user setting nits to the number of available timescales nits=' + str(self.nits))
Expand Down Expand Up @@ -286,9 +289,9 @@ def get_sample_mean(self, process=None):
' try calling bootstrap() before')
# OK, go:
if process is None:
return np.mean(self._its_samples[self._successful_lag_indexes, :, :], axis=0)
return np.mean(self._its_samples[:, self._successful_lag_indexes, :], axis=0)
else:
return np.mean(self._its_samples[self._successful_lag_indexes, :, process], axis=0)
return np.mean(self._its_samples[:, self._successful_lag_indexes, process], axis=0)

@property
def sample_std(self):
Expand Down Expand Up @@ -330,9 +333,9 @@ def get_sample_std(self, process=None):
' try calling bootstrap() before')
# OK, go:
if process is None:
return np.std(self._its_samples[self._successful_lag_indexes, :, :], axis=0)
return np.std(self._its_samples[:, self._successful_lag_indexes, :], axis=0)
else:
return np.std(self._its_samples[self._successful_lag_indexes, :, process], axis=0)
return np.std(self._its_samples[:, self._successful_lag_indexes, process], axis=0)

def get_sample_conf(self, conf=0.95, process=None):
r"""Returns the confidence interval that contains alpha % of the sample data
Expand All @@ -358,9 +361,9 @@ def get_sample_conf(self, conf=0.95, process=None):
' try calling bootstrap() before')
# OK, go:
if process is None:
return confidence_interval(self._its_samples[self._successful_lag_indexes, :, :], conf=conf)
return confidence_interval(self._its_samples[:, self._successful_lag_indexes, :], conf=conf)
else:
return confidence_interval(self._its_samples[self._successful_lag_indexes, :, process], conf=conf)
return confidence_interval(self._its_samples[:, self._successful_lag_indexes, process], conf=conf)

@property
def estimators(self):
Expand Down Expand Up @@ -390,9 +393,7 @@ def fraction_of_frames(self):
set. Hence, the output is not necessarily the **active** fraction. For that, use the
:py:func:`EstimatedMSM.active_count_fraction` function of the :py:class:`EstimatedMSM` class object.
"""

# TODO : implement fraction_of_active_frames

# Are we computing this for the first time?
if not hasattr(self,'_fraction'):
self._fraction = np.zeros_like(self.lagtimes, dtype='float32')
Expand Down
Loading

0 comments on commit 1f58f8e

Please sign in to comment.