Commit 3172ec7
Fixed BayesianHMSM and tests
franknoe authored and marscher committed Feb 18, 2016
1 parent c96b2b9
Showing 11 changed files with 942 additions and 118 deletions.
1 change: 0 additions & 1 deletion pyemma/msm/__init__.py
@@ -99,7 +99,6 @@
 from .estimators import MaximumLikelihoodMSM, BayesianMSM
 from .estimators import MaximumLikelihoodHMSM, BayesianHMSM
 from .estimators import ImpliedTimescales
-from .estimators import EstimatedMSM, EstimatedHMSM
 from .estimators import ChapmanKolmogorovValidator

 from .models import MSM, HMSM, SampledMSM, SampledHMSM
31 changes: 24 additions & 7 deletions pyemma/msm/api.py
@@ -707,7 +707,7 @@ def bayesian_markov_model(dtrajs, lag, reversible=True, statdist=None,

 def timescales_hmsm(dtrajs, nstates, lags=None, nits=None, reversible=True, stationary=False,
                     connectivity=None, mincount_connectivity='1/n', separate=None, errors=None, nsamples=100,
-                    n_jobs=1, show_progress=True):
+                    stride=None, n_jobs=1, show_progress=True):
     r""" Calculate implied timescales from Hidden Markov state models estimated at a series of lag times.

     Warning: this can be slow!
@@ -831,12 +831,16 @@ def timescales_hmsm(dtrajs, nstates, lags=None, nits=None, reversible=True, stat

     # MLE or error estimation?
     if errors is None:
+        if stride is None:
+            stride = 1
         estimator = _ML_HMSM(nstates=nstates, reversible=reversible, stationary=stationary, connectivity=connectivity,
-                             mincount_connectivity=mincount_connectivity, separate=separate)
+                             stride=stride, mincount_connectivity=mincount_connectivity, separate=separate)
     elif errors == 'bayes':
+        if stride is None:
+            stride = 'effective'
         estimator = _Bayes_HMSM(nstates=nstates, reversible=reversible, stationary=stationary,
                                 connectivity=connectivity, mincount_connectivity=mincount_connectivity,
-                                separate=separate, show_progress=show_progress, nsamples=nsamples)
+                                stride=stride, separate=separate, show_progress=show_progress, nsamples=nsamples)
     else:
         raise NotImplementedError('Error estimation method ' + str(errors) + ' currently not implemented')

@@ -849,7 +853,7 @@ def timescales_hmsm(dtrajs, nstates, lags=None, nits=None, reversible=True, stat

 def estimate_hidden_markov_model(dtrajs, nstates, lag, reversible=True, stationary=False,
                                  connectivity=None, mincount_connectivity='1/n', separate=None, observe_nonempty=True,
-                                 dt_traj='1 step', accuracy=1e-3, maxit=1000):
+                                 stride=1, dt_traj='1 step', accuracy=1e-3, maxit=1000):
     r""" Estimates a Hidden Markov state model from discrete trajectories

     Returns a :class:`MaximumLikelihoodHMSM` that contains a transition
@@ -1027,14 +1031,15 @@ def estimate_hidden_markov_model(dtrajs, nstates, lag, reversible=True, stationa
     # initialize HMSM estimator
     hmsm_estimator = _ML_HMSM(lag=lag, nstates=nstates, reversible=reversible, msm_init='largest-strong',
                               connectivity=connectivity, mincount_connectivity=mincount_connectivity, separate=separate,
-                              observe_nonempty=observe_nonempty, dt_traj=dt_traj, accuracy=accuracy, maxit=maxit)
+                              observe_nonempty=observe_nonempty, stride=stride, dt_traj=dt_traj,
+                              accuracy=accuracy, maxit=maxit)
     # run estimation
     return hmsm_estimator.estimate(dtrajs)


 def bayesian_hidden_markov_model(dtrajs, nstates, lag, nsamples=100, reversible=True, stationary=False,
                                  connectivity=None, mincount_connectivity='1/n', separate=None, observe_nonempty=True,
-                                 conf=0.95, dt_traj='1 step', store_hidden=False, show_progress=True):
+                                 stride='effective', conf=0.95, dt_traj='1 step', store_hidden=False, show_progress=True):
     r""" Bayesian Hidden Markov model estimate using Gibbs sampling of the posterior

     Returns a :class:`BayesianHMSM` that contains
@@ -1085,6 +1090,18 @@ def bayesian_hidden_markov_model(dtrajs, nstates, lag, nsamples=100, reversible=
         at least one observation in the lagged input trajectories.
     nsamples : int, optional, default=100
         number of transition matrix samples to compute and store
+    stride : str or int, default='effective'
+        stride between two lagged trajectories extracted from the input
+        trajectories. Given trajectory s[t], stride and lag will result
+        in trajectories
+            s[0], s[tau], s[2 tau], ...
+            s[stride], s[stride + tau], s[stride + 2 tau], ...
+        Setting stride = 1 will result in using all data (useful for the
+        maximum likelihood estimator), while a Bayesian estimator requires
+        a longer stride in order to have statistically uncorrelated
+        trajectories. Setting stride = 'effective' uses the largest
+        neglected timescale as an estimate for the correlation time and
+        sets the stride accordingly.
     conf : float, optional, default=0.95
         size of confidence intervals
     dt_traj : str, optional, default='1 step'
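To make the stride semantics documented above concrete, here is a minimal sketch assuming a toy trajectory; lagged_trajectories is a hypothetical helper for illustration, not part of the pyemma API.

import numpy as np

def lagged_trajectories(s, lag, stride):
    # offsets 0, stride, 2*stride, ... < lag; each subtrajectory steps by lag,
    # reproducing s[0], s[tau], s[2 tau], ... and s[stride], s[stride + tau], ...
    return [np.asarray(s[off::lag]) for off in range(0, lag, stride)]

s = np.arange(12)  # toy discrete trajectory
print(lagged_trajectories(s, lag=4, stride=2))
# [array([0, 4, 8]), array([ 2,  6, 10])]; stride=1 would use every offset, i.e. all data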
@@ -1176,7 +1193,7 @@ def bayesian_hidden_markov_model(dtrajs, nstates, lag, nsamples=100, reversible=
         measurement uncertainty. arXiv:1108.1430 (2011)
     """
-    bhmsm_estimator = _Bayes_HMSM(lag=lag, nstates=nstates, nsamples=nsamples, reversible=reversible,
+    bhmsm_estimator = _Bayes_HMSM(lag=lag, nstates=nstates, stride=stride, nsamples=nsamples, reversible=reversible,
                                   connectivity=connectivity, mincount_connectivity=mincount_connectivity,
                                   separate=separate, observe_nonempty=observe_nonempty,
                                   dt_traj=dt_traj, conf=conf, store_hidden=store_hidden, show_progress=show_progress)
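A usage sketch of the new stride parameter across this API; the discrete trajectory is made up, and the defaults follow the signatures in the diff above (stride=1 for the ML estimator, 'effective' for the Bayesian one, and None in timescales_hmsm resolving to one of the two depending on errors).

import pyemma.msm as msm

dtrajs = [[0, 0, 1, 2, 2, 1, 0, 1, 2, 2, 0, 1]]  # toy discrete trajectory

# maximum-likelihood HMM: stride=1 by default, i.e. use all of the data
hmm = msm.estimate_hidden_markov_model(dtrajs, nstates=2, lag=2)

# Bayesian HMM: stride='effective' by default, decorrelating the input
bhmm = msm.bayesian_hidden_markov_model(dtrajs, nstates=2, lag=2, nsamples=50)

# implied timescales: stride=None resolves to 1 (MLE) or 'effective' (errors='bayes')
its = msm.timescales_hmsm(dtrajs, nstates=2, lags=[1, 2], errors='bayes', stride=None)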
5 changes: 1 addition & 4 deletions pyemma/msm/estimators/__init__.py
@@ -24,7 +24,4 @@
 from .maximum_likelihood_hmsm import MaximumLikelihoodHMSM
 from .bayesian_hmsm import BayesianHMSM
 from .implied_timescales import ImpliedTimescales
-from .lagged_model_validators import ChapmanKolmogorovValidator, EigenvalueDecayValidator#
-
-from .estimated_msm import EstimatedMSM
-from .estimated_hmsm import EstimatedHMSM
+from .lagged_model_validators import ChapmanKolmogorovValidator, EigenvalueDecayValidator
150 changes: 88 additions & 62 deletions pyemma/msm/estimators/bayesian_hmsm.py
@@ -23,10 +23,9 @@
 from pyemma.util.types import ensure_dtraj_list
 from pyemma.msm.estimators.maximum_likelihood_hmsm import MaximumLikelihoodHMSM as _MaximumLikelihoodHMSM
 from pyemma.msm.models.hmsm import HMSM as _HMSM
-from pyemma.msm.estimators.estimated_hmsm import EstimatedHMSM as _EstimatedHMSM
 from pyemma.msm.models.hmsm_sampled import SampledHMSM as _SampledHMSM
-from pyemma.util.units import TimeUnit
 from pyemma._base.progress import ProgressReporter
+from pyemma.util.units import TimeUnit

 __author__ = 'noe'
@@ -125,23 +124,17 @@ def __init__(self, nstates=2, lag=1, stride='effective',
         J. Chem. Phys. 143, 174101 (2015).

         """
-        self.lag = lag
-        self.stride = stride
-        self.nstates = nstates
+        super(BayesianHMSM, self).__init__(nstates=nstates, lag=lag, stride=stride,
+                                           reversible=reversible, stationary=stationary,
+                                           connectivity=connectivity, mincount_connectivity=mincount_connectivity,
+                                           observe_nonempty=observe_nonempty, separate=separate,
+                                           dt_traj=dt_traj)
         self.p0_prior = p0_prior
         self.transition_matrix_prior = transition_matrix_prior
         self.nsamples = nsamples
+        if init_hmsm is not None:
+            assert issubclass(init_hmsm.__class__, _MaximumLikelihoodHMSM), 'hmsm must be of type MaximumLikelihoodHMSM'
         self.init_hmsm = init_hmsm
-        self.reversible = reversible
-        self.stationary = stationary
-        self.connectivity = connectivity
-        if mincount_connectivity == '1/n':
-            mincount_connectivity = 1.0/float(nstates)
-        self.mincount_connectivity = mincount_connectivity
-        self.separate = separate
-        self.observe_nonempty = observe_nonempty
-        self.dt_traj = dt_traj
-        self.timestep_traj = TimeUnit(dt_traj)
         self.conf = conf
         self.store_hidden = store_hidden
         self.show_progress = show_progress
@@ -158,38 +151,66 @@ def _estimate(self, dtrajs):
         # ensure right format
         dtrajs = ensure_dtraj_list(dtrajs)

-        # if no initial MSM is given, estimate it now
-        if self.init_hmsm is None:
-            # estimate with store_data=True, because we need an EstimatedHMSM
-            # connective=None (always estimate full)
-            hmsm_estimator = _MaximumLikelihoodHMSM(lag=self.lag, stride=self.stride, nstates=self.nstates,
-                                                    reversible=self.reversible, stationary=self.stationary,
-                                                    connectivity=None, mincount_connectivity=self.mincount_connectivity,
-                                                    separate=self.separate, observe_nonempty=self.observe_nonempty,
-                                                    dt_traj=self.dt_traj)
-            init_hmsm = hmsm_estimator.estimate(dtrajs)  # estimate with lagged trajectories
-            self.nstates = init_hmsm.nstates  # might have changed due to connectivity
-        else:
-            # check input
-            assert isinstance(self.init_hmsm, _EstimatedHMSM), 'hmsm must be of type EstimatedHMSM'
-            init_hmsm = self.init_hmsm
-            self.nstates = init_hmsm.nstates
-            self.reversible = init_hmsm.is_reversible
+        if self.init_hmsm is None:  # estimate using maximum-likelihood superclass
+            # memorize the observation state for bhmm and reset
+            # TODO: more elegant solution is to set Estimator params only temporarily in estimate(X, **kwargs)
+            default_connectivity = self.connectivity
+            default_mincount_connectivity = self.mincount_connectivity
+            default_observe_nonempty = self.observe_nonempty
+            self.connectivity = None
+            self.observe_nonempty = False
+            self.mincount_connectivity = 0
+            self.accuracy = 1e-2  # this is sufficient for an initial guess
+            super(BayesianHMSM, self)._estimate(dtrajs)
+            self.connectivity = default_connectivity
+            self.mincount_connectivity = default_mincount_connectivity
+            self.observe_nonempty = default_observe_nonempty
+        else:  # if given another initialization, must copy its attributes
+            # TODO: this is too tedious - need to automatize parameter+result copying between estimators.
+            self.nstates = self.init_hmsm.nstates
+            self.reversible = self.init_hmsm.is_reversible
+            self.stationary = self.init_hmsm.stationary
+            # trajectories
+            self._dtrajs_full = self.init_hmsm._dtrajs_full
+            self._dtrajs_lagged = self.init_hmsm._dtrajs_lagged
+            self._observable_set = self.init_hmsm._observable_set
+            self._dtrajs_obs = self.init_hmsm._dtrajs_obs
+            # MLE estimation results
+            self.likelihoods = self.init_hmsm.likelihoods  # Likelihood history
+            self.likelihood = self.init_hmsm.likelihood
+            self.hidden_state_probabilities = self.init_hmsm.hidden_state_probabilities  # gamma variables
+            self.hidden_state_trajectories = self.init_hmsm.hidden_state_trajectories  # Viterbi path
+            self.count_matrix = self.init_hmsm.count_matrix  # hidden count matrix
+            self.initial_count = self.init_hmsm.initial_count  # hidden init count
+            self.initial_distribution = self.init_hmsm.initial_distribution
+            self._active_set = self.init_hmsm._active_set
+            # update HMM Model
+            self.update_model_params(P=self.init_hmsm.transition_matrix, pobs=self.init_hmsm.observation_probabilities,
+                                     dt_model=TimeUnit(self.dt_traj).get_scaled(self.lag))

+        # check if we have a valid initial model
+        import msmtools.estimation as msmest
+        if self.reversible and not msmest.is_connected(self.count_matrix):
+            raise NotImplementedError('Encountered disconnected count matrix:\n ' + str(self.count_matrix)
+                                      + ' with reversible Bayesian HMM sampler using lag=' + str(self.lag)
+                                      + ' and stride=' + str(self.stride) + '. Consider using shorter lag, '
+                                      + 'or shorter stride (to use more of the data), '
+                                      + 'or using a lower value for mincount_connectivity.')

         # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
         # Bayesian HMM sampler. This is just an initialization.
-        import msmtools.estimation as msmest
         nstates_full = msmest.number_of_states(dtrajs)
-        if init_hmsm.nstates_obs < nstates_full:
+        if self.nstates_obs < nstates_full:
             eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
             # full state space output matrix. make sure there are no zero columns
-            pobs = eps * _np.ones((self.nstates, nstates_full), dtype=_np.float64)
+            B_init = eps * _np.ones((self.nstates, nstates_full), dtype=_np.float64)
             # fill active states
-            pobs[:, init_hmsm.observable_set] = _np.maximum(eps, init_hmsm.observation_probabilities)
+            B_init[:, self.observable_set] = _np.maximum(eps, self.observation_probabilities)
             # renormalize B to make it row-stochastic
-            pobs /= pobs.sum(axis=1)[:, None]
+            B_init /= B_init.sum(axis=1)[:, None]
         else:
-            pobs = init_hmsm.observation_probabilities
+            B_init = self.observation_probabilities

         # HMM sampler
         if self.show_progress:
@@ -201,9 +222,9 @@ def call_back():
             call_back = None

         from bhmm import discrete_hmm, bayesian_hmm
-        hmm_mle = discrete_hmm(init_hmsm.initial_distribution, init_hmsm.transition_matrix, pobs)
+        hmm_mle = discrete_hmm(self.initial_distribution, self.transition_matrix, B_init)

-        sampled_hmm = bayesian_hmm(init_hmsm.discrete_trajectories_lagged, hmm_mle, nsample=self.nsamples,
+        sampled_hmm = bayesian_hmm(self.discrete_trajectories_lagged, hmm_mle, nsample=self.nsamples,
                                    reversible=self.reversible, stationary=self.stationary,
                                    p0_prior=self.p0_prior, transition_matrix_prior=self.transition_matrix_prior,
                                    store_hidden=self.store_hidden, call_back=call_back)
@@ -217,33 +238,38 @@ def call_back():
         sample_pobs = [sampled_hmm.sampled_hmms[i].output_model.output_probabilities for i in range(self.nsamples)]
         samples = []
         for i in range(self.nsamples):  # restrict to observable set if necessary
-            Bobs = sample_pobs[i][:, init_hmsm.observable_set]
+            Bobs = sample_pobs[i][:, self.observable_set]
             sample_pobs[i] = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
-            samples.append(_HMSM(sample_Ps[i], sample_pobs[i], pi=sample_pis[i], dt_model=init_hmsm.dt_model))
+            samples.append(_HMSM(sample_Ps[i], sample_pobs[i], pi=sample_pis[i], dt_model=self.dt_model))

-        # store hidden trajectories
+        # store results
         self.sampled_trajs = [sampled_hmm.sampled_hmms[i].hidden_state_trajectories for i in range(self.nsamples)]
+        self.update_model_params(samples=samples)

-        # parametrize self
-        self._dtrajs_full = dtrajs
-        self._dtrajs_lagged = init_hmsm._dtrajs_lagged
-        self._observable_set = init_hmsm._observable_set
-        self._dtrajs_obs = init_hmsm._dtrajs_obs
-
-        # get estimation parameters
-        self.likelihoods = init_hmsm.likelihoods  # Likelihood history
-        self.likelihood = init_hmsm.likelihood
-        self.hidden_state_probabilities = init_hmsm.hidden_state_probabilities  # gamma variables
-        self.hidden_state_trajectories = init_hmsm.hidden_state_trajectories  # Viterbi path
-        self.count_matrix = init_hmsm.count_matrix  # hidden count matrix
-        self.initial_count = init_hmsm.initial_count  # hidden init count
-        self.initial_distribution = init_hmsm.initial_distribution
-        self._active_set = init_hmsm._active_set
-
-        self.set_model_params(samples=samples, P=init_hmsm.transition_matrix, pobs=init_hmsm.observation_probabilities,
-                              dt_model=init_hmsm.dt_model)
+        # deal with connectivity
+        states_subset = None
+        if self.connectivity == 'largest':
+            states_subset = 'largest-strong'
+        elif self.connectivity == 'populous':
+            states_subset = 'populous-strong'
+        # OBSERVATION SET
+        if self.observe_nonempty:
+            observe_subset = 'nonempty'
+        else:
+            observe_subset = None

+        # return submodel (will return self if all None)
-        return self.submodel(states=init_hmsm.active_set,
-                             obs=init_hmsm.observable_set,
+        return self.submodel(states=states_subset, obs=observe_subset,
                              mincount_connectivity=self.mincount_connectivity)

+    def submodel(self, states=None, obs=None, mincount_connectivity='1/n'):
+        # call submodel on MaximumLikelihoodHMSM
+        _MaximumLikelihoodHMSM.submodel(self, states=states, obs=obs, mincount_connectivity=mincount_connectivity)
+        # if samples set, also reduce them
+        if hasattr(self, 'samples'):
+            if self.samples is not None:
+                subsamples = [sample.submodel(states=self.active_set, obs=self.observable_set)
+                              for sample in self.samples]
+                self.update_model_params(samples=subsamples)
+        # return
+        return self
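The output-matrix "blow-up" in _estimate can be checked numerically. A minimal sketch with made-up numbers, mirroring the eps padding and row renormalization of B_init above:

import numpy as np

nstates, nstates_full = 2, 4
eps = 0.01 / nstates_full  # small default output probability, avoids zero columns
B = np.array([[0.7, 0.3],
              [0.2, 0.8]])  # observation probabilities on the active set
observable_set = np.array([1, 3])  # toy indices of the observed states

B_init = eps * np.ones((nstates, nstates_full))
B_init[:, observable_set] = np.maximum(eps, B)  # fill active columns
B_init /= B_init.sum(axis=1)[:, None]  # renormalize: rows sum to 1 again
print(B_init.sum(axis=1))  # [1. 1.]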
7 changes: 5 additions & 2 deletions pyemma/msm/estimators/estimated_msm.py
@@ -327,11 +327,14 @@ def trajectory_weights(self):
         # compute stationary distribution, expanded to full set
         statdist_full = np.zeros([self._nstates_full])
         statdist_full[self.active_set] = self.stationary_distribution
+        # histogram observed states
+        import msmtools.dtraj as msmtraj
+        hist = 1.0 * msmtraj.count_states(self.discrete_trajectories_full)
         # simply read off stationary distribution and accumulate total weight
         W = []
         wtot = 0.0
         for dtraj in self.discrete_trajectories_full:
-            w = statdist_full[dtraj]
+            w = statdist_full[dtraj] / hist[dtraj]
             W.append(w)
             wtot += np.sum(w)
         # normalize
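The corrected weight is w[t] = pi(s[t]) / hist(s[t]): each frame carries its state's stationary probability divided by the number of times that state was observed, so the weights of all frames sum to 1 before the final normalization. A toy check with made-up numbers (np.bincount stands in for msmtraj.count_states):

import numpy as np

statdist_full = np.array([0.5, 0.3, 0.2])  # stationary distribution on the full set
dtrajs = [np.array([0, 0, 1, 2]), np.array([1, 2, 2, 0])]
hist = 1.0 * np.bincount(np.concatenate(dtrajs))  # observed-state histogram

W = [statdist_full[d] / hist[d] for d in dtrajs]
print(sum(w.sum() for w in W))  # 1.0, up to floating point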
@@ -544,4 +547,4 @@ def coarse_grain(self, ncoarse, method='hmm'):
         assert _types.is_int(self.nstates) and ncoarse > 1 and ncoarse <= self.nstates, \
             'nstates must be an int in [2,msmobj.nstates]'

-        return self.hmm(ncoarse)
\ No newline at end of file
+        return self.hmm(ncoarse)
2 changes: 1 addition & 1 deletion pyemma/msm/estimators/implied_timescales.py
@@ -143,7 +143,7 @@ def _estimate(self, data):
             self.estimator.show_progress = False

         # run estimation on all lag times
-        self._models, self._estimators = estimate_param_scan(self.estimator, data, param_sets, failfast=True,
+        self._models, self._estimators = estimate_param_scan(self.estimator, data, param_sets, failfast=False,
                                                              return_estimators=True, n_jobs=self.n_jobs,
                                                              progress_reporter=self)
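Switching failfast to False lets the lag-time scan tolerate individual failures (for example a disconnected HMM at a long lag) instead of aborting the whole scan. A minimal sketch of that semantics, not pyemma's actual estimate_param_scan implementation:

def estimate_param_scan_sketch(make_estimator, data, lags, failfast=False):
    models = []
    for lag in lags:
        try:
            models.append(make_estimator(lag).estimate(data))
        except Exception:
            if failfast:
                raise  # abort on the first error
            models.append(None)  # record the failure and continue the scan
    return models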
