diff --git a/doc/source/CHANGELOG.rst b/doc/source/CHANGELOG.rst index e65e07826..c3969aacc 100644 --- a/doc/source/CHANGELOG.rst +++ b/doc/source/CHANGELOG.rst @@ -1,6 +1,22 @@ Changelog ========= +2.3.1 (tba) +----------- + +**New features**: + +- msm: + - ImpliedTimescales: enable insertion/removal of lag times. + Avoid recomputing existing models. #1030 + +**Fixes**: + +- coordinates: + - If Estimators supporting streaming are used directly, restore previous behaviour. #1034 + Note that estimators used directly from the API are not affected. + + 2.3 (6-1-2017) -------------- diff --git a/pyemma/coordinates/clustering/assign.py b/pyemma/coordinates/clustering/assign.py index 84a4ae483..597105ffb 100644 --- a/pyemma/coordinates/clustering/assign.py +++ b/pyemma/coordinates/clustering/assign.py @@ -87,11 +87,12 @@ def describe(self): @AbstractClustering.data_producer.setter def data_producer(self, dp): - # check dimensions - dim = self.clustercenters.shape[1] - if not dim == dp.dimension(): - raise ValueError('cluster centers have wrong dimension. Have dim=%i' - ', but input has %i' % (dim, dp.dimension())) + if dp is not None: + # check dimensions + dim = self.clustercenters.shape[1] + if not dim == dp.dimension(): + raise ValueError('cluster centers have wrong dimension. 
Have dim=%i' + ', but input has %i' % (dim, dp.dimension())) AbstractClustering.data_producer.fset(self, dp) def _estimate(self, iterable, **kw): diff --git a/pyemma/coordinates/data/_base/streaming_estimator.py b/pyemma/coordinates/data/_base/streaming_estimator.py index b400c6997..41ce2efec 100644 --- a/pyemma/coordinates/data/_base/streaming_estimator.py +++ b/pyemma/coordinates/data/_base/streaming_estimator.py @@ -37,13 +37,15 @@ def __init__(self, chunksize=None): self._chunksize = chunksize def estimate(self, X, **kwargs): + # ensure the input is able to provide a stream if not isinstance(X, Iterable): if isinstance(X, np.ndarray) or \ - (isinstance(X, (list, tuple)) and len(X) > 0 and all([isinstance(x, np.ndarray) for x in X])): + (isinstance(X, (list, tuple)) and len(X) > 0 and all((isinstance(x, np.ndarray) for x in X))): X = DataInMemory(X, self.chunksize) else: raise ValueError("no np.ndarray or non-empty list of np.ndarrays given") - + # Because we want to use pipelining methods like get_output, we have to set a data producer. 
+ self.data_producer = X # run estimation try: super(StreamingEstimator, self).estimate(X, **kwargs) diff --git a/pyemma/coordinates/data/_base/transformer.py b/pyemma/coordinates/data/_base/transformer.py index 69944c3fd..efc29f754 100644 --- a/pyemma/coordinates/data/_base/transformer.py +++ b/pyemma/coordinates/data/_base/transformer.py @@ -24,6 +24,7 @@ import six from pyemma._ext.sklearn.base import TransformerMixin from pyemma.coordinates.data._base.datasource import DataSource, DataSourceIterator +from pyemma.coordinates.data._base.iterable import Iterable from pyemma.coordinates.data._base.random_accessible import RandomAccessStrategy from pyemma.coordinates.data._base.streaming_estimator import StreamingEstimator from pyemma.coordinates.util.change_notification import (inform_children_upon_change, @@ -108,6 +109,9 @@ class StreamingTransformer(Transformer, DataSource, NotifyOnChangesMixIn): r""" Basis class for pipelined Transformers. + This class derives from DataSource, so follow up pipeline elements can stream + the output of this class. 
+ Parameters ---------- chunksize : int (optional) @@ -116,24 +120,28 @@ class StreamingTransformer(Transformer, DataSource, NotifyOnChangesMixIn): """ def __init__(self, chunksize=1000): super(StreamingTransformer, self).__init__(chunksize=chunksize) - self._estimated = False - self._data_producer = None + self.data_producer = None + self._Y_source = None @property # overload of DataSource def data_producer(self): + if not hasattr(self, '_data_producer'): + return None return self._data_producer @data_producer.setter @inform_children_upon_change def data_producer(self, dp): - if dp is not self._data_producer: + if dp is not self.data_producer: # first unregister from current dataproducer - if self._data_producer is not None and isinstance(self._data_producer, NotifyOnChangesMixIn): - self._data_producer._stream_unregister_child(self) + if self.data_producer is not None and isinstance(self.data_producer, NotifyOnChangesMixIn): + self.data_producer._stream_unregister_child(self) # then register this instance as a child of the new one. if dp is not None and isinstance(dp, NotifyOnChangesMixIn): dp._stream_register_child(self) + if dp is not None and not isinstance(dp, Iterable): + raise ValueError('can not set data_producer to non-iterable class of type {}'.format(type(dp))) self._data_producer = dp # register random access strategies self._set_random_access_strategies() @@ -215,9 +223,10 @@ def n_frames_total(self, stride=1, skip=0): class StreamingEstimationTransformer(StreamingTransformer, StreamingEstimator): """ Basis class for pipelined Transformers, which perform also estimation. """ - def estimate(self, X, **kwargs): super(StreamingEstimationTransformer, self).estimate(X, **kwargs) + # we perform the mapping to memory exactly here, because a StreamingEstimator on its own + # has no output to be mapped. Only the combination of Estimation/Transforming has this feature. 
if self.in_memory and not self._mapping_to_mem_active: self._map_to_memory() return self diff --git a/pyemma/coordinates/tests/test_cluster.py b/pyemma/coordinates/tests/test_cluster.py index a11abcb26..dcda2a227 100644 --- a/pyemma/coordinates/tests/test_cluster.py +++ b/pyemma/coordinates/tests/test_cluster.py @@ -65,9 +65,6 @@ def setUpClass(cls): cls.rt = coor.cluster_uniform_time(data = cls.X, k = 100) cls.cl = [cls.km, cls.rs, cls.rt] - def setUp(self): - pass - def test_chunksize(self): for c in self.cl: assert types.is_int(c.chunksize) @@ -138,11 +135,6 @@ def test_output_type(self): for c in self.cl: assert c.output_type() == np.int32 - def test_parametrize(self): - for c in self.cl: - # nothing should happen - c.parametrize() - def test_save_dtrajs(self): extension = ".dtraj" outdir = self.dtraj_dir @@ -170,5 +162,38 @@ def test_trajectory_lengths(self): assert c.trajectory_lengths()[0] == c.trajectory_length(0) +class TestClusterDirect(TestCluster): + # perform all the tests of TestCluster, but use Estimator classes directly without API. 
+ @classmethod + def setUpClass(cls): + from pyemma.coordinates.clustering import KmeansClustering, RegularSpaceClustering, UniformTimeClustering + cls.dtraj_dir = tempfile.mkdtemp() + + # generate Gaussian mixture + means = [np.array([-3,0]), + np.array([-1,1]), + np.array([0,0]), + np.array([1,-1]), + np.array([4,2])] + widths = [np.array([0.3,2]), + np.array([0.3,2]), + np.array([0.3,2]), + np.array([0.3,2]), + np.array([0.3,2])] + # continuous trajectory + nsample = 1000 + cls.T = len(means)*nsample + cls.X = np.zeros((cls.T, 2)) + for i in range(len(means)): + cls.X[i*nsample:(i+1)*nsample,0] = widths[i][0] * np.random.randn() + means[i][0] + cls.X[i*nsample:(i+1)*nsample,1] = widths[i][1] * np.random.randn() + means[i][1] + # cluster in different ways + cls.km = KmeansClustering(n_clusters=100).estimate(cls.X) + cls.rs = RegularSpaceClustering(dmin=0.5).estimate(cls.X) + cls.rt = UniformTimeClustering(n_clusters=100).estimate(cls.X) + cls.cl = [cls.km, cls.rs, cls.rt] + return cls + + if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/pyemma/coordinates/tests/test_pca.py b/pyemma/coordinates/tests/test_pca.py index 38e227782..96d4b941c 100644 --- a/pyemma/coordinates/tests/test_pca.py +++ b/pyemma/coordinates/tests/test_pca.py @@ -222,5 +222,13 @@ def test_feature_correlation_data(self): true_corr = np.corrcoef(feature_traj.T, pca_traj.T)[:nfeat,-npcs:] np.testing.assert_allclose(test_corr, true_corr, atol=1.E-8) + def test_pipelining_sklearn_compat(self): + from pyemma.coordinates.transform import PCA + t = PCA(dim=2) + x = np.random.random((20, 3)) + y = t.fit_transform(x) + y2 = t.get_output() + np.testing.assert_allclose(y2[0], y) + if __name__ == "__main__": unittest.main() diff --git a/pyemma/coordinates/tests/test_tica.py b/pyemma/coordinates/tests/test_tica.py index 1a25ade43..1bd0693aa 100644 --- a/pyemma/coordinates/tests/test_tica.py +++ b/pyemma/coordinates/tests/test_tica.py @@ -178,7 +178,15 @@ def 
test_too_short_trajs(self): def test_with_skip(self): data = np.random.random((100, 10)) - tica_obj = api.tica(lag=10, dim=1, skip=1) + tica_obj = api.tica(data, lag=10, dim=1, skip=1) + + def test_pipelining_sklearn_compat(self): + from pyemma.coordinates.transform import TICA + t = TICA(1) + x = np.random.random((20, 3)) + y = t.fit_transform(x) + y2 = t.get_output() + np.testing.assert_allclose(y2[0], y) class TestTICAExtensive(unittest.TestCase):