Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit 78dae90

Browse files
authored
Merge pull request #1251 from marscher/chunksize_fixes
[coordinates] handle default_chunksize gracefully.
2 parents 38fbb6e + d1f4b31 commit 78dae90

File tree

18 files changed

+166
-65
lines changed

18 files changed

+166
-65
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ build*
1111
*.egg-info
1212
*.so
1313
/temp
14+
/tmp
1415
/target
1516
__pycache__
1617
/pylint/pylint_*.txt

doc/source/CHANGELOG.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Quick fix release to repair chunking in the coordinates package.
99
**Fixes**:
1010

1111
- msm: fix bug in ImpliedTimescales, which happened when an estimation failed for a given lag time. #1248
12-
- coordinates: fixed handling of default chunksize. #1247
12+
- coordinates: fixed handling of default chunksize. #1247, #1251
1313
- base: updated pybind to 2.2.2. #1249
1414

1515

pyemma/_base/progress/reporter/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,9 @@ def _progress_force_finish(self, stage=0, description=None):
193193

194194
pg = self._prog_rep_progressbars[stage]
195195
pg.desc = description
196-
pg.update(int(pg.total - pg.n))
196+
increment = int(pg.total - pg.n)
197+
if increment > 0:
198+
pg.update(increment)
197199
pg.refresh(nolock=True)
198200
pg.close()
199201
self._prog_rep_progressbars.pop(stage, None)

pyemma/coordinates/api.py

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,10 +1012,12 @@ def pca(data=None, dim=-1, var_cutoff=0.95, stride=1, mean=None, skip=0, chunksi
10121012
warnings.warn("provided mean ignored", DeprecationWarning)
10131013

10141014
res = PCA(dim=dim, var_cutoff=var_cutoff, mean=None, skip=skip, stride=stride)
1015+
from pyemma.util.reflection import get_default_args
1016+
cs = _check_old_chunksize_arg(chunksize, get_default_args(pca)['chunksize'], **kwargs)
10151017
if data is not None:
1016-
from pyemma.util.reflection import get_default_args
1017-
cs = _check_old_chunksize_arg(chunksize, get_default_args(pca)['chunksize'], **kwargs)
10181018
res.estimate(data, chunksize=cs)
1019+
else:
1020+
res.chunksize = cs
10191021
return res
10201022

10211023

@@ -1256,6 +1258,8 @@ def tica(data=None, lag=10, dim=-1, var_cutoff=0.95, kinetic_map=True, commute_m
12561258
weights=weights, reversible=reversible, ncov_max=ncov_max)
12571259
if data is not None:
12581260
res.estimate(data, chunksize=cs)
1261+
else:
1262+
res.chunksize = cs
12591263
return res
12601264

12611265

@@ -1267,14 +1271,13 @@ def vamp(data=None, lag=10, dim=None, scaling=None, right=True, ncov_max=float('
12671271
----------
12681272
lag : int
12691273
lag time
1270-
dim : float or int
1274+
dim : float or int, default=None
12711275
Number of dimensions to keep:
12721276
1273-
* if dim is not set all available ranks are kept:
1277+
* if dim is not set (None) all available ranks are kept:
12741278
`n_components == min(n_samples, n_features)`
12751279
* if dim is an integer >= 1, this number specifies the number
1276-
of dimensions to keep. By default this will use the kinetic
1277-
variance.
1280+
of dimensions to keep.
12781281
* if dim is a float with ``0 < dim < 1``, select the number
12791282
of dimensions such that the amount of kinetic variance
12801283
that needs to be explained is greater than the percentage
@@ -1406,6 +1409,8 @@ def vamp(data=None, lag=10, dim=None, scaling=None, right=True, ncov_max=float('
14061409
res = VAMP(lag, dim=dim, scaling=scaling, right=right, skip=skip, ncov_max=ncov_max)
14071410
if data is not None:
14081411
res.estimate(data, stride=stride, chunksize=chunksize)
1412+
else:
1413+
res.chunksize = chunksize
14091414
return res
14101415

14111416

@@ -1502,6 +1507,8 @@ def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_
15021507
weights=weights, stride=stride, skip=skip, ncov_max=ncov_max)
15031508
if data is not None:
15041509
lc.estimate(data, chunksize=chunksize)
1510+
else:
1511+
lc.chunksize = chunksize
15051512
return lc
15061513

15071514

@@ -1552,10 +1559,12 @@ def cluster_mini_batch_kmeans(data=None, k=100, max_iter=10, batch_size=0.2, met
15521559
from pyemma.coordinates.clustering.kmeans import MiniBatchKmeansClustering
15531560
res = MiniBatchKmeansClustering(n_clusters=k, max_iter=max_iter, metric=metric, init_strategy=init_strategy,
15541561
batch_size=batch_size, n_jobs=n_jobs, skip=skip, clustercenters=clustercenters)
1562+
from pyemma.util.reflection import get_default_args
1563+
cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_mini_batch_kmeans)['chunksize'], **kwargs)
15551564
if data is not None:
1556-
from pyemma.util.reflection import get_default_args
1557-
cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_mini_batch_kmeans)['chunksize'], **kwargs)
15581565
res.estimate(data, chunksize=cs)
1566+
else:
1567+
res.chunksize = chunksize
15591568
return res
15601569

15611570

@@ -1687,10 +1696,12 @@ def cluster_kmeans(data=None, k=None, max_iter=10, tolerance=1e-5, stride=1,
16871696
res = KmeansClustering(n_clusters=k, max_iter=max_iter, metric=metric, tolerance=tolerance,
16881697
init_strategy=init_strategy, fixed_seed=fixed_seed, n_jobs=n_jobs, skip=skip,
16891698
keep_data=keep_data, clustercenters=clustercenters, stride=stride)
1699+
from pyemma.util.reflection import get_default_args
1700+
cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_kmeans)['chunksize'], **kwargs)
16901701
if data is not None:
1691-
from pyemma.util.reflection import get_default_args
1692-
cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_kmeans)['chunksize'], **kwargs)
16931702
res.estimate(data, chunksize=cs)
1703+
else:
1704+
res.chunksize = cs
16941705
return res
16951706

16961707

@@ -1764,10 +1775,12 @@ def cluster_uniform_time(data=None, k=None, stride=1, metric='euclidean',
17641775
"""
17651776
from pyemma.coordinates.clustering.uniform_time import UniformTimeClustering
17661777
res = UniformTimeClustering(k, metric=metric, n_jobs=n_jobs, skip=skip, stride=stride)
1778+
from pyemma.util.reflection import get_default_args
1779+
cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_uniform_time)['chunksize'], **kwargs)
17671780
if data is not None:
1768-
from pyemma.util.reflection import get_default_args
1769-
cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_uniform_time)['chunksize'], **kwargs)
17701781
res.estimate(data, chunksize=cs)
1782+
else:
1783+
res.chunksize = cs
17711784
return res
17721785

17731786

@@ -1863,10 +1876,12 @@ def cluster_regspace(data=None, dmin=-1, max_centers=1000, stride=1, metric='euc
18631876
from pyemma.coordinates.clustering.regspace import RegularSpaceClustering as _RegularSpaceClustering
18641877
res = _RegularSpaceClustering(dmin, max_centers=max_centers, metric=metric,
18651878
n_jobs=n_jobs, stride=stride, skip=skip)
1879+
from pyemma.util.reflection import get_default_args
1880+
cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_regspace)['chunksize'], **kwargs)
18661881
if data is not None:
1867-
from pyemma.util.reflection import get_default_args
1868-
cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_regspace)['chunksize'], **kwargs)
18691882
res.estimate(data, chunksize=cs)
1883+
else:
1884+
res.chunksize = cs
18701885
return res
18711886

18721887

@@ -1952,11 +1967,13 @@ def assign_to_centers(data=None, centers=None, stride=1, return_dtrajs=True,
19521967
' or NumPy array or a reader created by source function')
19531968
from pyemma.coordinates.clustering.assign import AssignCenters
19541969
res = AssignCenters(centers, metric=metric, n_jobs=n_jobs, skip=skip, stride=stride)
1970+
from pyemma.util.reflection import get_default_args
1971+
cs = _check_old_chunksize_arg(chunksize, get_default_args(assign_to_centers)['chunksize'], **kwargs)
19551972
if data is not None:
1956-
from pyemma.util.reflection import get_default_args
1957-
cs = _check_old_chunksize_arg(chunksize, get_default_args(assign_to_centers)['chunksize'], **kwargs)
19581973
res.estimate(data, chunksize=cs)
19591974
if return_dtrajs:
19601975
return res.dtrajs
1976+
else:
1977+
res.chunksize = cs
19611978

19621979
return res

pyemma/coordinates/data/_base/datasource.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ def number_of_trajectories(self, stride=None):
222222
n = self.ntraj
223223
return n
224224

225-
def trajectory_length(self, itraj, stride=1, skip=None):
225+
def trajectory_length(self, itraj, stride=1, skip=0):
226226
r"""Returns the length of trajectory of the requested index.
227227
228228
Parameters
@@ -246,7 +246,10 @@ def trajectory_length(self, itraj, stride=1, skip=None):
246246
selection = stride[stride[:, 0] == itraj][:, 0]
247247
return 0 if itraj not in selection else len(selection)
248248
else:
249-
return (self._lengths[itraj] - (0 if skip is None else skip) - 1) // int(stride) + 1
249+
skip = 0 if skip is None else skip
250+
res = (self._lengths[itraj] - skip - 1) // int(stride) + 1
251+
assert res >= 0
252+
return res
250253

251254
def n_chunks(self, chunksize, stride=1, skip=0):
252255
""" how many chunks an iterator of this source will output, starting (e.g. after calling reset())
@@ -658,6 +661,11 @@ def __init__(self, data_source, skip=0, chunk=0, stride=1, return_trajindex=Fals
658661
self.__init_stride(stride)
659662
self._pos = 0
660663
self._last_chunk_in_traj = False
664+
if not isinstance(stride, np.ndarray) and skip > 0:
665+
# skip over the trajectories that are smaller than skip
666+
while self.state.itraj < self._data_source.ntraj \
667+
and self._data_source.trajectory_length(self.state.itraj, self.stride, 0) <= skip:
668+
self.state.itraj += 1
661669
super(DataSourceIterator, self).__init__()
662670

663671
def __init_stride(self, stride):

pyemma/coordinates/data/_base/iterable.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828

2929
class Iterable(six.with_metaclass(ABCMeta, InMemoryMixin, Loggable)):
30+
_FALLBACK_CHUNKSIZE = 1000
3031

3132
def __init__(self, chunksize=None):
3233
super(Iterable, self).__init__()
@@ -41,28 +42,42 @@ def dimension(self):
4142
def ndim(self):
4243
return self.dimension()
4344

45+
@staticmethod
46+
def _compute_default_cs(dim, itemsize, logger=None):
47+
# obtain a human readable memory size from the config, convert it to bytes and calc maximum chunksize.
48+
from pyemma import config
49+
from pyemma.util.units import string_to_bytes
50+
max_bytes = string_to_bytes(config.default_chunksize)
51+
52+
# TODO: consider rounding this to some cache size of CPU? e.g py-cpuinfo can obtain it.
53+
# if one time step is already bigger than max_memory, we set the chunksize to 1.
54+
max_elements = max(1, int(np.floor(max_bytes / (itemsize * dim))))
55+
assert max_elements * dim * itemsize <= max_bytes or max_elements == 1
56+
result = max(1, max_elements // dim)
57+
58+
assert result > 0
59+
if logger is not None:
60+
logger.debug('computed default chunksize to %s'
61+
' to limit memory per chunk to %s', result, config.default_chunksize)
62+
return result
63+
4464
@property
4565
def default_chunksize(self):
46-
""" How much data will be processed at once, in case no chunksize has been provided."""
66+
""" How much data will be processed at once, in case no chunksize has been provided.
67+
68+
Notes
69+
-----
70+
This variable respects your setting for maximum memory in pyemma.config.default_chunksize
71+
"""
4772
if self._default_chunksize is None:
4873
try:
49-
# some overloads of dimension can raise, eg. PCA, TICA
50-
dim = self.dimension()
74+
self.dimension()
75+
self.output_type()
5176
except:
52-
self.logger.info('could not obtain output dimension, defaulting to chunksize=1000')
53-
self._default_chunksize = 1000
77+
self._default_chunksize = Iterable._FALLBACK_CHUNKSIZE
5478
else:
55-
# obtain a human readable memory size from the config, convert it to bytes and calc maximum chunksize.
56-
from pyemma import config
57-
from pyemma.util.units import string_to_bytes
58-
max_bytes = string_to_bytes(config.default_chunksize)
59-
itemsize = np.dtype(self.output_type()).itemsize
60-
# TODO: consider rounding this to some cache size of CPU? e.g py-cpuinfo can obtain it.
61-
# if one time step is already bigger than max_memory, we set the chunksize to 1.
62-
max_elements = max(1, int(np.floor(max_bytes / (itemsize * self.ndim))))
63-
assert max_elements * self.ndim * itemsize <= max_bytes or max_elements == 1
64-
self._default_chunksize = max(1, max_elements // self.ndim)
65-
assert self._default_chunksize > 0, self._default_chunksize
79+
self._default_chunksize = Iterable._compute_default_cs(self.dimension(),
80+
self.output_type()().itemsize, self.logger)
6681
return self._default_chunksize
6782

6883
@property

pyemma/coordinates/data/_base/transformer.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -179,12 +179,15 @@ def chunksize(self):
179179
return self.data_producer.chunksize
180180

181181
@chunksize.setter
182-
def chunksize(self, size):
182+
def chunksize(self, value):
183183
if self.data_producer is None:
184-
if size < 0:
185-
raise ValueError('chunksize has to be positive.')
186-
self._default_chunksize = size
187-
self.data_producer.chunksize = size
184+
if not isinstance(value, (type(None), int)):
185+
raise ValueError('chunksize has to be of type: None or int')
186+
if isinstance(value, int) and value < 0:
187+
raise ValueError("Chunksize of %s was provided, but has to be >= 0" % value)
188+
self._default_chunksize = value
189+
else:
190+
self.data_producer.chunksize = value
188191

189192
def number_of_trajectories(self, stride=1):
190193
return self.data_producer.number_of_trajectories(stride)

pyemma/coordinates/data/feature_reader.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -311,8 +311,9 @@ def __init__(self, data_source, skip=0, chunk=0, stride=1, return_trajindex=Fals
311311
return_trajindex=return_trajindex,
312312
cols=cols
313313
)
314+
# set chunksize prior to selecting the first file, to ensure we have a sane value for mditer...
315+
self.chunksize = chunk
314316
self._selected_itraj = -1
315-
self._select_file(0)
316317

317318
@property
318319
def chunksize(self):
@@ -322,7 +323,7 @@ def chunksize(self):
322323
def chunksize(self, value):
323324
self.state.chunk = value
324325
if hasattr(self, '_mditer'):
325-
self._mditer._chunksize = int(value)
326+
self._mditer.chunksize = value
326327

327328
@property
328329
def skip(self):
@@ -353,6 +354,8 @@ def _next_chunk(self):
353354
354355
:return: a feature mapped vector X, or (X, Y) if lag > 0
355356
"""
357+
if not hasattr(self, '_mditer') or self._mditer is None:
358+
self._select_file(self._itraj)
356359
try:
357360
chunk = next(self._mditer)
358361
except StopIteration as si:
@@ -408,6 +411,10 @@ def _create_mditer(self):
408411
self._closed = False
409412

410413
def _create_patched_iter(self, filename, skip=0, stride=1, atom_indices=None):
411-
return patches.iterload(filename, chunk=self.chunksize, top=self._data_source.featurizer.topology,
414+
if self.is_uniform_stride(self.stride):
415+
flen = self._data_source.trajectory_length(itraj=self._itraj, stride=self.stride, skip=self.skip)
416+
else:
417+
flen = self.ra_trajectory_length(self._itraj)
418+
return patches.iterload(filename, flen, chunk=self.chunksize, top=self._data_source.featurizer.topology,
412419
skip=skip, stride=stride, atom_indices=atom_indices)
413420

pyemma/coordinates/data/sources_merger.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ def _select_file(self, itraj):
9090
self._t = 0
9191
self._itraj = itraj
9292
self._selected_itraj = itraj
93-
if __debug__:
94-
for it in self._iterators:
95-
assert it._itraj == itraj
96-
assert it._selected_itraj == itraj
97-
assert it._t == self._t
93+
for it in self._iterators:
94+
it._select_file(itraj)
95+
assert it._itraj == itraj
96+
assert it._selected_itraj == itraj
97+
assert it._t == self._t

pyemma/coordinates/tests/test_coordinates_iterator.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,19 @@ def test_n_chunks(self):
6262
"returned for stride=%s, lag=%s" % (stride, lag))
6363
assert chunks == it.n_chunks
6464

65+
dd = [np.random.random((100, 3)), np.random.random((120, 3)), np.random.random((120, 3))]
66+
rr = DataInMemory(dd)
67+
68+
# test for lagged iterator
69+
for stride in range(1, 5):
70+
for lag in [x for x in range(0, 18)] + [50, 100]:
71+
it = rr.iterator(lag=lag, chunk=30, stride=stride, return_trajindex=False)
72+
chunks = sum(1 for _ in it)
73+
np.testing.assert_equal(it.n_chunks, chunks,
74+
err_msg="Expected number of chunks did not agree with what the iterator "
75+
"returned for stride=%s, lag=%s" % (stride, lag))
76+
assert chunks == it.n_chunks
77+
6578
def _count_chunks(self, it):
6679
with it:
6780
it.reset()

0 commit comments

Comments
 (0)