From 7b53e47a82c84eb09900edf7c028e56622971f7d Mon Sep 17 00:00:00 2001 From: "Martin K. Scherer" Date: Thu, 20 Oct 2016 16:27:36 +0200 Subject: [PATCH 1/5] [patches] also scale box_lengths with distance_unit Fixes #968, because it produces different unitcell_lengths for xtc and dcd (factor 10). --- pyemma/coordinates/util/patches.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyemma/coordinates/util/patches.py b/pyemma/coordinates/util/patches.py index 904043e73..48aea188f 100644 --- a/pyemma/coordinates/util/patches.py +++ b/pyemma/coordinates/util/patches.py @@ -327,6 +327,7 @@ def _read_traj_data(atom_indices, f, n_frames, **kwargs): #raise NotImplementedError("format read function not handled..." + str(f)) box = in_units_of(box, f.distance_unit, Trajectory._distance_unit, inplace=True) + cell_lengths = in_units_of(cell_lengths, f.distance_unit, Trajectory._distance_unit, inplace=True) return TrajData(xyz, cell_lengths, cell_angles, box) From 7bc9dd3b09dcc0b7a54900cfc2f4d8ab5ed2a13c Mon Sep 17 00:00:00 2001 From: "Martin K. Scherer" Date: Thu, 20 Oct 2016 16:30:08 +0200 Subject: [PATCH 2/5] [save_trajs] Fix index checking of frames, which failed if ntraj>max(frame_i) The error was to obtain the maximum on a 2d array (itraj, frame_i) and not only frame indices. - added test to check for correct error message (by regex). - reset the return_traj_obj to initial state. - also removed obsolete binding for frame_from_file. Fixes #958 --- .../coordinates/data/util/frames_from_file.py | 134 ++++++------------ .../tests/test_frames_from_file.py | 12 ++ 2 files changed, 54 insertions(+), 92 deletions(-) diff --git a/pyemma/coordinates/data/util/frames_from_file.py b/pyemma/coordinates/data/util/frames_from_file.py index eda57d757..0201802f7 100644 --- a/pyemma/coordinates/data/util/frames_from_file.py +++ b/pyemma/coordinates/data/util/frames_from_file.py @@ -20,7 +20,6 @@ import itertools from logging import getLogger -import mdtraj as md import numpy as np from pyemma.coordinates import source @@ -29,7 +28,6 @@ from pyemma.coordinates.data.util.reader_utils import (copy_traj_attributes as _copy_traj_attributes, preallocate_empty_trajectory as _preallocate_empty_trajectory, enforce_top as _enforce_top) -from pyemma.util.annotators import deprecated __all__ = ['frames_from_files'] @@ -53,6 +51,7 @@ def frames_from_files(files, top, frames, chunksize=1000, stride=1, verbose=Fals # Enforce topology to be a md.Topology object if reader is None: top = _enforce_top(top) + reader_given = False else: if not reader.number_of_trajectories(): raise ValueError("need at least one trajectory file in reader.") @@ -62,6 +61,7 @@ def frames_from_files(files, top, frames, chunksize=1000, stride=1, verbose=Fals top = reader.featurizer.topology else: raise ValueError("unsupported reader (only md readers).") + reader_given = True stride = int(stride) frames = np.array(frames) @@ -74,7 +74,7 @@ def frames_from_files(files, top, frames, chunksize=1000, stride=1, verbose=Fals frames = np.insert(np.atleast_2d(frames), 0, np.zeros_like(frames), axis=0).T if stride != 1: - frames[:, 1] *= int(stride) + frames[:, 1] *= stride if verbose: log.info('A stride value of = %u was parsed, ' 'interpreting "indexes" accordingly.' % stride) @@ -100,97 +100,47 @@ def frames_from_files(files, top, frames, chunksize=1000, stride=1, verbose=Fals # sanity check of indices for itraj in inds_to_check: - inds_by_traj = sorted_inds[sorted_inds[:, 0] == itraj] + inds_by_traj = sorted_inds[sorted_inds[:, 0] == itraj][:, 1] + assert inds_by_traj.ndim == 1 largest_ind_in_traj = np.max(inds_by_traj) length = reader.trajectory_length(itraj) - if length < largest_ind_in_traj: - raise ValueError("largest specified index (%i * stride=%i * %i=%i) " - "is larger than trajectory length '%s' = %i" % - (largest_ind_in_traj/stride, largest_ind_in_traj/stride, - stride, largest_ind_in_traj, reader.filenames[itraj], - length)) + if largest_ind_in_traj >= length: + raise ValueError("largest specified index ({largest_without_stride} * stride=" + "{largest_without_stride} * {stride}={largest}) " + "is larger than trajectory length '{filename}' = {length}".format( + largest_without_stride=largest_ind_in_traj / stride, + stride=stride, + largest=largest_ind_in_traj, + filename=reader.filenames[itraj], + length=length)) + + def set_reader_return_traj_objects(reader, flag): + if isinstance(reader, FeatureReader): + reader._return_traj_obj = flag + elif isinstance(reader, FragmentedTrajectoryReader): + for file in reader.filenames_flat: + r = reader.reader_by_filename(file) + if isinstance(r, FeatureReader): + r = [r] + for _r in r: + _r._return_traj_obj = flag # we want the FeatureReader to return mdtraj.Trajectory objects - if isinstance(reader, FeatureReader): - reader._return_traj_obj = True - elif isinstance(reader, FragmentedTrajectoryReader): - for file in reader.filenames_flat: - r = reader.reader_by_filename(file) - if isinstance(r, FeatureReader): - r = [r] - for _r in r: - _r._return_traj_obj = True - - it = reader.iterator(chunk=chunksize, stride=sorted_inds, return_trajindex=False) - collected_frames = [] - with it: - for x in it: - collected_frames.append(x) - - dest = _preallocate_empty_trajectory(top, len(frames)) - i = 0 - for chunk in collected_frames: - _copy_traj_attributes(dest, chunk, i) - i += len(chunk) - dest = dest.slice(sort_inds.argsort(), copy=False) + set_reader_return_traj_objects(reader, True) + + try: + it = reader.iterator(chunk=chunksize, stride=sorted_inds, return_trajindex=False) + with it: + collected_frames = [f for f in it] + dest = _preallocate_empty_trajectory(top, len(frames)) + t = 0 + for chunk in collected_frames: + _copy_traj_attributes(dest, chunk, t) + t += len(chunk) + # reshuffle the indices of the final trajectory object to obtain the desired order + dest = dest.slice(sort_inds.argsort(), copy=False) + finally: + # in any case we want to reset the reader to its previous state (return features, instead of md.Trajectory) + if reader_given: + set_reader_return_traj_objects(reader, False) return dest - - -@deprecated("use_frames_from_files") -def frames_from_file(file_name, top, frames, chunksize=100, - stride=1, verbose=False, copy_not_join=False): - r"""Reads one "file_name" molecular trajectory and returns an mdtraj trajectory object - containing only the specified "frames" in the specified order. - - Extracts the specified sequence of time/trajectory indexes from the input loader - and saves it in a molecular dynamics trajectory. The output format will be determined - by the outfile name. - - Parameters - ---------- - file_name: str. - Absolute path to the molecular trajectory file, ex. trajout.xtc - - top : str, mdtraj.Trajectory, or mdtraj.Topology - Topology information to load the molecular trajectroy file in :py:obj:`file_name` - - frames : ndarray of shape (n_frames, ) and integer type - Contains the frame indices to be retrieved from "file_name". There is no restriction as to what - this array has to look like other than: - - positive integers - - <= the total number of frames in "file_name". - "frames" need not be monotonous or unique, i.e, arrays like - [3, 1, 4, 1, 5, 9, 9, 9, 9, 3000, 0, 0, 1] are welcome - - verbose: boolean. - Level of verbosity while looking for "frames". Useful when using "chunksize" with large trajectories. - It provides the no. of frames accumulated for every chunk. - - stride : integer, default is 1 - This parameter informs :py:func:`save_traj` about the stride used in :py:obj:`indexes`. Typically, :py:obj:`indexes` - contains frame-indexes that match exactly the frames of the files contained in :py:obj:`traj_inp.trajfiles`. - However, in certain situations, that might not be the case. Examples are cases in which a stride value != 1 - was used when reading/featurizing/transforming/discretizing the files contained in :py:obj:`traj_inp.trajfiles`. - - copy_not_join : boolean, default is False - This parameter decides how geometry objects are appended onto one another. If left to False, mdtraj's own - :py:obj:`join` method will be used, which is the recommended method. However, for some combinations of - py:obj:`chunksizes` and :py:obj:`frames` this might be not very effective. If one sets :py:obj:`copy_not_join` - to True, the returned :py:obj:`traj` is preallocated and the important attributes (currently traj.xyz, traj.time, - traj.unit_lengths, traj.unit_angles) are broadcasted onto it. - - - Returns - ------- - traj : an md trajectory object containing the frames specified in "frames", - in the order specified in "frames". - """ - - assert isinstance(frames, np.ndarray), "input frames frames must be a numpy ndarray, got %s instead "%type(frames) - assert np.ndim(frames) == 1, "input frames frames must have ndim = 1, got np.ndim = %u instead "%np.ndim(frames) - assert isinstance(file_name, str), "input file_name must be a string, got %s instead"%type(file_name) - assert isinstance(top, (str, md.Trajectory, md.Topology)), "input topology must of one of type: " \ - "str, mdtraj.Trajectory, or mdtraj.Topology. " \ - "Got %s instead" % type(top) - - return frames_from_files([file_name], top, frames, chunksize, stride, verbose) diff --git a/pyemma/coordinates/tests/test_frames_from_file.py b/pyemma/coordinates/tests/test_frames_from_file.py index ee1e974c0..a02ac2607 100644 --- a/pyemma/coordinates/tests/test_frames_from_file.py +++ b/pyemma/coordinates/tests/test_frames_from_file.py @@ -32,6 +32,7 @@ from numpy.random import randint from numpy import floor, allclose +import numpy as np import mdtraj as md from pyemma.coordinates.data.util.frames_from_file import frames_from_files as _frames_from_file @@ -151,6 +152,17 @@ def test_gets_the_right_frames_with_stride_with_copy(self): assert allclose(traj_test.unitcell_lengths, traj_ref.unitcell_lengths) assert allclose(traj_test.unitcell_angles, traj_ref.unitcell_angles) + def test_trajs_larger_than_frame_index(self): + """ file list is larger than largest traj file """ + from pyemma.coordinates.tests.util import create_traj, get_top + files = [create_traj(length=10)[0] for _ in range(20)] + inds = np.vstack((np.arange(20), np.arange(20))).T + + with self.assertRaises(ValueError) as cm: + _frames_from_file(files, top=get_top(), frames=inds) + import re + matches = re.fullmatch(".*10\).*is larger than trajectory length.*\= 10", cm.exception.args[0]) + assert matches if __name__ == "__main__": unittest.main() From 7553d592d004b1f1a22264424b9feb54d20a01d4 Mon Sep 17 00:00:00 2001 From: "Martin K. Scherer" Date: Thu, 20 Oct 2016 17:12:22 +0200 Subject: [PATCH 3/5] fullmatches -> matches (py2 compat) --- pyemma/coordinates/tests/test_frames_from_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyemma/coordinates/tests/test_frames_from_file.py b/pyemma/coordinates/tests/test_frames_from_file.py index a02ac2607..6f6946a1b 100644 --- a/pyemma/coordinates/tests/test_frames_from_file.py +++ b/pyemma/coordinates/tests/test_frames_from_file.py @@ -161,7 +161,7 @@ def test_trajs_larger_than_frame_index(self): with self.assertRaises(ValueError) as cm: _frames_from_file(files, top=get_top(), frames=inds) import re - matches = re.fullmatch(".*10\).*is larger than trajectory length.*\= 10", cm.exception.args[0]) + matches = re.match(".*10\).*is larger than trajectory length.*\= 10", cm.exception.args[0]) assert matches if __name__ == "__main__": From 00fe7684ac28361e1b5b4d639c7fc34c2ed6f78b Mon Sep 17 00:00:00 2001 From: "Martin K. Scherer" Date: Thu, 20 Oct 2016 17:17:18 +0200 Subject: [PATCH 4/5] [doc] added changelog entries. --- doc/source/CHANGELOG.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/doc/source/CHANGELOG.rst b/doc/source/CHANGELOG.rst index 43ec54646..d364a5bad 100644 --- a/doc/source/CHANGELOG.rst +++ b/doc/source/CHANGELOG.rst @@ -1,15 +1,21 @@ Changelog ========= -2.3 (tba) ---------- +2.2.7 (10-20-16) +---------------- **New features**: - coordinates: for lag < chunksize improved speed (50%) for TICA. #960 **Fixes**: -- msm: low-level api removed (use msmtools for now, if you really need it). #550, + +- coordinates: + - save_trajs, frames_from_files: fix input indices checking. #958 + - FeatureReader: fix random access iterator unitcell_lengths scaling. + This lead to an error in conjunction with distance calculations, where + frames are collected in a random access pattern. #968 +- msm: low-level api removed (use msmtools for now, if you really need it). #550 2.2.6 (9-23-16) --------------- From e6a69d993fc84da60e89c8baf9dd6d7bf4d99479 Mon Sep 17 00:00:00 2001 From: "Martin K. Scherer" Date: Thu, 20 Oct 2016 17:26:15 +0200 Subject: [PATCH 5/5] [patches] check if unitcell_length is valid before unit conversion --- pyemma/coordinates/util/patches.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyemma/coordinates/util/patches.py b/pyemma/coordinates/util/patches.py index 48aea188f..18aba9f52 100644 --- a/pyemma/coordinates/util/patches.py +++ b/pyemma/coordinates/util/patches.py @@ -298,8 +298,7 @@ def _read_traj_data(atom_indices, f, n_frames, **kwargs): # first element is always xyz coords array. xyz = res[0] - # hopefully this works for all formats? - xyz = in_units_of(xyz, f.distance_unit, Trajectory._distance_unit, inplace=True) + in_units_of(xyz, f.distance_unit, Trajectory._distance_unit, inplace=True) box = cell_lengths = cell_angles = None @@ -326,8 +325,9 @@ def _read_traj_data(atom_indices, f, n_frames, **kwargs): assert len(res) == 1, "len:{l}, type={t}".format(l=len(res), t=f) #raise NotImplementedError("format read function not handled..." + str(f)) - box = in_units_of(box, f.distance_unit, Trajectory._distance_unit, inplace=True) - cell_lengths = in_units_of(cell_lengths, f.distance_unit, Trajectory._distance_unit, inplace=True) + in_units_of(box, f.distance_unit, Trajectory._distance_unit, inplace=True) + if cell_lengths is not None: + in_units_of(cell_lengths, f.distance_unit, Trajectory._distance_unit, inplace=True) return TrajData(xyz, cell_lengths, cell_angles, box)