This repository has been archived by the owner on Sep 11, 2023. It is now read-only.

Merge pull request #879 from marscher/fix_lru_db_creation
[coor/traj_info_cache] fix a bug in creating the LRU cache

This failed if the user had not yet created a config directory
by invoking pyemma.config.save(), e.g. in fairly new installations,
since the LRU db name is derived from the main db, which is held
in memory when no such directory exists.
marscher authored Jul 26, 2016
2 parents e1bd9b2 + 7d61161 commit 49e4a3d
Showing 3 changed files with 32 additions and 10 deletions.
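
Before the file-by-file diff, a minimal sketch of the behaviour the fix introduces may help: the LRU database path is derived from the main TrajInfo database file, and when the main database is only held in memory (no configuration directory yet), the code now falls back to an in-memory SQLite database instead of failing. The helper name `lru_db_path`, the `n_buckets` parameter, and the file naming below are hypothetical illustrations, not PyEMMA's actual code.

```python
import os

def lru_db_path(main_db_filename, key, n_buckets=10):
    """Hypothetical helper mirroring the fixed logic: derive the location of an
    LRU usage database from the main TrajInfo database file, or return None
    when the main database is not file-backed (no config directory yet)."""
    if main_db_filename is None:
        return None  # main db lives in memory, so there is nowhere to put LRU files
    bucket = int(key, 16) % n_buckets  # bin the hex hash into one of n_buckets databases
    lru_dir = os.path.join(os.path.dirname(main_db_filename), "traj_info_usage")
    return os.path.join(lru_dir, "db_%i.sqlite" % bucket)

# callers fall back to an in-memory SQLite database when no path is available
db_name = lru_db_path(None, "deadbeef") or ":memory:"
```
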
5 changes: 5 additions & 0 deletions doc/source/CHANGELOG.rst
@@ -7,11 +7,16 @@ Changelog
**New features**:
- thermo: added MBAR estimation

**Fixes**:
- coordinates: In case a configuration directory has not been created yet, the LRU cache
of the TrajInfo database could not be created.


2.2.2 (7-14-16)
---------------

**New features**:

- coordinates: SQLite backend for trajectory info data. This enables fast access to this data
on parallel filesystems where multiple processes are writing to the database. This greatly
speeds ups reader construction and enables fast random access for formats which usually do not
27 changes: 17 additions & 10 deletions pyemma/coordinates/data/util/traj_info_backends.py
@@ -205,15 +205,16 @@ def set(self, traj_info):

self._update_time_stamp(hash_value=traj_info.hash_value)

current_size = os.stat(self.filename).st_size
if (self.num_entries >= config.traj_info_max_entries or
# current_size is in bytes, while traj_info_max_size is in MB
1.*current_size / 1024**2 >= config.traj_info_max_size):
logger.info("Cleaning database because it has too much entries or is too large.\n"
"Entries: %s. Size: %.2fMB. Configured max_entires: %s. Max_size: %sMB"
% (self.num_entries, (current_size*1.0 / 1024**2),
config.traj_info_max_entries, config.traj_info_max_size))
self._clean(n=self.clean_n_entries)
if self.filename is not None:
current_size = os.stat(self.filename).st_size
if (self.num_entries >= config.traj_info_max_entries or
# current_size is in bytes, while traj_info_max_size is in MB
1.*current_size / 1024**2 >= config.traj_info_max_size):
logger.info("Cleaning database because it has too much entries or is too large.\n"
"Entries: %s. Size: %.2fMB. Configured max_entires: %s. Max_size: %sMB"
% (self.num_entries, (current_size*1.0 / 1024**2),
config.traj_info_max_entries, config.traj_info_max_size))
self._clean(n=self.clean_n_entries)

def get(self, key):
cursor = self._database.execute("SELECT * FROM traj_info WHERE hash=?", (key,))
@@ -231,12 +232,15 @@ def _database_from_key(self, key):
database has to be locked for updates and multiple processes want to write,
each process has to wait until the lock has been released.
By default the LRU databases will be stored in a sub directory "tra_info_usage"
By default the LRU databases will be stored in a sub directory "traj_info_usage"
lying next to the main database.
:param key: hash of the TrajInfo instance
:return: str, database path
"""
if not self.filename:
return None

from pyemma.util.files import mkdir_p
hash_value_long = int(key, 16)
# bin hash to one of either 10 different databases
@@ -250,6 +254,9 @@ def _update_time_stamp(self, hash_value):
""" timestamps are being stored distributed over several lru databases.
The timestamp is a time.time() snapshot (float), which are seconds since epoch."""
db_name = self._database_from_key(hash_value)
if not db_name:
db_name=':memory:'

import sqlite3

with sqlite3.connect(db_name) as conn:
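
For context, the fallback added in `_update_time_stamp` relies on SQLite's special `:memory:` database name. The snippet below is a standalone illustration of that mechanism using only the standard library; the table name and columns are made up for the example, since the actual schema is not shown in the truncated hunk above.

```python
import sqlite3
import time

# ':memory:' gives a throwaway database that supports the same operations as a
# file-backed one, so the timestamp bookkeeping keeps working without a config dir.
with sqlite3.connect(":memory:") as conn:
    conn.execute("CREATE TABLE IF NOT EXISTS usage (hash TEXT PRIMARY KEY, last_read FLOAT)")
    conn.execute("INSERT INTO usage VALUES (?, ?)", ("deadbeef", time.time()))
    print(conn.execute("SELECT last_read FROM usage WHERE hash=?", ("deadbeef",)).fetchall())
```
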
10 changes: 10 additions & 0 deletions pyemma/coordinates/tests/test_traj_info_cache.py
@@ -35,6 +35,7 @@
from pyemma.coordinates.data.feature_reader import FeatureReader
from pyemma.coordinates.data.numpy_filereader import NumPyFileReader
from pyemma.coordinates.data.py_csv_reader import PyCSVReader
from pyemma.coordinates.data.util.traj_info_backends import SqliteDB
from pyemma.coordinates.data.util.traj_info_cache import TrajectoryInfoCache
from pyemma.coordinates.tests.util import create_traj
from pyemma.datasets import get_bpti_test_data
@@ -276,6 +277,15 @@ def test_max_size(self):
self.assertLessEqual(os.stat(self.db.database_filename).st_size / 1024, config.traj_info_max_size)
self.assertGreater(self.db.num_entries, 0)

def test_no_working_directory(self):
import sqlite3
# this is the case as long as the user has not yet created a config directory via config.save()
self.db._database = SqliteDB(filename=None)


# trigger caching
pyemma.coordinates.source(xtcfiles, top=pdbfile)

@unittest.skip("not yet functional")
def test_no_sqlite(self):
def import_mock(name, *args):
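
The new test only checks that reader construction succeeds when the database is not file-backed. A slightly more direct assertion of the fallback could look like the following sketch; it assumes `SqliteDB(filename=None)` is constructible on its own and that hash keys are hex strings, as the diff above suggests.

```python
from pyemma.coordinates.data.util.traj_info_backends import SqliteDB

db = SqliteDB(filename=None)
# without a file-backed main database there is no place for LRU db files ...
assert db._database_from_key("deadbeef") is None
# ... and updating the usage timestamp should silently fall back to ':memory:'
db._update_time_stamp("deadbeef")
```
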

