use mambabuild in CI and fix rmsd computation in clustering (#1531)

clonker · web-flow · commit e9d08d715dde · 2022-01-13T11:38:52.000+01:00
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -46,7 +46,12 @@ jobs:
       - run: mkdir -p $CIRCLE_ARTIFACTS $CIRCLE_TEST_REPORTS
       - attach_workspace:
           at: workspace
-      - run: conda build devtools/conda-recipe --python=3.9 --numpy=1.19 --test --output-folder workspace
+      - run:
+          name: Compile and test
+          no_output_timeout: 30m
+          command: |
+            export PYTHONUNBUFFERED=1
+            conda build devtools/conda-recipe --python=3.9 --numpy=1.21 --test --output-folder workspace
       - run: bash <(curl -s https://codecov.io/bash) -f $HOME/coverage.xml
       - store_test_results:
           path: /tmp/circleci-test-results
diff --git a/devtools/conda-setup+build.yml b/devtools/conda-setup+build.yml
@@ -5,14 +5,14 @@ steps:
       conda config --set quiet true
     displayName: Configure conda
   - bash: |
-      python -m pip install --upgrade pip
-      conda update --all
+      conda install mamba
+      mamba update --all
     displayName: Update conda
   - bash: |
-      conda install conda-build conda-verify pip
+      mamba install boa conda-build conda-verify pip
     displayName: 'Install dependencies'
     continueOnError: false
   - bash: |
-      conda build devtools/conda-recipe
+      conda mambabuild devtools/conda-recipe
     displayName: 'Build and test'
     continueOnError: false
diff --git a/pyemma/coordinates/clustering/src/clustering_module.cpp b/pyemma/coordinates/clustering/src/clustering_module.cpp
@@ -8,14 +8,16 @@
 
 struct RMSDMetric {
     template<typename T>
-    static T compute(const T* xs, const T* ys, std::size_t dim) {
+    static T compute_squared(const T* xs, const T* ys, std::size_t dim) {
         if (dim % 3 != 0) {
-            throw std::range_error("RMSDMetric is only implemented for input data with a dimension dividable by 3.");
+            throw std::range_error("RMSDMetric is only implemented for input data with a dimension divisible by 3.");
         }
+
         float trace_a, trace_b;
         auto dim3 = static_cast<const int>(dim / 3);
-        std::vector<float> buffer_b (ys, ys + dim);
         std::vector<float> buffer_a (xs, xs + dim);
+        std::vector<float> buffer_b (ys, ys + dim);
+
         inplace_center_and_trace_atom_major(buffer_a.data(), &trace_a, 1, dim3);
         inplace_center_and_trace_atom_major(buffer_b.data(), &trace_b, 1, dim3);
 
@@ -27,14 +29,16 @@ struct RMSDMetric {
         }
     }
 
-    template<typename dtype>
-    static dtype compute_squared(const dtype* xs, const dtype* ys, std::size_t dim) {
-        auto d = compute(xs, ys, dim);
-        return d*d;
+    template<typename T>
+    static T compute(const T* xs, const T* ys, std::size_t dim) {
+        return std::sqrt(compute_squared(xs, ys, dim));
     }
 };
 
 PYBIND11_MODULE(_ext, m) {
     auto rmsdModule = m.def_submodule("rmsd");
     deeptime::clustering::registerClusteringImplementation<RMSDMetric>(rmsdModule);
+    rmsdModule.def("compute_metric", [](py::array_t<float> x, py::array_t<float> y) {
+        return RMSDMetric::compute<float>(x.data(), y.data(), x.size());
+    });
 }
diff --git a/pyemma/coordinates/clustering/tests/test_kmeans.py b/pyemma/coordinates/clustering/tests/test_kmeans.py
@@ -24,9 +24,14 @@
 import os
 import random
 import unittest
+
+import deeptime.clustering
+import mdtraj
 import numpy as np
+from deeptime.clustering import ClusterModel
 
 from pyemma.coordinates.api import cluster_kmeans
+from pyemma.coordinates.clustering import KmeansClustering
 from pyemma.util.files import TemporaryDirectory
 from pyemma.util.contexts import settings, Capturing
 
@@ -227,9 +232,88 @@ def test_kmeans_convex_hull(self):
         self.assertGreaterEqual(np.inner(np.array([0, -144337500, -102061250], dtype=float), res) + 353560531, 0)
         self.assertGreaterEqual(np.inner(np.array([0, 0, -10000], dtype=float), res) + 17321, 0)
 
-    def test_with_n_jobs_minrmsd(self):
-        kmeans = cluster_kmeans(np.random.rand(500, 3), 10, metric='minRMSD')
-        kmeans.dtrajs
+    def test_minrmsd_assignment(self):
+        state = np.random.RandomState(123)
+        data = state.uniform(-50, 50, size=(500, 3 * 15))
+        n_clusters = 15
+        kmeans = cluster_kmeans([data], n_clusters, metric='minRMSD', max_iter=0,
+                                fixed_seed=32, init_strategy='kmeans++', n_jobs=1)
+        kmeans2 = cluster_kmeans([data], n_clusters, metric='minRMSD', max_iter=0,
+                                 fixed_seed=32, init_strategy='kmeans++', n_jobs=1)
+        np.testing.assert_array_equal(kmeans.dtrajs[0], kmeans2.dtrajs[0])
+        np.testing.assert_array_almost_equal(kmeans.clustercenters, kmeans2.clustercenters)
+        np.testing.assert_equal(kmeans.metric, 'minRMSD')
+
+        impl = deeptime.clustering.metrics['minRMSD']
+        dtraj_manual = []
+        for frame in data:
+            dists_to_cc = [impl.compute_metric(frame, cc) for cc in kmeans.clustercenters]
+            dtraj_manual.append(np.argmin(dists_to_cc))
+        np.testing.assert_array_equal(dtraj_manual, kmeans.dtrajs[0])
+
+    def test_minrmsd_metric(self):
+        # make sure impl is registered
+        _ = KmeansClustering(n_clusters=5)
+        # now we can import the impl
+        impl = deeptime.clustering.metrics['minRMSD']
+        target = np.random.uniform(size=(1, 3 * 15))
+        reference = np.random.uniform(size=(1, 3 * 15))
+        x = mdtraj.rmsd(mdtraj.Trajectory(target.reshape(1, -1, 3), None),
+                        mdtraj.Trajectory(reference.reshape(1, -1, 3), None))
+        y = impl.compute_metric(target, reference)
+        np.testing.assert_almost_equal(x[0], y)
+
+    def test_minrmsd_assignments(self):
+        # make sure impl is registered
+        _ = KmeansClustering(n_clusters=5)
+        # now we can import the impl
+        impl = deeptime.clustering.metrics['minRMSD']
+
+        from scipy.linalg import expm, norm
+        n_clusters = 5
+        n_particles = 3
+        n_frames_per_cluster = 25
+
+        def rotation_matrix(axis, theta):
+            """ rotation matrix for a rotation by `theta` around `axis`
+            :param axis: np.ndarray, axis around which to rotate
+            :param theta: float, angle in radians
+            :return: rotation matrix
+            """
+            return expm(np.cross(np.eye(3), axis/norm(axis)*theta))
+
+        out = np.zeros((n_clusters*n_frames_per_cluster, 3*n_particles))
+        for i in range(n_clusters):
+            # define `n_particles` random particle xyz positions,
+            # repeat `n_frames_per_cluster` frames and add noise
+            _pos = np.random.choice(np.arange(3*n_particles), size=3*n_particles)
+            pos = np.repeat(_pos[None], n_frames_per_cluster, axis=0).astype(float)
+            pos += np.random.normal(size=pos.shape, scale=.1)
+
+            # add random rotation and translation for each frame
+            rand_rot_trans = np.zeros_like(pos)
+            for n, _pos in enumerate(pos):
+                r = rotation_matrix(np.array([0, 1, 0]), np.pi*np.random.rand())
+                t = np.array([np.random.normal(), np.random.normal(), np.random.normal()])
+
+                for m in range(n_particles):
+                    rand_rot_trans[n, 3*m:3*(m+1)] = np.dot(r, _pos[3*m:3*(m+1)]) - t
+
+            out[n_frames_per_cluster*i:n_frames_per_cluster*(i+1)] = rand_rot_trans
+
+        cc = impl.kmeans.init_centers_kmpp(out, k=n_clusters, random_seed=-1, n_threads=1, callback=None)
+        cl = ClusterModel(cc, metric='minRMSD', converged=True)
+        assignments = cl.transform(out)
+        unique = []
+        for i in range(n_clusters):
+            unique_in_inverval = np.unique(
+                assignments[n_frames_per_cluster*i:n_frames_per_cluster*(i+1)])
+            # assert that each interval is assigned correctly
+            self.assertEqual(unique_in_inverval.shape[0], 1)
+            unique.append(unique_in_inverval[0])
+
+        # assert that all cluster indices are assigned
+        # self.assertSetEqual(set(unique), set(range(n_clusters)))
 
     def test_skip(self):
         cl = cluster_kmeans(np.random.rand(100, 3), skip=42)