Support complex data in post-processing utilities

michaelweylandt · michaelweylandt · commit 0d9b1067908e · 2020-10-06T22:21:31.000-05:00
- Move post-processing (tensor PCA projection and U smoothing)
  to template functions to allow
  real and complex data support
- These depend on ClustRVizLogger
  so create a new header to implement them
- Write R/C++ glue functions
diff --git a/src/clustRviz.cpp b/src/clustRviz.cpp
@@ -287,3 +287,31 @@ SEXP matrix_col_prox(SEXP Xsexp,
   return R_NilValue;
 };
 
+// [[Rcpp::export(rng = false)]]
+SEXP smooth_u_clustering(SEXP U_oldSEXP, Rcpp::List cluster_info_list){
+  switch(TYPEOF(U_oldSEXP)){ // Dispatch on the R storage type of U so real and complex arrays share this one entry point
+  case REALSXP: return Rcpp::wrap(smooth_u_clustering_impl<Rcpp::NumericVector, double>(Rcpp::as<Rcpp::NumericVector>(U_oldSEXP), cluster_info_list));
+  case CPLXSXP: return Rcpp::wrap(smooth_u_clustering_impl<Rcpp::ComplexVector, std::complex<double> >(Rcpp::as<Rcpp::ComplexVector>(U_oldSEXP), cluster_info_list));
+  default: Rcpp::stop("Unsupported type of U."); // Anything other than a real or complex U is rejected
+  }
+
+  // Not expected to be reached: every case above returns or calls Rcpp::stop(). Kept to appease the compiler...
+  return R_NilValue;
+}
+
+// [[Rcpp::export(rng = false)]]
+SEXP tensor_projection(SEXP Xsexp, SEXP Ysexp){
+  if(TYPEOF(Xsexp) != TYPEOF(Ysexp)){ // X and Y must share a storage type; mixing real and complex is not attempted
+    Rcpp::stop("Type of X and Y must match.");
+  }
+
+  switch(TYPEOF(Xsexp)){ // Types match (checked above), so X alone selects the template instantiation
+  case REALSXP: return Rcpp::wrap(tensor_projection_impl<Rcpp::NumericVector, double>(Rcpp::as<Rcpp::NumericVector>(Xsexp), Rcpp::as<Eigen::MatrixXd>(Ysexp)));
+  case CPLXSXP: return Rcpp::wrap(tensor_projection_impl<Rcpp::ComplexVector, std::complex<double> >(Rcpp::as<Rcpp::ComplexVector>(Xsexp), Rcpp::as<Eigen::MatrixXcd>(Ysexp)));
+  default: Rcpp::stop("Unsupported type of X."); // Anything other than real or complex input is rejected
+  }
+
+  // Not expected to be reached: every case above returns or calls Rcpp::stop(). Kept to appease the compiler...
+  return R_NilValue;
+}
+
diff --git a/src/clustRviz.h b/src/clustRviz.h
@@ -1,5 +1,6 @@
 #include "clustRviz_base.h"
 #include "clustRviz_logging.h"
+#include "clustRviz_utils.h"
 #include "clustering_impl.h"
 #include "biclustering_impl.h"
 #include "trout_impl.h"
diff --git a/src/clustRviz_utils.h b/src/clustRviz_utils.h
@@ -0,0 +1,135 @@
+#ifndef CLUSTRVIZ_UTILS_H
+#define CLUSTRVIZ_UTILS_H 1
+// This header defines template versions of complex utilities
+// These are templated on the data type to allow for both real and complex data
+// These are not in clustRviz_base.h since some of them depend on ClustRVizLogger...
+
+#include <RcppEigen.h>
+#include "clustRviz_logging.h"
+
+// U-smoothing for convex clustering
+//
+// Given cluster memberships, replace rows of U which belong to the same cluster with their mutual mean.
+// U_old: N-by-P-by-Q array ("dim" attribute required); cluster_info_list: length-Q list, one entry per slice. Returns a new array.
+template <typename RcppVector, typename DataType>
+RcppVector smooth_u_clustering_impl(RcppVector U_old, Rcpp::List cluster_info_list){
+  // The first argument is really a 3D array, but we receive it as a flat Rcpp vector carrying a "dim" attribute
+  // The second argument is a list produced by get_cluster_assignments(); each element holds (ids, sizes, count)
+  Rcpp::IntegerVector U_dims = U_old.attr("dim");
+  if(U_dims.size() != 3){
+    ClustRVizLogger::error("U must be a three rank tensor.");
+  }
+  int N = U_dims(0);
+  int P = U_dims(1);
+  int Q = U_dims(2);
+
+  // Check length of cluster_info: we need exactly one cluster description per slice of U
+  if(cluster_info_list.size() != Q){
+    ClustRVizLogger::error("Dimensions of U and cluster_info do not match");
+  }
+
+  RcppVector U(N * P * Q);
+  U.attr("dim") = U_dims;
+  Rcpp::rownames(U) = Rcpp::rownames(U_old); // Copy row and column names from the input
+  Rcpp::colnames(U) = Rcpp::colnames(U_old);
+
+  for(int q = 0; q < Q; q++){
+    Rcpp::List cluster_info = cluster_info_list[q];
+    int n_clusters = Rcpp::as<int>(cluster_info[2]);
+
+    Rcpp::IntegerVector cluster_ids   = cluster_info[0];
+    Rcpp::IntegerVector cluster_sizes = cluster_info[1];
+
+    // There's a lot going on on the RHS here, so let's un-pack (inside outwards)
+    // First, we get a pointer to the relevant slice of U_old
+    //   This works because the RcppVector is a C++ wrapper around a SEXP which
+    //   is ultimately just a pointer to the relevant memory
+    // We then cast it to an appropriate C++ pointer type
+    //   For real data, this is a no-op since both R and Eigen use doubles for real data
+    //   For complex data, this matters because we convert from R's homegrown Rcomplex*
+    //   to a std::complex* pointer as eigen expects
+    // We then use Eigen::Map<Eigen::Matrix<DataType>> to view R's memory as an N-by-P matrix;
+    //   assigning the Map to a plain Eigen::Matrix copies the slice, so U_old is never modified.
+    // The same construct is used below to load the smoothed data into U
+    Eigen::Matrix<DataType, Eigen::Dynamic, Eigen::Dynamic> U_old_slice = Eigen::Map<Eigen::Matrix<DataType, Eigen::Dynamic, Eigen::Dynamic> >(reinterpret_cast<DataType*>(&U_old[N * P * q]), N, P);
+    Eigen::Matrix<DataType, Eigen::Dynamic, Eigen::Dynamic> U_new = U_old_slice; // Seed with the input so a row whose id is outside 1..n_clusters passes through instead of being uninitialized
+
+    for(int j = 1; j <= n_clusters; j++){ // Cluster IDs are 1-based (per R conventions)
+      Eigen::Matrix<DataType, Eigen::Dynamic, 1> vec(P); vec.setZero();
+
+      // Manually work out new mean: first accumulate the sum of all rows in cluster j
+      for(int n = 0; n < N; n++){
+        if(cluster_ids[n] == j){
+          vec += U_old_slice.row(n);
+        }
+      }
+
+      vec /= cluster_sizes[j - 1]; // Subtract 1 to adjust to C++ indexing
+
+      // Assign new mean where needed, i.e., overwrite every row belonging to cluster j
+      for(int n = 0; n < N; n++){
+        if(cluster_ids[n] == j){
+          U_new.row(n) = vec;
+        }
+      }
+    }
+
+    Eigen::Map<Eigen::Matrix<DataType, Eigen::Dynamic, Eigen::Dynamic> >(reinterpret_cast<DataType*>(&U[N * P * q]), N, P) = U_new;
+  }
+
+  return U;
+}
+
+// Tensor projection along the second mode
+//
+// Given a 3D tensor X in F^{n-by-p-by-q} (observations by features by iterations)
+// and a rotation matrix Y in F^{p-by-k} (features by principal components), we
+// want to get a projected array in F^{n-by-k-by-q} giving the path of the principal
+// components. Each of the q slices of X is multiplied on the right by Y.
+//
+// This is straightforward, but "loopy" so we implement it in Rcpp / RcppEigen for speed
+// We use some template magic to support F = R (real) and F = C (complex) data
+template <typename RcppVector, typename DataType>
+RcppVector tensor_projection_impl(RcppVector X, const Eigen::Matrix<DataType, Eigen::Dynamic, Eigen::Dynamic>& Y){
+
+  // Validate X: it arrives as a flat Rcpp vector and must carry a length-3 "dim" attribute
+  Rcpp::IntegerVector X_dims = X.attr("dim");
+  if(X_dims.size() != 3){
+    ClustRVizLogger::error("X must be a three rank tensor.");
+  }
+  int n = X_dims(0);
+  int p = X_dims(1);
+  int q = X_dims(2);
+
+  // Validate Y: its row count must equal the second dimension of X for X_slice * Y to be defined
+  if(Y.rows() != p){
+    ClustRVizLogger::error("The dimensions of X and Y do not match -- ") << p << " != " << Y.rows();
+  }
+
+  int k = Y.cols();
+
+  RcppVector result(n * k * q); // NOTE(review): sizes and offsets are computed in int; presumably arrays stay below 2^31 elements -- confirm
+  Rcpp::IntegerVector result_dims{n, k, q};
+  result.attr("dim") = result_dims;
+
+  for(int i = 0; i < q; i++){
+    // There's a lot going on on the RHS here, so let's un-pack (inside outwards)
+    // First, we get a pointer to the relevant slice of X
+    //   This works because the RcppVector is a C++ wrapper around a SEXP which
+    //   is ultimately just a pointer to the relevant memory
+    // We then cast it to an appropriate C++ pointer type
+    //   For real data, this is a no-op since both R and Eigen use doubles for real data
+    //   For complex data, this matters because we convert from R's homegrown Rcomplex*
+    //   to a std::complex* pointer as eigen expects
+    // We then use Eigen::Map<Eigen::Matrix<DataType>> to view R's memory as an n-by-p matrix;
+    //   assigning the Map to a plain Eigen::Matrix copies the slice, so X is never modified.
+    // The same construct is used below to load the projected data into result
+    Eigen::Matrix<DataType, Eigen::Dynamic, Eigen::Dynamic> X_slice = Eigen::Map<Eigen::Matrix<DataType, Eigen::Dynamic, Eigen::Dynamic> >(reinterpret_cast<DataType*>(&X[n * p * i]), n, p);
+    Eigen::Matrix<DataType, Eigen::Dynamic, Eigen::Dynamic> X_slice_projected = X_slice * Y;
+    Eigen::Map<Eigen::Matrix<DataType, Eigen::Dynamic, Eigen::Dynamic> >(reinterpret_cast<DataType*>(&result[n * k * i]), n, k) = X_slice_projected;
+  }
+
+  return result;
+}
+
+#endif
diff --git a/src/utils.cpp b/src/utils.cpp
@@ -67,109 +67,6 @@ void check_weight_matrix(const Eigen::MatrixXd& weight_matrix){
   }
 }
 
-// U-smoothing for convex clustering
-//
-// Given cluster memberships, replace rows of U which belong to the same cluster
-// with their mutual mean....
-//
-// [[Rcpp::export(rng = false)]]
-Rcpp::NumericVector smooth_u_clustering(Rcpp::NumericVector U_old, Rcpp::List cluster_info_list){
-  // The first argument is really an array but we pass as a NumericVector
-  // The second argument is a list produced by get_cluster_assignments()
-  Rcpp::IntegerVector U_dims = U_old.attr("dim");
-  if(U_dims.size() != 3){
-    ClustRVizLogger::error("U must be a three rank tensor.");
-  }
-  int N = U_dims(0);
-  int P = U_dims(1);
-  int Q = U_dims(2);
-
-  // Check length of cluster_info
-  if(cluster_info_list.size() != Q){
-    ClustRVizLogger::error("Dimensions of U and cluster_info do not match");
-  }
-
-  Rcpp::NumericVector U(N * P * Q);
-  U.attr("dim") = U_dims;
-  Rcpp::rownames(U) = Rcpp::rownames(U_old);
-  Rcpp::colnames(U) = Rcpp::colnames(U_old);
-
-  for(int q = 0; q < Q; q++){
-    Rcpp::List cluster_info = cluster_info_list[q];
-    int n_clusters = Rcpp::as<int>(cluster_info[2]);
-
-    Rcpp::IntegerVector cluster_ids   = cluster_info[0];
-    Rcpp::IntegerVector cluster_sizes = cluster_info[1];
-
-    Eigen::MatrixXd U_old_slice = Eigen::Map<Eigen::MatrixXd>(&U_old[N * P * q], N, P);
-    Eigen::MatrixXd U_new(N, P);
-
-    for(int j = 1; j <= n_clusters; j++){ // Cluster IDs are 1-based (per R conventions)
-      Eigen::VectorXd vec = Eigen::VectorXd::Zero(P);
-
-      // Manually work out new mean
-      for(int n = 0; n < N; n++){
-        if(cluster_ids[n] == j){
-          vec += U_old_slice.row(n);
-        }
-      }
-
-      vec /= cluster_sizes[j - 1]; // Subtract 1 to adjust to C++ indexing
-
-      // Assign new mean where needed...
-      for(int n = 0; n < N; n++){
-        if(cluster_ids[n] == j){
-          U_new.row(n) = vec;
-        }
-      }
-    }
-
-    Eigen::Map<Eigen::MatrixXd>(&U[N * P * q], N, P) = U_new;
-  }
-
-  return U;
-}
-
-// Tensor projection along the second mode
-//
-// Given a 3D tensor X in R^{n-by-p-by-q} (observations by features by iterations)
-// and a rotation matrix Y in R^{p-by-k} (features by principal components), we
-// want to get a projected array in R^{n-by-k-by-q} giving the path of the principal
-// components
-//
-// This is straightforward, but "loopy" so we implement it in Rcpp / RcppEigen for speed
-// [[Rcpp::export(rng = false)]]
-Rcpp::NumericVector tensor_projection(Rcpp::NumericVector X, const Eigen::MatrixXd& Y){
-
-  // Validate X
-  Rcpp::IntegerVector X_dims = X.attr("dim");
-  if(X_dims.size() != 3){
-    ClustRVizLogger::error("X must be a three rank tensor.");
-  }
-  int n = X_dims(0);
-  int p = X_dims(1);
-  int q = X_dims(2);
-
-  // Validate Y
-  if(Y.rows() != p){
-    ClustRVizLogger::error("The dimensions of X and Y do not match -- ") << p << " != " << Y.rows();
-  }
-
-  int k = Y.cols();
-
-  Rcpp::NumericVector result(n * k * q);
-  Rcpp::IntegerVector result_dims{n, k, q};
-  result.attr("dim") = result_dims;
-
-  for(int i = 0; i < q; i++){
-    Eigen::MatrixXd X_slice = Eigen::Map<Eigen::MatrixXd>(&X[n * p * i], n, p);
-    Eigen::MatrixXd X_slice_projected = X_slice * Y;
-    Eigen::Map<Eigen::MatrixXd>(&result[n * k * i], n, k) = X_slice_projected;
-  }
-
-  return result;
-}
-
 // TROUT Alignment
 // FIXME - Why doesn't this play nice with overloading? Prototype missing somewhwere?
 Eigen::VectorXcd align_phase_v(const Eigen::VectorXcd& u,