Merge pull request #26 from Irrationone/dev

v0.99.1 -- improvements to analysis with nonexistant and rare cell types
Irrationone · Mar 14, 2019 · 3ce5923 · 3ce5923
2 parents 80fbdf8 + ca4a93b
commit 3ce5923
Show file tree

Hide file tree

Showing 8 changed files with 69 additions and 106 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: cellassign
-Version: 0.99.0
+Version: 0.99.1
 Title: What the Package Does (One Line, Title Case)
 Description: What the package does (one paragraph).
 Authors@R: c(

diff --git a/R/cellassign.R b/R/cellassign.R
@@ -17,6 +17,7 @@
 #' @param shrinkage Logical - should the delta parameters
 #' have hierarchical shrinkage?
 #' @param n_batches Number of data subsample batches to use in inference
+#' @param dirichlet_concentration Dirichlet concentration parameter for cell type abundances
 #' @param rel_tol_adam The change in Q function value (in pct) below which
 #' each optimization round is considered converged
 #' @param rel_tol_em The change in log marginal likelihood value (in pct)
@@ -97,21 +98,22 @@
 #' @return
 #' An object of class \code{cellassign_fit}. See \code{details}
 cellassign <- function(exprs_obj,
-                        marker_gene_info,
-                        s = NULL,
-                        min_delta = log(2),
-                        X = NULL,
-                        B = 10,
-                        shrinkage = FALSE,
-                        n_batches = 1,
-                        rel_tol_adam = 1e-4,
-                        rel_tol_em = 1e-4,
-                        max_iter_adam = 1e5,
-                        max_iter_em = 20,
-                        learning_rate = 0.1,
-                        verbose = TRUE,
-                        sce_assay = "counts",
-                        num_runs = 1) {
+                       marker_gene_info,
+                       s = NULL,
+                       min_delta = 2,
+                       X = NULL,
+                       B = 10,
+                       shrinkage = FALSE,
+                       n_batches = 1,
+                       dirichlet_concentration = 1e-2,
+                       rel_tol_adam = 1e-4,
+                       rel_tol_em = 1e-4,
+                       max_iter_adam = 1e5,
+                       max_iter_em = 20,
+                       learning_rate = 0.1,
+                       verbose = TRUE,
+                       sce_assay = "counts",
+                       num_runs = 1) {
 
   # Work out rho
   rho <- NULL
@@ -123,20 +125,17 @@ cellassign <- function(exprs_obj,
     stop("marker_gene_info must either be a matrix or list. See ?cellassign")
   }
 
-  # Subset genes appropriately
-  # exprs_obj <- subset_exprs_obj(exprs_obj, rho)
-
   # Get expression input
   Y <- extract_expression_matrix(exprs_obj, sce_assay = sce_assay)
-  
-  
+
+
   # Check X is correct
   if(!is.null(X)) {
     if(!(is.matrix(X) && is.numeric(X))) {
       stop("X must either be NULL or a numeric matrix")
     }
   }
-  
+
 
   stopifnot(is.matrix(Y))
   stopifnot(is.matrix(rho))
@@ -150,11 +149,6 @@ cellassign <- function(exprs_obj,
     colnames(rho) <- paste0("cell_type_", seq_len(ncol(rho)))
 
   }
-
-  # Remove non-expressed genes
-  val_result <- validate_genes(Y, rho)
-  Y <- val_result$Y
-  rho <- val_result$rho
 
   N <- nrow(Y)
 
@@ -174,12 +168,16 @@ cellassign <- function(exprs_obj,
     message("No size factors supplied - computing from matrix. It is highly recommended to supply size factors calculated using the full gene set")
     s <- scran::computeSumFactors(t(Y))
   }
-  
+
   # Make sure all size factors are positive
   if (any(s <= 0)) {
     stop("Cells with size factors <= 0 must be removed prior to analysis.")
   }
 
+  # Make Dirichlet concentration parameter symmetric if not otherwise specified
+  if (length(dirichlet_concentration) == 1) {
+    dirichlet_concentration <- rep(dirichlet_concentration, C)
+  }
 
   res <- NULL
 
@@ -194,16 +192,16 @@ cellassign <- function(exprs_obj,
                                 N = N,
                                 P = P,
                                 B = B,
-                                use_priors = shrinkage,
+                                shrinkage = shrinkage,
                                 verbose = verbose,
                                 n_batches = n_batches,
                                 rel_tol_adam = rel_tol_adam,
                                 rel_tol_em = rel_tol_em,
                                 max_iter_adam = max_iter_adam,
                                 max_iter_em = max_iter_em,
                                 learning_rate = learning_rate,
-                                em_convergence_thres = rel_tol_em,
-                                min_delta = min_delta)
+                                min_delta = min_delta,
+                                dirichlet_concentration = dirichlet_concentration)
 
     return(structure(res, class = "cellassign_fit"))
   })

diff --git a/R/inference-tensorflow.R b/R/inference-tensorflow.R
@@ -30,7 +30,7 @@ inference_tensorflow <- function(Y,
                                  N,
                                  P,
                                  B = 10,
-                                 use_priors,
+                                 shrinkage,
                                  verbose = FALSE,
                                  n_batches = 1,
                                  rel_tol_adam = 1e-4,
@@ -39,8 +39,8 @@ inference_tensorflow <- function(Y,
                                  max_iter_em = 20,
                                  learning_rate = 1e-4,
                                  random_seed = NULL,
-                                 em_convergence_thres = 1e-5,
-                                 min_delta = log(2)) {
+                                 min_delta = 2,
+                                 dirichlet_concentration = rep(1e-2, C)) {
   tf$reset_default_graph()
 
   tfd <- tf$contrib$distributions
@@ -66,7 +66,7 @@ inference_tensorflow <- function(Y,
   # Variables
 
   ## Shrinkage prior on delta
-  if (use_priors) {
+  if (shrinkage) {
     delta_log_mean <- tf$Variable(0, dtype = tf$float64)
     delta_log_variance <- tf$Variable(1, dtype = tf$float64) # May need to bound this or put a prior over this
   }
@@ -77,9 +77,7 @@ inference_tensorflow <- function(Y,
 
   beta <- tf$Variable(tf$random_normal(shape(G,P), mean = 0, stddev = 1, seed = random_seed, dtype = tf$float64), dtype = tf$float64)
 
-  # total_concentration <- tf$Variable(tf$random_uniform(shape(1), minval = 0.5, maxval = 10, seed = random_seed, dtype = tf$float64), dtype = tf$float64,
-  #                                    constraint = function(x) tf$clip_by_value(x, tf$constant(1e-2, dtype = tf$float64), tf$constant(Inf, dtype = tf$float64)))
-
+  theta_logit <- tf$Variable(tf$random_normal(shape(C), mean = 0, stddev = 1, seed = random_seed, dtype = tf$float64), dtype = tf$float64)
 
   ## Spline variables
   a <- tf$exp(tf$Variable(tf$zeros(shape = B, dtype = tf$float64)))
@@ -90,6 +88,7 @@ inference_tensorflow <- function(Y,
 
   # Transformed variables
   delta = tf$exp(delta_log)
+  theta_log = tf$nn$log_softmax(theta_logit)
 
   # Model likelihood
   base_mean <- tf$transpose(tf$einsum('np,gp->gn', X_, beta) + tf$log(s_)) #+ tf$add(tf$log(s_), tf$log(control_pct_), name = "s_to_control"))
@@ -100,10 +99,6 @@ inference_tensorflow <- function(Y,
 
   mu_cng = tf$transpose(mu_ngc, shape(2,0,1))
 
-  # if (random_effects) {
-  #   mu_cng <- mu_cng + psi_times_W
-  # }
-
   mu_cngb <- tf$tile(tf$expand_dims(mu_cng, axis = 3L), c(1L, 1L, 1L, B))
 
   phi_cng <- tf$reduce_sum(a * tf$exp(-b * tf$square(mu_cngb - basis_means)), 3L) + LOWER_BOUND
@@ -123,31 +118,32 @@ inference_tensorflow <- function(Y,
   Y__ = tf$stack(Y_tensor_list, axis = 2)
 
   y_log_prob_raw <- nb_pdf$log_prob(Y__)
-  y_log_prob <- tf$transpose(y_log_prob_raw, shape(2,0,1))
-
+  y_log_prob <- tf$transpose(y_log_prob_raw, shape(0,2,1))
+  y_log_prob_sum <- tf$reduce_sum(y_log_prob, 2L) + theta_log
+  p_y_on_c_unorm <- tf$transpose(y_log_prob_sum, shape(1,0))
 
   gamma_fixed = tf$placeholder(dtype = tf$float64, shape = shape(NULL,C))
-  p_y_on_c_unorm <- tf$reduce_sum(y_log_prob, 2L)
-
-  Q1 = -tf$einsum('nc,cng->', gamma_fixed, y_log_prob)
 
+  Q = -tf$einsum('nc,cn->', gamma_fixed, p_y_on_c_unorm)
 
   p_y_on_c_norm <- tf$reshape(tf$reduce_logsumexp(p_y_on_c_unorm, 0L), shape(1,-1))
 
   gamma <- tf$transpose(tf$exp(p_y_on_c_unorm - p_y_on_c_norm))
 
   ## Priors
-  if (use_priors) {
+  if (shrinkage) {
     delta_log_prior <- tfd$Normal(loc = delta_log_mean * rho_,
                                   scale = delta_log_variance)
     delta_log_prob <- -tf$reduce_sum(delta_log_prior$log_prob(delta_log))
-
   }
 
+  theta_log_prior <- tfd$Dirichlet(concentration = tf$constant(dirichlet_concentration, dtype = tf$float64))
+  theta_log_prob <- -theta_log_prior$log_prob(tf$exp(theta_log))
+
   ## End priors
 
-  Q = Q1
-  if (use_priors) {
+  Q <- Q + theta_log_prob
+  if (shrinkage) {
     Q <- Q + delta_log_prob
   }
 
@@ -156,12 +152,10 @@ inference_tensorflow <- function(Y,
   train = optimizer$minimize(Q)
 
   # Marginal log likelihood for monitoring convergence
-  eta_y = tf$reduce_sum(y_log_prob, 2L)
-
-  L_y1 = tf$reduce_sum(tf$reduce_logsumexp(eta_y, 0L))
+  L_y = tf$reduce_sum(tf$reduce_logsumexp(p_y_on_c_unorm, 0L))
 
-  L_y <- L_y1
-  if (use_priors) {
+  L_y <- L_y - theta_log_prob
+  if (shrinkage) {
     L_y <- L_y - delta_log_prob
   }
 
@@ -201,7 +195,7 @@ inference_tensorflow <- function(Y,
 
         if(mi %% 20 == 0) {
           if (verbose) {
-            message(paste(mi, sess$run(Q1, feed_dict = gfd)))
+            message(paste(mi, sess$run(Q, feed_dict = gfd)))
           }
           Q_new <- sess$run(Q, feed_dict = gfd)
           Q_diff = -(Q_new - Q_old) / abs(Q_old)
@@ -218,17 +212,17 @@ inference_tensorflow <- function(Y,
     ll_old <- ll
     log_liks <- c(log_liks, ll)
 
-    if (ll_diff < em_convergence_thres) {
+    if (ll_diff < rel_tol_em) {
       break
     }
   }
 
   # Finished EM - peel off final values
-  variable_list <- list(delta, beta, phi, gamma, mu_ngc, a)
-  variable_names <- c("delta", "beta", "phi", "gamma", "mu", "a")
+  variable_list <- list(delta, beta, phi, gamma, mu_ngc, a, tf$exp(theta_log))
+  variable_names <- c("delta", "beta", "phi", "gamma", "mu", "a", "theta")
 
 
-  if (use_priors) {
+  if (shrinkage) {
     variable_list <- c(variable_list, list(delta_log_mean, delta_log_variance))
     variable_names <- c(variable_names, "ld_mean", "ld_var")
   }
@@ -246,6 +240,7 @@ inference_tensorflow <- function(Y,
   rownames(mle_params$delta) <- rownames(rho)
   colnames(mle_params$delta) <- colnames(rho)
   rownames(mle_params$beta) <- rownames(rho)
+  names(mle_params$theta) <- colnames(rho)
 
 
   cell_type <- get_mle_cell_type(mle_params$gamma)

diff --git a/R/utils.R b/R/utils.R
@@ -121,24 +121,6 @@ initialize_X <- function(X, N, verbose = FALSE) {
   return(X)
 }
 
-#' Extract expression matrix from expression object
-#'
-#' @return The expression matrix and marker gene matrix, with only expressed genes, for input to \code{cellassign}
-#'
-#' @keywords internal
-validate_genes <- function(Y, rho) {
-  expressed_genes <- colSums(Y) > 0
-  if (sum(expressed_genes) < ncol(Y)) {
-    message(paste("Removing genes", paste(colnames(Y)[!expressed_genes], collapse = ", "),
-                  "with <= 0 total counts."))
-
-    Y <- Y[,expressed_genes]
-    rho <- rho[expressed_genes,]
-  }
-
-  return(list(Y=Y, rho=rho))
-}
-
 #' Makes sure the exprs obj is of correct size
 # subset_exprs_obj <- function(exprs_obj, rho) {
 #   G_e <- NULL

diff --git a/man/cellassign.Rd b/man/cellassign.Rd
diff --git a/man/inference_tensorflow.Rd b/man/inference_tensorflow.Rd
diff --git a/man/validate_genes.Rd b/man/validate_genes.Rd
diff --git a/vignettes/introduction-to-cellassign.Rmd b/vignettes/introduction-to-cellassign.Rmd
@@ -100,11 +100,11 @@ We then call `cellassign` using the `cellassign()` function, passing in the abov
 s <- sizeFactors(example_sce)
 
 fit <- cellassign(exprs_obj = example_sce, 
-                        marker_gene_info = example_rho, 
-                        s = s, 
-                        learning_rate = 1e-2, 
-                        shrinkage = TRUE,
-                        verbose = FALSE)
+                  marker_gene_info = example_rho, 
+                  s = s, 
+                  learning_rate = 1e-2, 
+                  shrinkage = TRUE,
+                  verbose = FALSE)
 ```
 
 This returns a `cellassign_fit` object: