6 changes: 3 additions & 3 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Type: Package
Package: CASE
Title: Cell-type-specific And Shared EQTL fine-mapping
Version: 0.2.2
Version: 0.2.3
Authors@R: c(
person("Chen", "Lin", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-9821-2578")),
@@ -25,9 +25,9 @@ Suggests:
testthat (>= 3.0.0),
scales,
ggplot2,
susieR
tidyr
Config/testthat/edition: 3
Depends:
R (>= 2.10)
R (>= 4.0.0)
LazyData: true
VignetteBuilder: knitr
9 changes: 6 additions & 3 deletions R/CASE.R
@@ -9,13 +9,15 @@
#' @param hatB M * C matrix of the estimated effects. Alternative summary data (together with hatS) to be provided instead of Z.
#' @param hatS M * C matrix of standard errors of the estimated effects. Alternative summary data (together with hatB) to be provided instead of Z.
#' @param N either C vector of the sample size, or C * C matrix of the sample size (diagonal) and overlaps (off-diagonal). If provided with a vector, CASE assumes that each pair of traits overlaps with their minimal sample size.
#' @param V (optional) C * C covariance (correlation) matrix for the noise between traits. If not provided, the default is an identity matrix.
#' @param V (optional) C * C covariance (correlation) matrix for the noise between traits. If not provided, the default is an identity matrix, i.e. uncorrelated errors across traits.
#' @param cs logical, whether to get credible sets.
#' @param ... additional arguments.
#' @return A \code{"CASE"} object with the following elements:
#' \item{pi:}{L-vector, the prior probabilities of sharing patterns.}
#' \item{U:}{L-list of C * C matrix, the prior covariances of sharing patterns.}
#' \item{V:}{C * C matrix, the sample-adjusted phenotypical variance.}
#' \item{pip:}{M * C matrix, posterior probability of having eQTL effects per SNP per cell type.}
#' \item{post_mean:}{M * C matrix, average posterior estimates of eQTL effects per SNP per cell type.}
#' @examples
#' ## A single-trait example.
#' set.seed(3)
@@ -63,7 +65,7 @@
#' Z[, c] <- hatB[, c] / hatS[, c]
#' }
#'
#' fit <- CASE(Z = Z, R = R, N = rep(N, C))
#' fit <- CASE(Z = Z, R = R, N = N)
#' # print(fit$sets)
#' @export
CASE <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, V = NULL, cs = TRUE, ...){
@@ -72,6 +74,7 @@ CASE <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, V = NULL, cs = TRUE,
Z = hatB / hatS
}
Z = as.matrix(Z)

hatBS = transform_Z(Z, N)
hatB = hatBS$hatB
hatS = hatBS$hatS
@@ -84,7 +87,7 @@ CASE <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, V = NULL, cs = TRUE,
t2 = Sys.time()
res$time = difftime(t2, t1, units = "secs")
if (cs){
res$sets <- get_credible_sets(res$pvalue, R = R)
res$sets <- get_credible_sets(res$pip, R = R)
}

return(res)
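The example above now passes N directly. Per the documentation in this file, N may be a per-trait vector or a C * C matrix with pairwise overlaps off the diagonal; the CASE_train and transform_Z changes below also handle a single scalar. A minimal sketch of the three forms (values are illustrative only, not from the package):

```r
# Sketch of the accepted forms of N for C = 3 cell types (illustrative values).
N_scalar <- 500                      # one shared sample size, as in the vignette call
N_vector <- c(500, 480, 450)         # per-trait sizes; pairwise overlap is then
                                     # assumed to be the minimum of each pair
N_matrix <- diag(c(500, 480, 450))   # explicit form: sizes on the diagonal ...
N_matrix[N_matrix == 0] <- 450       # ... and pairwise overlaps off the diagonal
# fit <- CASE(Z = Z, R = R, N = N_vector)   # any of the three can be supplied
```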
56 changes: 29 additions & 27 deletions R/CASE_models.R
@@ -27,28 +27,30 @@ CASE_train <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, V = NULL, ...){
Z = hatB / hatS
}
Z = as.matrix(Z)

hatBS = transform_Z(Z, N)
hatB = hatBS$hatB
hatS = hatBS$hatS


n.iter = ifelse("n.iter" %in% names(args), args$n.iter, 45)
MC.max = ifelse("MC.max" %in% names(args), args$MC.max, 125)
MC.sim = ((MC.max / 60)^((1:n.iter-1) / (n.iter - 1)) * 60) %>% round
C <- ncol(hatB)
C <- ncol(Z)

if (is.vector(N)){
if (C == 1){
N = matrix(N)
}else{
}else if (length(N) == C){
N = diag(N)
for (i in 1:(C-1)){
for (j in (i+1):C){
N[j, i] = N[i, j] = min(N[i, i], N[j, j])
}
}
}else if (length(N) == 1){
N = matrix(N, C, C)
}
}

hatBS = transform_Z(Z, N)
hatB = hatBS$hatB
hatS = hatBS$hatS

if (is.null(V)){
V = diag(rep(1, C))
@@ -276,14 +278,14 @@ CASE_train <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, V = NULL, ...){
#' @param R M * M matrix of LD.
#' @param hatB M * C matrix of the estimated effects. Alternative summary data (together with hatS) to be provided instead of Z.
#' @param hatS M * C matrix of standard errors of the estimated effects. Alternative summary data (together with hatB) to be provided instead of Z.
#' @param N either C vector of the sample size, or C * C matrix of the sample size (diagonal) and ovelaps (off-diagonal). If provided with a vector, CASE assumes that each pair of traits overlaps with their minimal sample size.
#' @param N either 1 or C vector of the sample size, or C * C matrix of the sample size (diagonal) and overlaps (off-diagonal). If provided with a vector, CASE assumes that each pair of traits overlaps with their minimal sample size.
#' @param CASE_training A \code{"CASE_training"} object.
#' @param ... additional arguments.
#' @return A \code{"CASE"} object with the following elements:
#' \item{pi:}{L-vector, the prior probabilities of sharing patterns.}
#' \item{U:}{L-list of C * C matrix, the prior covariances of sharing patterns.}
#' \item{V:}{C * C matrix, the sample-adjusted phenotypical variance.}
#' \item{pvalue:}{M * C matrix, posterior probability of no eQTL effects per SNP per cell type.}
#' \item{pip:}{M * C matrix, posterior probability of having eQTL effects per SNP per cell type.}
#' \item{post_mean:}{M * C matrix, average posterior estimates of eQTL effects per SNP per cell type.}
#' @importFrom magrittr %>%
#' @importFrom stats sd
@@ -338,7 +340,7 @@ CASE_test <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, CASE_training, .
BB[, , nsim] = gB
}

pp[, , ll] = apply(BB[, , -(1:ceiling(MC.sim * 0.3))], 1:((C > 1) + 1), function(x) mean(x == 0))
pp[, , ll] = apply(BB[, , -(1:ceiling(MC.sim * 0.3))], 1:((C > 1) + 1), function(x) mean(x != 0))
pm[, , ll] = apply(BB[, , -(1:ceiling(MC.sim * 0.3))], 1:((C > 1) + 1), mean)

if (ll == 20){
@@ -347,10 +349,10 @@
}
}
}
pvalue = apply(pp[, , 1:ll], 1:((C > 1) + 1), mean)
pip = apply(pp[, , 1:ll], 1:((C > 1) + 1), mean)
post_mean = apply(pm[, , 1:ll], 1:((C > 1) + 1), mean)

return(list(pi = pi, U = U, V = V, pvalue = pvalue, post_mean = post_mean))
return(list(pi = pi, U = U, V = V, pip = pip, post_mean = post_mean))
}


@@ -359,26 +361,26 @@ CASE_test <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, CASE_training, .
#' CASE Obtain Credible Sets
#'
#' Obtain credible sets for any multi-trait fine-mapping results.
#' @param pvalues (M * C),The pvalues of SNPs.
#' @param pips (M * C), the PIPs of SNPs.
#' @param R M * M matrix of LD.
#' @param cor.min minimum correlation in the credible sets
#' @param pip threshold for the sum of PIPs.
#' @param coverage_thres threshold for the sum of PIPs.
#' @param ruled_out excluding SNPs with PIPs less than the threshold.
#' @return a length C list of credible sets.
#' @importFrom magrittr %>%
#' @export
get_credible_sets <- function(pvalues, R, cor.min = 0.5, pip = 0.95, ruled_out = 1e-4){
get_credible_sets <- function(pips, R, cor.min = 0.5, coverage_thres = 0.95, ruled_out = 1e-4){
cat("Start getting credible sets.", "\n")

pvalues = as.matrix(pvalues)
C = ncol(pvalues)
pips = as.matrix(pips)
C = ncol(pips)
css = vector("list", C)
R1 = R

for (ct in 1:C){
p = pvalues[, ct]
p = pips[, ct]

or = order(p)
or = order(p, decreasing = TRUE)
cs = list()
coverage = numeric(0)
purity = numeric(0)
@@ -390,24 +392,24 @@ get_credible_sets <- function(pvalues, R, cor.min = 0.5, pip = 0.95, ruled_out =
break
}

if (p[kk] <= 1 - pip){
if (p[kk] >= coverage_thres){
L = L + 1
cs[[L]] = kk
flag[cs[[L]]] = FALSE
purity[L] = 1
coverage[L] = 1 - p[kk]
coverage[L] = p[kk]
} else{
inds = which(abs(R[kk, ]) >= cor.min & flag)
if (sum(1 - p[inds]) >= pip){
if (sum(p[inds]) >= coverage_thres){
or_inds = order(p[inds])
or_inds = or_inds[p[inds[or_inds]] <= 1 - ruled_out]
if (sum(1-p[inds[or_inds]]) > pip){
best_local_sets = select_first_valid_set(1 - p[inds[or_inds]], R[inds[or_inds], inds[or_inds]],
pip, cor.min)
or_inds = or_inds[p[inds[or_inds]] >= ruled_out]
if (sum(p[inds[or_inds]]) > coverage_thres){
best_local_sets = select_first_valid_set(p[inds[or_inds]], R[inds[or_inds], inds[or_inds]],
coverage_thres, cor.min)
if (!is.null(best_local_sets)){
L = L + 1
cs[[L]] = inds[or_inds[best_local_sets]]
coverage[L] = min(1, sum(1 - p[cs[[L]]]))
coverage[L] = min(1, sum(p[cs[[L]]]))
mat = abs(R[cs[[L]], cs[[L]]])
purity[L] = min(mat[upper.tri(mat)])
flag[cs[[L]]] = FALSE
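With the rename, get_credible_sets consumes PIPs directly: within each cell type, SNPs are visited in decreasing PIP order, and a set is formed from SNPs correlated at |r| >= cor.min whose PIPs sum to at least coverage_thres, ignoring SNPs with PIPs below ruled_out. A simplified, self-contained sketch of that greedy idea follows; it is an illustration under those assumptions, not the package's exact algorithm, which additionally enforces purity via select_first_valid_set.

```r
# Simplified greedy credible-set sketch for one cell type (illustration only;
# the package version also checks set purity via select_first_valid_set).
greedy_cs <- function(pip, R, cor.min = 0.5, coverage_thres = 0.95, ruled_out = 1e-4) {
  sets <- list(); used <- rep(FALSE, length(pip))
  for (kk in order(pip, decreasing = TRUE)) {
    if (used[kk] || pip[kk] < ruled_out) next
    # candidate SNPs: correlated with the lead SNP, not yet used, not ruled out
    cand <- which(abs(R[kk, ]) >= cor.min & !used & pip >= ruled_out)
    cand <- cand[order(pip[cand], decreasing = TRUE)]
    cum  <- cumsum(pip[cand])
    if (max(cum) >= coverage_thres) {
      cs <- cand[seq_len(which(cum >= coverage_thres)[1])]  # smallest set reaching coverage
      sets[[length(sets) + 1]] <- cs
      used[cs] <- TRUE
    }
  }
  sets
}

# Toy data: SNPs 1 and 2 are in strong LD and share the signal.
pip <- c(0.60, 0.38, 0.01)
R   <- matrix(c(1, 0.9, 0.1,  0.9, 1, 0.1,  0.1, 0.1, 1), 3, 3)
greedy_cs(pip, R)   # one credible set containing SNPs 1 and 2
```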
2 changes: 2 additions & 0 deletions R/CASE_uitility.R
@@ -3,6 +3,8 @@ transform_Z <- function(Z, N){
hatB = hatS = Z
if (length(dim(N)) == 2){
N = diag(N)
} else if (length(N) == 1){
N = rep(N, C)
}
for (c in 1:C){
hatS[, c] <- 1 / sqrt(N[c] - 1 + Z[, c]^2)
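The hatS line above is the usual conversion from a z-score to a standard error on the standardized scale. A small sketch of the transform; the beta line is the standard companion formula and is an assumption here, since it falls outside the shown hunk:

```r
# z-score to standardized (beta, se) conversion (toy values).
# The `se` formula matches the hatS line in the hunk above; the `beta` line is
# the standard companion formula and is assumed here (it is outside the hunk).
z <- c(4.2, -1.3, 0.5)        # toy z-scores for one trait
n <- 500                      # sample size for that trait
se   <- 1 / sqrt(n - 1 + z^2) # standard error on the standardized scale
beta <- z * se                # implied standardized effect size
```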
6 changes: 4 additions & 2 deletions man/CASE.Rd


4 changes: 2 additions & 2 deletions man/CASE_test.Rd


14 changes: 10 additions & 4 deletions man/get_credible_sets.Rd


42 changes: 35 additions & 7 deletions vignettes/Introduction_to_CASE.Rmd
@@ -21,7 +21,7 @@ First, we load necessary packages.
```{r setup}
library(CASE)
library(ggplot2)
library(susieR)
library(tidyr)
set.seed(1000)
```

@@ -66,7 +66,8 @@ g2 = ggplot(df, aes(x = Var1, y = Var2, fill = corr)) +
labs(fill = "corr")
print(g2)
```
Only SNP 10 have eQTL effects for three cell types and SNP 950 have eQTL effects for the first two cell types.

Only two SNPs are causal: SNP 10 has eQTL effects in all three cell types, and SNP 950 in the first two.

```{r}
print(which(B != 0, arr.ind = TRUE))
@@ -82,24 +83,51 @@ for (i in 1:M){
m1 = summary(lm(Y ~ X[, i]))
Z[i, ] = sapply(m1, function(x) x$coefficients[2, 3])
}
Z[c(idx1, idx2), ]
Z[c(idx1, idx2), ]
```

Visualization of the Z scores for the three cell types. The true eQTLs are circled in green. The dashed grey and red lines mark nominal significance ($\pm 1.96$) and significance after Bonferroni adjustment ($\pm 4.055$).

```{r}
df <- as.data.frame(Z)
colnames(df) = paste0("Cell_type_", 1:3)
df$SNP <- 1:nrow(df)

df_long <- pivot_longer(df,
cols = starts_with("Cell_type"),
names_to = "Variable",
values_to = "Z")
df_long$Highlight <- with(df_long,
(SNP == 10) |
(SNP == 950 & Variable != "Cell_type_3")
)

ggplot(df_long, aes(x = SNP, y = Z)) +
geom_point() +
geom_point(data = subset(df_long, Highlight),
aes(x = SNP, y = Z),
color = "green", size = 4, shape = 1) +
geom_hline(yintercept = c(1.96, -1.96), linetype = "dashed", color = "grey") +
geom_hline(yintercept = c(4.055, -4.055), linetype = "dashed", color = "red") +
facet_wrap(~ Variable) +
theme_minimal()
```

## Fine-mapping

First, we try a single-trait fine-mapping method, SuSiE (Wang et al. 2020), for each cell type separately. The results lacks power for the third cell type and for SNP 950.
First, we try CASE fine-mapping for each cell type separately. The results lack power for the third cell type and for SNP 950.

```{r}
for (c in 1:C){
f1 = susie_rss(z = Z[, c], R = R, n = N)
f1 = CASE(Z = Z[, c], R = R, N = N)
print(f1$sets)
}
```

However, CASE jointly studies three traits together to improve the power of identifying the causal variants.
However, CASE jointly studies three cell types together to improve the power of identifying the causal variants.

```{r}
fit <- CASE(Z = Z, R = R, N = rep(N, C))
fit <- CASE(Z = Z, R = R, N = N)
print(fit$sets)
```
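
Beyond the credible sets, the fitted object also returns the posterior summaries renamed in this release (`pip` and `post_mean`, both M * C matrices). A short sketch of inspecting them at the two simulated eQTLs, assuming `fit`, `idx1`, and `idx2` from the chunks above:

```{r}
# Posterior summaries at the simulated eQTLs (a sketch; `fit` is the joint CASE
# fit above, and idx1/idx2 index the true causal SNPs defined earlier).
fit$pip[c(idx1, idx2), ]        # posterior probabilities of nonzero effects
fit$post_mean[c(idx1, idx2), ]  # average posterior effect estimates
```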
