diff --git a/DESCRIPTION b/DESCRIPTION index c7f7d79..3bce74c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: CASE Title: Cell-type-specific And Shared EQTL fine-mapping -Version: 0.2.2 +Version: 0.2.3 Authors@R: c( person("Chen", "Lin", , "c.lin@yale.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9821-2578")), @@ -25,9 +25,9 @@ Suggests: testthat (>= 3.0.0), scales, ggplot2, - susieR + tidyr Config/testthat/edition: 3 Depends: - R (>= 2.10) + R (>= 4.0.0) LazyData: true VignetteBuilder: knitr diff --git a/R/CASE.R b/R/CASE.R index e0e8d46..b651958 100644 --- a/R/CASE.R +++ b/R/CASE.R @@ -9,13 +9,15 @@ #' @param hatB M * C matrix of the estimated effects. Alternative summary data (together with hatS) to be provided instead of Z. #' @param hatS M * C matrix of standard errors of the estimated effects. Alternative summary data (together with hatB) to be provided instead of Z. #' @param N either C vector of the sample size, or C * C matrix of the sample size (diagonal) and ovelaps (off-diagonal). If provided with a vector, CASE assumes that each pair of traits overlaps with their minimal sample size. -#' @param V (optional) C * C covariance (correlation) matrix for the noise between traits. If not provided, the default is an identity matrix. +#' @param V (optional) C * C covariance (correlation) matrix for the noise between traits. If not provided, the default is an identity matrix representing no correlations of the error. #' @param cs logical, whether to get credible sets. #' @param ... additional arguments. #' @return A \code{"CASE"} object with the following elements: #' \item{pi:}{L-vector, the prior probabilities of sharing patterns.} #' \item{U:}{L-list of C * C matrix, the prior covariances of sharing patterns.} #' \item{V:}{C * C matrix, the sample-adjusted phenotypical variance.} +#' \item{pip:}{M * C matrix, posterior probability of having eQTL effects per SNP per cell type.} +#' \item{post_mean:}{M * C matrix, average posterior estimates of eQTL effects per SNP per cell type.} #' @examples #' ## A single-trait example. #' set.seed(3) @@ -63,7 +65,7 @@ #' Z[, c] <- hatB[, c] / hatS[, c] #' } #' -#' fit <- CASE(Z = Z, R = R, N = rep(N, C)) +#' fit <- CASE(Z = Z, R = R, N = N) #' # print(fit$sets) #' @export CASE <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, V = NULL, cs = TRUE, ...){ @@ -72,6 +74,7 @@ CASE <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, V = NULL, cs = TRUE, Z = hatB / hatS } Z = as.matrix(Z) + hatBS = transform_Z(Z, N) hatB = hatBS$hatB hatS = hatBS$hatS @@ -84,7 +87,7 @@ CASE <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, V = NULL, cs = TRUE, t2 = Sys.time() res$time = difftime(t2, t1, units = "secs") if (cs){ - res$sets <- get_credible_sets(res$pvalue, R = R) + res$sets <- get_credible_sets(res$pip, R = R) } return(res) diff --git a/R/CASE_models.R b/R/CASE_models.R index 637b543..8527aef 100644 --- a/R/CASE_models.R +++ b/R/CASE_models.R @@ -27,28 +27,30 @@ CASE_train <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, V = NULL, ...){ Z = hatB / hatS } Z = as.matrix(Z) - - hatBS = transform_Z(Z, N) - hatB = hatBS$hatB - hatS = hatBS$hatS - + n.iter = ifelse("n.iter" %in% names(args), args$h, 45) MC.max = ifelse("MC.max" %in% names(args), args$MC.max, 125) MC.sim = ((MC.max / 60)^((1:n.iter-1) / (n.iter - 1)) * 60) %>% round - C <- ncol(hatB) + C <- ncol(Z) if (is.vector(N)){ if (C == 1){ N = matrix(N) - }else{ + }else if (length(N) == C){ N = diag(N) for (i in 1:(C-1)){ for (j in (i+1):C){ N[j, i] = N[i, j] = min(N[i, i], N[j, j]) } } + }else if (length(N) == 1){ + N = matrix(N, C, C) } } + + hatBS = transform_Z(Z, N) + hatB = hatBS$hatB + hatS = hatBS$hatS if (is.null(V)){ V = diag(rep(1, C)) @@ -276,14 +278,14 @@ CASE_train <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, V = NULL, ...){ #' @param R M * M matrix of LD. #' @param hatB M * C matrix of the estimated effects. Alternative summary data (together with hatS) to be provided instead of Z. #' @param hatS M * C matrix of standard errors of the estimated effects. Alternative summary data (together with hatB) to be provided instead of Z. -#' @param N either C vector of the sample size, or C * C matrix of the sample size (diagonal) and ovelaps (off-diagonal). If provided with a vector, CASE assumes that each pair of traits overlaps with their minimal sample size. +#' @param N either 1 or C vector of the sample size, or C * C matrix of the sample size (diagonal) and overlaps (off-diagonal). If provided with a vector, CASE assumes that each pair of traits overlaps with their minimal sample size. #' @param CASE_training A \code{"CASE_training"} object. #' @param ... additional arguments. #' @return A \code{"CASE"} object with the following elements: #' \item{pi:}{L-vector, the prior probabilities of sharing patterns.} #' \item{U:}{L-list of C * C matrix, the prior covariances of sharing patterns.} #' \item{V:}{C * C matrix, the sample-adjusted phenotypical variance.} -#' \item{pvalue:}{M * C matrix, posterior probability of no eQTL effects per SNP per cell type.} +#' \item{pip:}{M * C matrix, posterior probability of having eQTL effects per SNP per cell type.} #' \item{post_mean:}{M * C matrix, average posterior estimates of eQTL effects per SNP per cell type.} #' @importFrom magrittr %>% #' @importFrom stats sd @@ -338,7 +340,7 @@ CASE_test <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, CASE_training, . BB[, , nsim] = gB } - pp[, , ll] = apply(BB[, , -(1:ceiling(MC.sim * 0.3))], 1:((C > 1) + 1), function(x) mean(x == 0)) + pp[, , ll] = apply(BB[, , -(1:ceiling(MC.sim * 0.3))], 1:((C > 1) + 1), function(x) mean(x != 0)) pm[, , ll] = apply(BB[, , -(1:ceiling(MC.sim * 0.3))], 1:((C > 1) + 1), mean) if (ll == 20){ @@ -347,10 +349,10 @@ CASE_test <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, CASE_training, . } } } - pvalue = apply(pp[, , 1:ll], 1:((C > 1) + 1), mean) + pip = apply(pp[, , 1:ll], 1:((C > 1) + 1), mean) post_mean = apply(pm[, , 1:ll], 1:((C > 1) + 1), mean) - return(list(pi = pi, U = U, V = V, pvalue = pvalue, post_mean = post_mean)) + return(list(pi = pi, U = U, V = V, pip = pip, post_mean = post_mean)) } @@ -359,26 +361,26 @@ CASE_test <- function(Z = NULL, R, hatB = NULL, hatS = NULL, N, CASE_training, . #' CASE Obtain Credible Sets #' #' Obtain credible sets for any multi-trait fine-mapping results. -#' @param pvalues (M * C),The pvalues of SNPs. +#' @param pips (M * C),The pips of SNPs. #' @param R M * M matrix of LD. #' @param cor.min minimum correlation in the credible sets -#' @param pip threshold for the sum of PIPs. +#' @param coverage_thres threshold for the sum of PIPs. #' @param ruled_out excluding SNPs with PIPs less than the threshold. #' @return a length C list of credible sets. #' @importFrom magrittr %>% #' @export -get_credible_sets <- function(pvalues, R, cor.min = 0.5, pip = 0.95, ruled_out = 1e-4){ +get_credible_sets <- function(pips, R, cor.min = 0.5, coverage_thres = 0.95, ruled_out = 1e-4){ cat("Start getting credible sets.", "\n") - pvalues = as.matrix(pvalues) - C = ncol(pvalues) + pips = as.matrix(pips) + C = ncol(pips) css = vector("list", C) R1 = R for (ct in 1:C){ - p = pvalues[, ct] + p = pips[, ct] - or = order(p) + or = order(p, decreasing = TRUE) cs = list() coverage = numeric(0) purity = numeric(0) @@ -390,24 +392,24 @@ get_credible_sets <- function(pvalues, R, cor.min = 0.5, pip = 0.95, ruled_out = break } - if (p[kk] <= 1 - pip){ + if (p[kk] >= coverage_thres){ L = L + 1 cs[[L]] = kk flag[cs[[L]]] = FALSE purity[L] = 1 - coverage[L] = 1 - p[kk] + coverage[L] = p[kk] } else{ inds = which(abs(R[kk, ]) >= cor.min & flag) - if (sum(1 - p[inds]) >= pip){ + if (sum(p[inds]) >= coverage_thres){ or_inds = order(p[inds]) - or_inds = or_inds[p[inds[or_inds]] <= 1 - ruled_out] - if (sum(1-p[inds[or_inds]]) > pip){ - best_local_sets = select_first_valid_set(1 - p[inds[or_inds]], R[inds[or_inds], inds[or_inds]], - pip, cor.min) + or_inds = or_inds[p[inds[or_inds]] >= ruled_out] + if (sum(p[inds[or_inds]]) > coverage_thres){ + best_local_sets = select_first_valid_set(p[inds[or_inds]], R[inds[or_inds], inds[or_inds]], + coverage_thres, cor.min) if (!is.null(best_local_sets)){ L = L + 1 cs[[L]] = inds[or_inds[best_local_sets]] - coverage[L] = min(1, sum(1 - p[cs[[L]]])) + coverage[L] = min(1, sum(p[cs[[L]]])) mat = abs(R[cs[[L]], cs[[L]]]) purity[L] = min(mat[upper.tri(mat)]) flag[cs[[L]]] = FALSE diff --git a/R/CASE_uitility.R b/R/CASE_uitility.R index 304a687..20985fa 100644 --- a/R/CASE_uitility.R +++ b/R/CASE_uitility.R @@ -3,6 +3,8 @@ transform_Z <- function(Z, N){ hatB = hatS = Z if (length(dim(N)) == 2){ N = diag(N) + } else if (length(N) == 1){ + N = rep(N, C) } for (c in 1:C){ hatS[, c] <- 1 / sqrt(N[c] - 1 + Z[, c]^2) diff --git a/man/CASE.Rd b/man/CASE.Rd index 3959d18..f84fa0c 100644 --- a/man/CASE.Rd +++ b/man/CASE.Rd @@ -17,7 +17,7 @@ CASE(Z = NULL, R, hatB = NULL, hatS = NULL, N, V = NULL, cs = TRUE, ...) \item{N}{either C vector of the sample size, or C * C matrix of the sample size (diagonal) and ovelaps (off-diagonal). If provided with a vector, CASE assumes that each pair of traits overlaps with their minimal sample size.} -\item{V}{(optional) C * C covariance (correlation) matrix for the noise between traits. If not provided, the default is an identity matrix.} +\item{V}{(optional) C * C covariance (correlation) matrix for the noise between traits. If not provided, the default is an identity matrix representing no correlations of the error.} \item{cs}{logical, whether to get credible sets.} @@ -28,6 +28,8 @@ A \code{"CASE"} object with the following elements: \item{pi:}{L-vector, the prior probabilities of sharing patterns.} \item{U:}{L-list of C * C matrix, the prior covariances of sharing patterns.} \item{V:}{C * C matrix, the sample-adjusted phenotypical variance.} +\item{pip:}{M * C matrix, posterior probability of having eQTL effects per SNP per cell type.} +\item{post_mean:}{M * C matrix, average posterior estimates of eQTL effects per SNP per cell type.} } \description{ Perform Multi-trait Fine-mapping @@ -82,7 +84,7 @@ for (c in 1:C){ Z[, c] <- hatB[, c] / hatS[, c] } -fit <- CASE(Z = Z, R = R, N = rep(N, C)) +fit <- CASE(Z = Z, R = R, N = N) # print(fit$sets) } \references{ diff --git a/man/CASE_test.Rd b/man/CASE_test.Rd index 192100f..ef9b816 100644 --- a/man/CASE_test.Rd +++ b/man/CASE_test.Rd @@ -15,7 +15,7 @@ CASE_test(Z = NULL, R, hatB = NULL, hatS = NULL, N, CASE_training, ...) \item{hatS}{M * C matrix of standard errors of the estimated effects. Alternative summary data (together with hatB) to be provided instead of Z.} -\item{N}{either C vector of the sample size, or C * C matrix of the sample size (diagonal) and ovelaps (off-diagonal). If provided with a vector, CASE assumes that each pair of traits overlaps with their minimal sample size.} +\item{N}{either 1 or C vector of the sample size, or C * C matrix of the sample size (diagonal) and overlaps (off-diagonal). If provided with a vector, CASE assumes that each pair of traits overlaps with their minimal sample size.} \item{CASE_training}{A \code{"CASE_training"} object.} @@ -26,7 +26,7 @@ A \code{"CASE"} object with the following elements: \item{pi:}{L-vector, the prior probabilities of sharing patterns.} \item{U:}{L-list of C * C matrix, the prior covariances of sharing patterns.} \item{V:}{C * C matrix, the sample-adjusted phenotypical variance.} -\item{pvalue:}{M * C matrix, posterior probability of no eQTL effects per SNP per cell type.} +\item{pip:}{M * C matrix, posterior probability of having eQTL effects per SNP per cell type.} \item{post_mean:}{M * C matrix, average posterior estimates of eQTL effects per SNP per cell type.} } \description{ diff --git a/man/get_credible_sets.Rd b/man/get_credible_sets.Rd index 24690b9..b9e6e96 100644 --- a/man/get_credible_sets.Rd +++ b/man/get_credible_sets.Rd @@ -4,18 +4,24 @@ \alias{get_credible_sets} \title{CASE Obtain Credible Sets} \usage{ -get_credible_sets(pvalues, R, cor.min = 0.5, pip = 0.95, ruled_out = 1e-04) +get_credible_sets( + pips, + R, + cor.min = 0.5, + coverage_thres = 0.95, + ruled_out = 1e-04 +) } \arguments{ -\item{pvalues}{(M * C),The pvalues of SNPs.} +\item{pips}{(M * C),The pips of SNPs.} \item{R}{M * M matrix of LD.} \item{cor.min}{minimum correlation in the credible sets} -\item{pip}{threshold for the sum of PIPs.} - \item{ruled_out}{excluding SNPs with PIPs less than the threshold.} + +\item{coverage}{threshold for the sum of PIPs.} } \value{ a length C list of credible sets. diff --git a/vignettes/Introduction_to_CASE.Rmd b/vignettes/Introduction_to_CASE.Rmd index 5f0ffb8..d33cff8 100644 --- a/vignettes/Introduction_to_CASE.Rmd +++ b/vignettes/Introduction_to_CASE.Rmd @@ -21,7 +21,7 @@ First, we load necessary packages. ```{r setup} library(CASE) library(ggplot2) -library(susieR) +library(tidyr) set.seed(1000) ``` @@ -66,7 +66,8 @@ g2 = ggplot(df, aes(x = Var1, y = Var2, fill = corr)) + labs(fill = "corr") print(g2) ``` -Only SNP 10 have eQTL effects for three cell types and SNP 950 have eQTL effects for the first two cell types. + +Only SNP 10 and SNP 950 have eQTL effects for all three cell types and for the first two cell types, respectively. ```{r} print(which(B != 0, arr.ind = TRUE)) @@ -82,24 +83,51 @@ for (i in 1:M){ m1 = summary(lm(Y ~ X[, i])) Z[i, ] = sapply(m1, function(x) x$coefficients[2, 3]) } -Z[c(idx1, idx2), ] +Z[c(idx1, idx2), ] +``` + +Visualization of the Z scores for three cell types. The true eQTLs were surrounded with green circles. The dashed grey and red lines are for nominal significance ($\pm 1.96$) and significance after Bonferroni adjustment ($\pm 4.055$). + +```{r} +df <- as.data.frame(Z) +colnames(df) = paste0("Cell_type_", 1:3) +df$SNP <- 1:nrow(df) + +df_long <- pivot_longer(df, + cols = starts_with("Cell_type"), + names_to = "Variable", + values_to = "Z") +df_long$Highlight <- with(df_long, + (SNP == 10) | + (SNP == 950 & Variable != "Cell_type_3") +) + +ggplot(df_long, aes(x = SNP, y = Z)) + + geom_point() + + geom_point(data = subset(df_long, Highlight), + aes(x = SNP, y = Z), + color = "green", size = 4, shape = 1) + + geom_hline(yintercept = c(1.96, -1.96), linetype = "dashed", color = "grey") + + geom_hline(yintercept = c(4.055, -4.055), linetype = "dashed", color = "red") + + facet_wrap(~ Variable) + + theme_minimal() ``` ## Fine-mapping -First, we try a single-trait fine-mapping method, SuSiE (Wang et al. 2020), for each cell type separately. The results lacks power for the third cell type and for SNP 950. +First, we try CASE fine-mapping for each cell type separately. The results lack power for the third cell type and for SNP 950. ```{r} for (c in 1:C){ - f1 = susie_rss(z = Z[, c], R = R, n = N) + f1 = CASE(Z = Z[, c], R = R, N = N) print(f1$sets) } ``` -However, CASE jointly studies three traits together to improve the power of identifying the causal variants. +However, CASE jointly studies three cell types together to improve the power of identifying the causal variants. ```{r} -fit <- CASE(Z = Z, R = R, N = rep(N, C)) +fit <- CASE(Z = Z, R = R, N = N) print(fit$sets) ```