diff --git a/DESCRIPTION b/DESCRIPTION index beacd03..e9823dc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,7 +11,7 @@ LazyData: false Depends: R (>= 4.0) Imports: S4Vectors, - limma, + scrapper, ggraph, igraph, methods, @@ -30,7 +30,7 @@ Imports: graphics, statmod, Cepo -RoxygenNote: 7.1.1 +RoxygenNote: 7.3.2 Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 0e55029..b2b880f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,7 +20,6 @@ exportClasses(scClassifyTrainModel) exportClasses(scClassifyTrainModelList) import(ggplot2) import(igraph) -import(limma) import(statmod) importClassesFrom(S4Vectors,DataFrame) importClassesFrom(S4Vectors,SimpleList) @@ -45,8 +44,6 @@ importFrom(hopach,hdist) importFrom(hopach,improveordering) importFrom(hopach,is.hdist) importFrom(hopach,vectmatrix) -importFrom(limma,eBayes) -importFrom(limma,lmFit) importFrom(methods,as) importFrom(methods,is) importFrom(methods,new) @@ -57,6 +54,7 @@ importFrom(mixtools,normalmixEM) importFrom(proxy,as.dist) importFrom(proxy,dist) importFrom(proxyC,simil) +importFrom(scrapper,scoreMarkers) importFrom(stats,coef) importFrom(stats,cutree) importFrom(stats,dnorm) diff --git a/R/featureSelection.R b/R/featureSelection.R index 61fa75c..f14477e 100644 --- a/R/featureSelection.R +++ b/R/featureSelection.R @@ -1,17 +1,17 @@ #' @importFrom methods new #' @importFrom Cepo Cepo topGenes -#' @import limma - +#' @importFrom scrapper scoreMarkers featureSelection <- function(exprsMat, trainClass, - feature = c("limma", "DV", "DD", "chisq", "BI", + sampleID = NULL, + feature = c("DM", "DV", "DD", "chisq", "BI", "Cepo"), topN = 50, pSig = 0.001 ){ - feature <- match.arg(feature, c("limma", "DV", "DD", "chisq", "BI", "Cepo"), + feature <- match.arg(feature, c("DM", "DV", "DD", "chisq", "BI", "Cepo"), several.ok = FALSE) @@ -40,69 +40,19 @@ featureSelection <- function(exprsMat, tt <- Cepo::Cepo(as.matrix(exprsMat), trainClass, exprsPct = 0.05) res <- Reduce(union, Cepo::topGenes(tt, n = topN)) } else{ - tt <- doLimma(exprsMat, trainClass) - res <- Reduce(union, lapply(tt, function(t) - rownames(t[t$logFC > 0 & (t$meanPct.2 - t$meanPct.1) > 0.05 & - t$adj.P.Val < pSig,])[seq_len(topN)])) + effectSizes <- scrapper::scoreMarkers(exprsMat, trainClass, sampleID, block.weight.policy = "equal") + res <- Reduce(union, mapply(function(typeD, propDetect) + { + bestNames <- rownames(exprsMat)[order(typeD$mean, decreasing = TRUE)] + bestNames <- bestNames[propDetect$mean > 0.05] + bestNames[seq_len(topN)] + }, effectSizes[["cohens.d"]], effectSizes[["delta.detected"]], SIMPLIFY = FALSE)) } return(res) } - -#' @importFrom limma eBayes lmFit -#' @importFrom methods new - -doLimma <- function(exprsMat, cellTypes, exprs_pct = 0.05){ - - cellTypes <- droplevels(as.factor(cellTypes)) - tt <- list() - for (i in seq_len(nlevels(cellTypes))) { - tmp_celltype <- (ifelse(cellTypes == levels(cellTypes)[i], 1, 0)) - design <- stats::model.matrix(~tmp_celltype) - - - meanExprs <- do.call(cbind, lapply(c(0,1), function(i){ - Matrix::rowMeans(exprsMat[, tmp_celltype == i, drop = FALSE]) - })) - - meanPct <- do.call(cbind, lapply(c(0,1), function(i){ - Matrix::rowSums(exprsMat[, tmp_celltype == i, - drop = FALSE] > 0)/sum(tmp_celltype == i) - })) - - keep <- meanPct[,2] > exprs_pct - - y <- methods::new("EList") - y$E <- exprsMat[keep, ] - fit <- limma::lmFit(y, design = design) - fit <- limma::eBayes(fit, trend = TRUE, robust = TRUE) - tt[[i]] <- limma::topTable(fit, n = Inf, adjust.method = "BH", coef = 2) - - - - if (!is.null(tt[[i]]$ID)) { - tt[[i]] <- tt[[i]][!duplicated(tt[[i]]$ID),] - rownames(tt[[i]]) <- tt[[i]]$ID - } - - tt[[i]]$meanExprs.1 <- meanExprs[rownames(tt[[i]]), 1] - tt[[i]]$meanExprs.2 <- meanExprs[rownames(tt[[i]]), 2] - tt[[i]]$meanPct.1 <- meanPct[rownames(tt[[i]]), 1] - tt[[i]]$meanPct.2 <- meanPct[rownames(tt[[i]]), 2] - } - - - - return(tt) - - -} - - - - doDV <- function(exprsMat, cellTypes){ diff --git a/R/predict_scClassify.R b/R/predict_scClassify.R index 8f20ebe..5aa0148 100644 --- a/R/predict_scClassify.R +++ b/R/predict_scClassify.R @@ -14,8 +14,8 @@ #' @param cor_threshold_high A numeric indicates the highest #' correlation threshold #' @param features A vector indicates the gene selection method, -#' set as "limma" by default. -#' This should be one or more of "limma", "DV", "DD", "chisq", "BI". +#' set as "DM" (Difference in Means) by default. +#' This should be one or more of "DM", "DV", "DD", "chisq", "BI". #' @param algorithm A vector indicates the KNN method that are used, #' set as "WKNN" by default. #' This should be one or more of "WKNN", "KNN", "DWKNN". @@ -46,7 +46,7 @@ #' trainRes = trainClassExample_xin, #' cellTypes_test = wang_cellTypes, #' algorithm = "WKNN", -#' features = c("limma"), +#' features = "DM", #' similarity = c("pearson"), #' prob_threshold = 0.7, #' verbose = TRUE) @@ -67,7 +67,7 @@ predict_scClassify <- function(exprsMat_test, prob_threshold = 0.7, cor_threshold_static = 0.5, cor_threshold_high = 0.7, - features = "limma", + features = "DM", algorithm = "WKNN", similarity = "pearson", cutoff_method = c("dynamic", "static"), @@ -238,8 +238,8 @@ predict_scClassify <- function(exprsMat_test, #' @param prob_threshold A numeric indicates the probability threshold for KNN/WKNN/DWKNN. #' @param cor_threshold_static A numeric indicates the static correlation threshold. #' @param cor_threshold_high A numeric indicates the highest correlation threshold -#' @param features A vector indicates the method to select features, set as "limma" by default. -#' This should be one or more of "limma", "DV", "DD", "chisq", "BI". +#' @param features A vector indicates the method to select features, set as "DM" (Difference in Means) by default. +#' This should be one or more of "DM", "DV", "DD", "chisq", "BI". #' @param algorithm A vector indicates the KNN method that are used, set as "WKNN" by default. #' This should be one or more of "WKNN", "KNN", "DWKNN". #' @param similarity A vector indicates the similarity measure that are used, @@ -269,7 +269,7 @@ predict_scClassify <- function(exprsMat_test, #' trainRes = trainClassExampleJoint, #' cellTypes_test = wang_cellTypes, #' algorithm = "WKNN", -#' features = c("limma"), +#' features = "DM", #' similarity = c("pearson"), #' prob_threshold = 0.7, #' verbose = FALSE) @@ -287,7 +287,7 @@ predict_scClassifyJoint <- function(exprsMat_test, prob_threshold = 0.7, cor_threshold_static = 0.5, cor_threshold_high = 0.7, - features = "limma", + features = "DM", algorithm = "WKNN", similarity = "pearson", cutoff_method = c("dynamic", "static"), @@ -363,7 +363,7 @@ predict_scClassifySingle <- function(exprsMat_test, prob_threshold = 0.7, cor_threshold_static = 0.5, cor_threshold_high = 0.7, - features = "limma", + features = "DM", algorithm = c("WKNN", "KNN", "DWKNN"), similarity = c("pearson", "spearman", "cosine", "jaccard", diff --git a/R/sampleSizeCal.R b/R/sampleSizeCal.R index 47a3258..53fe09c 100644 --- a/R/sampleSizeCal.R +++ b/R/sampleSizeCal.R @@ -170,7 +170,7 @@ runSubSampling <- function(exprsMat, test = cellTypes_test), ...) - acc_cls <- trainRes$testRes$test$pearson_WKNN_limma$classifyRes + acc_cls <- trainRes$testRes$test$pearson_WKNN_DM$classifyRes return(acc_cls) } diff --git a/R/scClassify.R b/R/scClassify.R index bc412da..29c65c7 100644 --- a/R/scClassify.R +++ b/R/scClassify.R @@ -2,12 +2,13 @@ #' #' @param exprsMat_train A matrix of log-transformed expression matrix of reference dataset #' @param cellTypes_train A vector of cell types of reference dataset +#' @param sampleID_train Default: \code{NULL}. A vector of sample IDs, if the cells come from more than one individual. #' @param exprsMat_test A list or a matrix indicates the expression matrices of the query datasets #' @param cellTypes_test A list or a vector indicates cell types of the query datasets (Optional). #' @param tree A vector indicates the method to build hierarchical tree, set as "HOPACH" by default. #' This should be one of "HOPACH" and "HC" (using hclust). -#' @param selectFeatures A vector indicates the gene selection method, set as "limma" by default. -#' This should be one or more of "limma", "DV", "DD", "chisq", "BI" and "Cepo". +#' @param selectFeatures A vector indicates the gene selection method, set as "DM" (Difference in Means) by default. +#' This should be one or more of "DM", "DV", "DD", "chisq", "BI" and "Cepo". #' @param algorithm A vector indicates the KNN method that are used, set as #' "WKNN" by default. Thisshould be one or more of "WKNN", "KNN", "DWKNN". #' @param similarity A vector indicates the similarity measure that are used, @@ -56,7 +57,7 @@ #' cellTypes_test = list(wang = wang_cellTypes), #' tree = "HOPACH", #' algorithm = "WKNN", -#' selectFeatures = c("limma"), +#' selectFeatures = "DM", #' similarity = c("pearson"), #' returnList = FALSE, #' verbose = FALSE) @@ -69,11 +70,12 @@ scClassify <- function(exprsMat_train = NULL, cellTypes_train = NULL, + sampleID_train = NULL, exprsMat_test = NULL, cellTypes_test = NULL, tree = "HOPACH", algorithm = "WKNN", - selectFeatures = "limma", + selectFeatures = "DM", similarity = "pearson", cutoff_method = c("dynamic", "static"), weighted_ensemble = FALSE, @@ -147,7 +149,7 @@ scClassify <- function(exprsMat_train = NULL, tree <- match.arg(tree, c("HOPACH", "HC"), several.ok = FALSE) selectFeatures <- match.arg(selectFeatures, - c("limma", "DV", "DD", "chisq", "BI", "Cepo"), + c("DM", "DV", "DD", "chisq", "BI", "Cepo"), several.ok = TRUE) algorithm <- match.arg(algorithm, @@ -231,6 +233,7 @@ scClassify <- function(exprsMat_train = NULL, ### train_scClassify trainRes <- train_scClassify(exprsMat_train, cellTypes_train, + sampleID_train, tree = tree, selectFeatures = selectFeatures, topN = topN, diff --git a/R/train_scClassify.R b/R/train_scClassify.R index e26ea54..b9af405 100644 --- a/R/train_scClassify.R +++ b/R/train_scClassify.R @@ -2,12 +2,13 @@ #' #' @param exprsMat_train A matrix of log-transformed expression matrix of reference dataset #' @param cellTypes_train A vector of cell types of reference dataset +#' @param sampleID_train Default: \code{NULL}. A vector of sample IDs, if the cells come from more than one individual. #' @param tree A vector indicates the method to build hierarchical tree, #' set as "HOPACH" by default. #' This should be one of "HOPACH" and "HC" (using stats::hclust). #' @param selectFeatures A vector indicates the gene selection method, -#' set as "limma" by default. -#' This should be one or more of "limma", "DV", "DD", "chisq", "BI", "Cepo". +#' set as "DM" (Difference in Means) by default. +#' This should be one or more of "DM", "DV", "DD", "chisq", "BI", "Cepo". #' @param topN An integer indicates the top number of features that are selected #' @param hopach_kmax An integer between 1 and 9 specifying the maximum number of #' children at each node in the HOPACH tree. @@ -32,7 +33,7 @@ #' exprsMat_xin_subset <- scClassify_example$exprsMat_xin_subset #' trainClass <- train_scClassify(exprsMat_train = exprsMat_xin_subset, #' cellTypes_train = xin_cellTypes, -#' selectFeatures = c("limma", "BI"), +#' selectFeatures = c("DM", "BI"), #' returnList = FALSE #' ) #' @@ -46,8 +47,9 @@ train_scClassify <- function(exprsMat_train, cellTypes_train, + sampleID_train = NULL, tree = "HOPACH", - selectFeatures = "limma", + selectFeatures = "DM", topN = 50, hopach_kmax = 5, pSig = 0.05, @@ -69,7 +71,7 @@ train_scClassify <- function(exprsMat_train, # Matching the argument of feature selection method selectFeatures <- match.arg(selectFeatures, - c("limma", "DV", "DD", "chisq", "BI", "Cepo"), + c("DM", "DV", "DD", "chisq", "BI", "Cepo"), several.ok = TRUE) @@ -133,6 +135,7 @@ train_scClassify <- function(exprsMat_train, for (train_list_idx in seq_len(length(exprsMat_train))) { trainRes[[train_list_idx]] <- train_scClassifySingle(exprsMat_train[[train_list_idx]], cellTypes_train[[train_list_idx]], + sampleID_train[[train_list_idx]], tree = tree, selectFeatures = selectFeatures, topN = topN, @@ -149,6 +152,7 @@ train_scClassify <- function(exprsMat_train, trainRes <- train_scClassifySingle(exprsMat_train, cellTypes_train, + sampleID_train, tree = tree, selectFeatures = selectFeatures, topN = topN, @@ -211,8 +215,9 @@ train_scClassify <- function(exprsMat_train, train_scClassifySingle <- function(exprsMat_train, cellTypes_train, + sampleID_train = NULL, tree = "HOPACH", - selectFeatures = "limma", + selectFeatures = "DM", topN = 50, hopach_kmax = 5, pSig = 0.05, @@ -247,7 +252,7 @@ train_scClassifySingle <- function(exprsMat_train, # Matching the argument of feature selection method selectFeatures <- match.arg(selectFeatures, - c("limma", "DV", "DD", "chisq", "BI", "Cepo"), + c("DM", "DV", "DD", "chisq", "BI", "Cepo"), several.ok = TRUE) @@ -257,10 +262,12 @@ train_scClassifySingle <- function(exprsMat_train, # Select the features to construct tree - tt <- doLimma(exprsMat_train, cellTypes_train) - de <- Reduce(union, lapply(tt, function(t) - rownames(t)[seq_len(max(min(50, sum(t$adj.P.Val < 0.001)), 30))])) - de <- na.omit(de) + effectSizes <- scrapper::scoreMarkers(exprsMat_train, cellTypes_train, sampleID_train, block.weight.policy = "equal") + de <- Reduce(union, mapply(function(typeD, propDetect) + { + bestNames <- rownames(exprsMat_train)[order(typeD$mean, decreasing = TRUE)] + bestNames[max(30, nrow(exprsMat_train))] + }, effectSizes[["cohens.d"]], effectSizes[["delta.detected"]], SIMPLIFY = FALSE)) if (verbose) { print(paste("Number of genes selected to construct HOPACH tree", length(de))) @@ -420,11 +427,11 @@ currentClass <- function(cellTypes, cutree_res){ hierarchyKNNcor <- function(exprsMat, cellTypes, cutree_list, - feature = c("limma", "DV", "DD", "chisq", "BI"), + feature = c("DM", "DV", "DD", "chisq", "BI"), topN = 50, pSig = 0.001, verbose= TRUE){ - feature <- match.arg(feature, c("limma", "DV", "DD", "chisq", "BI", "Cepo")) + feature <- match.arg(feature, c("DM", "DV", "DD", "chisq", "BI", "Cepo")) numHierchy <- length(cutree_list) levelModel <- list() levelHVG <- list() diff --git a/README.md b/README.md index 103c9bd..e6e6485 100644 --- a/README.md +++ b/README.md @@ -7,12 +7,11 @@ Single cell classification via cell-type hierarchies based on ensemble learning ## Installation - -Install Bioconductor packages `S4Vectors`, `hopach` and `limma` packages using `BiocManager`: +Install Bioconductor packages `S4Vectors`, `hopach` and `scrapper` packages using `BiocManager`: ```r # install.packages("BiocManager") -BiocManager::install(c("S4Vectors", "hopach", "limma")) +BiocManager::install(c("S4Vectors", "hopach", "scrapper")) ``` Then install the latest `scClassify` using `devtools` (For R >= 4.0): diff --git a/data/trainClassExample_wang.rda b/data/trainClassExample_wang.rda index 11569d9..439189a 100644 Binary files a/data/trainClassExample_wang.rda and b/data/trainClassExample_wang.rda differ diff --git a/data/trainClassExample_xin.rda b/data/trainClassExample_xin.rda index 625beb3..475b964 100644 Binary files a/data/trainClassExample_xin.rda and b/data/trainClassExample_xin.rda differ diff --git a/man/predict_scClassify.Rd b/man/predict_scClassify.Rd index b651a3b..f475bae 100644 --- a/man/predict_scClassify.Rd +++ b/man/predict_scClassify.Rd @@ -12,7 +12,7 @@ predict_scClassify( prob_threshold = 0.7, cor_threshold_static = 0.5, cor_threshold_high = 0.7, - features = "limma", + features = "DM", algorithm = "WKNN", similarity = "pearson", cutoff_method = c("dynamic", "static"), @@ -45,8 +45,8 @@ correlation threshold.} correlation threshold} \item{features}{A vector indicates the gene selection method, -set as "limma" by default. -This should be one or more of "limma", "DV", "DD", "chisq", "BI".} +set as "DM" (Difference in Means) by default. +This should be one or more of "DM", "DV", "DD", "chisq", "BI".} \item{algorithm}{A vector indicates the KNN method that are used, set as "WKNN" by default. @@ -90,7 +90,7 @@ pred_res <- predict_scClassify(exprsMat_test = exprsMat_wang_subset, trainRes = trainClassExample_xin, cellTypes_test = wang_cellTypes, algorithm = "WKNN", -features = c("limma"), +features = "DM", similarity = c("pearson"), prob_threshold = 0.7, verbose = TRUE) diff --git a/man/predict_scClassifyJoint.Rd b/man/predict_scClassifyJoint.Rd index e8febbe..83e530f 100644 --- a/man/predict_scClassifyJoint.Rd +++ b/man/predict_scClassifyJoint.Rd @@ -12,7 +12,7 @@ predict_scClassifyJoint( prob_threshold = 0.7, cor_threshold_static = 0.5, cor_threshold_high = 0.7, - features = "limma", + features = "DM", algorithm = "WKNN", similarity = "pearson", cutoff_method = c("dynamic", "static"), @@ -36,8 +36,8 @@ predict_scClassifyJoint( \item{cor_threshold_high}{A numeric indicates the highest correlation threshold} -\item{features}{A vector indicates the method to select features, set as "limma" by default. -This should be one or more of "limma", "DV", "DD", "chisq", "BI".} +\item{features}{A vector indicates the method to select features, set as "DM" (Difference in Means) by default. +This should be one or more of "DM", "DV", "DD", "chisq", "BI".} \item{algorithm}{A vector indicates the KNN method that are used, set as "WKNN" by default. This should be one or more of "WKNN", "KNN", "DWKNN".} @@ -77,7 +77,7 @@ pred_res_joint <- predict_scClassifyJoint(exprsMat_test = exprsMat_wang_subset, trainRes = trainClassExampleJoint, cellTypes_test = wang_cellTypes, algorithm = "WKNN", -features = c("limma"), +features = "DM", similarity = c("pearson"), prob_threshold = 0.7, verbose = FALSE) diff --git a/man/scClassify.Rd b/man/scClassify.Rd index e0c1a7c..41c7721 100644 --- a/man/scClassify.Rd +++ b/man/scClassify.Rd @@ -7,11 +7,12 @@ scClassify( exprsMat_train = NULL, cellTypes_train = NULL, + sampleID_train = NULL, exprsMat_test = NULL, cellTypes_test = NULL, tree = "HOPACH", algorithm = "WKNN", - selectFeatures = "limma", + selectFeatures = "DM", similarity = "pearson", cutoff_method = c("dynamic", "static"), weighted_ensemble = FALSE, @@ -36,6 +37,8 @@ scClassify( \item{cellTypes_train}{A vector of cell types of reference dataset} +\item{sampleID_train}{Default: \code{NULL}. A vector of sample IDs, if the cells come from more than one individual.} + \item{exprsMat_test}{A list or a matrix indicates the expression matrices of the query datasets} \item{cellTypes_test}{A list or a vector indicates cell types of the query datasets (Optional).} @@ -46,8 +49,8 @@ This should be one of "HOPACH" and "HC" (using hclust).} \item{algorithm}{A vector indicates the KNN method that are used, set as "WKNN" by default. Thisshould be one or more of "WKNN", "KNN", "DWKNN".} -\item{selectFeatures}{A vector indicates the gene selection method, set as "limma" by default. -This should be one or more of "limma", "DV", "DD", "chisq", "BI" and "Cepo".} +\item{selectFeatures}{A vector indicates the gene selection method, set as "DM" (Difference in Means) by default. +This should be one or more of "DM", "DV", "DD", "chisq", "BI" and "Cepo".} \item{similarity}{A vector indicates the similarity measure that are used, set as "pearson" by default. This should be one or more of "pearson", @@ -113,7 +116,7 @@ exprsMat_test = list(wang = exprsMat_wang_subset), cellTypes_test = list(wang = wang_cellTypes), tree = "HOPACH", algorithm = "WKNN", -selectFeatures = c("limma"), +selectFeatures = "DM", similarity = c("pearson"), returnList = FALSE, verbose = FALSE) diff --git a/man/train_scClassify.Rd b/man/train_scClassify.Rd index b100184..d59ca3c 100644 --- a/man/train_scClassify.Rd +++ b/man/train_scClassify.Rd @@ -7,8 +7,9 @@ train_scClassify( exprsMat_train, cellTypes_train, + sampleID_train = NULL, tree = "HOPACH", - selectFeatures = "limma", + selectFeatures = "DM", topN = 50, hopach_kmax = 5, pSig = 0.05, @@ -26,13 +27,15 @@ train_scClassify( \item{cellTypes_train}{A vector of cell types of reference dataset} +\item{sampleID_train}{Default: \code{NULL}. A vector of sample IDs, if the cells come from more than one individual.} + \item{tree}{A vector indicates the method to build hierarchical tree, set as "HOPACH" by default. This should be one of "HOPACH" and "HC" (using stats::hclust).} \item{selectFeatures}{A vector indicates the gene selection method, -set as "limma" by default. -This should be one or more of "limma", "DV", "DD", "chisq", "BI", "Cepo".} +set as "DM" (Difference in Means) by default. +This should be one or more of "DM", "DV", "DD", "chisq", "BI", "Cepo".} \item{topN}{An integer indicates the top number of features that are selected} @@ -71,7 +74,7 @@ xin_cellTypes <- scClassify_example$xin_cellTypes exprsMat_xin_subset <- scClassify_example$exprsMat_xin_subset trainClass <- train_scClassify(exprsMat_train = exprsMat_xin_subset, cellTypes_train = xin_cellTypes, -selectFeatures = c("limma", "BI"), +selectFeatures = c("DM", "BI"), returnList = FALSE ) diff --git a/vignettes/pretrainedModel.Rmd b/vignettes/pretrainedModel.Rmd index 3024a71..1235dc6 100644 --- a/vignettes/pretrainedModel.Rmd +++ b/vignettes/pretrainedModel.Rmd @@ -43,7 +43,6 @@ model of scClassify to predict cell types. A pretrained model is a A list of pretrained model can be found in https://sydneybiox.github.io/scClassify/index.html. - First, install `scClassify`, install `BiocManager` and use `BiocManager::install` to install `scClassify` package. @@ -55,10 +54,6 @@ if (!requireNamespace("BiocManager", quietly = TRUE)) { BiocManager::install("scClassify") ``` - - - - # Setting up the data We assume that you have *log-transformed* (size-factor normalized) matrices as @@ -86,8 +81,8 @@ trainClassExample_xin ``` In this pretrained model, we have selected the genes based on Differential -Expression using limma. To check the genes that are available -in the pretrained model: +Expression using Cohen's *d* To check the genes that are available +in the pretrained model: ```{r} @@ -113,7 +108,7 @@ pred_res <- predict_scClassify(exprsMat_test = exprsMat_wang_subset, trainRes = trainClassExample_xin, cellTypes_test = wang_cellTypes, algorithm = "WKNN", - features = c("limma"), + features = "DM", similarity = c("pearson", "spearman"), prob_threshold = 0.7, verbose = TRUE) @@ -123,24 +118,19 @@ Noted that the `cellType_test` is not a required input. For datasets with unknown labels, users can simply leave it as `cellType_test = NULL`. - - Prediction results for pearson as the similarity metric: ```{r} -table(pred_res$pearson_WKNN_limma$predRes, wang_cellTypes) +table(pred_res$pearson_WKNN_DM$predRes, wang_cellTypes) ``` Prediction results for spearman as the similarity metric: ```{r} -table(pred_res$spearman_WKNN_limma$predRes, wang_cellTypes) +table(pred_res$spearman_WKNN_DM$predRes, wang_cellTypes) ``` - - - # Session Info ```{r} diff --git a/vignettes/scClassify.Rmd b/vignettes/scClassify.Rmd index 9752018..ca95aab 100644 --- a/vignettes/scClassify.Rmd +++ b/vignettes/scClassify.Rmd @@ -103,7 +103,7 @@ scClassify_res <- scClassify(exprsMat_train = exprsMat_xin_subset, cellTypes_test = list(wang = wang_cellTypes), tree = "HOPACH", algorithm = "WKNN", - selectFeatures = c("limma"), + selectFeatures = "DM", similarity = c("pearson"), returnList = FALSE, verbose = FALSE) @@ -123,7 +123,7 @@ Noted that `scClassify_res$trainRes` is a `scClassifyTrainModel` class. Check the prediction results. ```{r} -table(scClassify_res$testRes$wang$pearson_WKNN_limma$predRes, wang_cellTypes) +table(scClassify_res$testRes$wang$pearson_WKNN_DM$predRes, wang_cellTypes) ``` @@ -134,13 +134,13 @@ table(scClassify_res$testRes$wang$pearson_WKNN_limma$predRes, wang_cellTypes) We next perform ensemble `scClassify` by using Xin et al. as our reference dataset and Wang et al. data as our query data. -We use `WKNN` as the KNN algorithm, `DE` as the gene selection method, +We use `WKNN` as the KNN algorithm, `DM` as the gene selection method, and `pearson` and `spearman` as the similarity calculation methods. Thus, we will generate two combinations of gene selection models and similarity metrics as training classifiers: -1. `WKNN` + `DE` + `pearson` -2. `WKNN` + `DE` + `spearman` +1. `WKNN` + `DM` + `pearson` +2. `WKNN` + `DM` + `spearman` Here, we will weight these two classifiers equally by setting `weighted_ensemble = FALSE`. By default this is set as `TRUE`, @@ -154,7 +154,7 @@ scClassify_res_ensemble <- scClassify(exprsMat_train = exprsMat_xin_subset, cellTypes_test = list(wang = wang_cellTypes), tree = "HOPACH", algorithm = "WKNN", - selectFeatures = c("limma"), + selectFeatures = "DM", similarity = c("pearson", "cosine"), weighted_ensemble = FALSE, returnList = FALSE, @@ -165,8 +165,8 @@ scClassify_res_ensemble <- scClassify(exprsMat_train = exprsMat_xin_subset, We can compare the two base classifiers predictions as below. ```{r} -table(scClassify_res_ensemble$testRes$wang$pearson_WKNN_limma$predRes, - scClassify_res_ensemble$testRes$wang$cosine_WKNN_limma$predRes) +table(scClassify_res_ensemble$testRes$wang$pearson_WKNN_DM$predRes, + scClassify_res_ensemble$testRes$wang$cosine_WKNN_DM$predRes) ``` @@ -189,12 +189,12 @@ we will calculate the training error of the reference data as the weights for the individual classifiers. Here, we illustrate the training function with gene selection methods -based on differential expression ("limma") and biomodal distribution ("BI"). +based on difference in means `"DM"` and biomodal distribution `"BI"`. ```{r} trainClass <- train_scClassify(exprsMat_train = exprsMat_xin_subset, cellTypes_train = xin_cellTypes, - selectFeatures = c("limma", "BI"), + selectFeatures = c("DM", "BI"), returnList = FALSE ) ```