diff --git a/CHANGELOG.md b/CHANGELOG.md index 890c4eb7..6f6c50ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,9 +3,12 @@ ## New functionality * Added `metrics/kbet_pg` and `metrics/kbet_pg_label` components (PR #52). + * Added `methods/stacas` new method (PR #58). - Add non-supervised version of STACAS tool for integration of single-cell transcriptomics data. This functionality enables correction of batch effects while preserving biological variability without requiring prior cell type annotations. + * Added `method/drvi` component (PR #61). + * Added `ARI_batch` and `NMI_batch` to `metrics/clustering_overlap` (PR #68). * Added `metrics/cilisi` new metric component (PR #57). @@ -14,6 +17,8 @@ overcorrected datasets with removed cell type signals. We propose adding this metric to substitute iLISI. +* Added `methods/semisupervised_scmerge2` and `methods/unsupervised_scmerge2` components (PR #63). + ## Minor changes * Un-pin the scPRINT version and update parameters (PR #51) diff --git a/src/methods/semisupervised_scmerge2/config.vsh.yaml b/src/methods/semisupervised_scmerge2/config.vsh.yaml new file mode 100644 index 00000000..723a3058 --- /dev/null +++ b/src/methods/semisupervised_scmerge2/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_method.yaml +name: semisupervised_scmerge2 +label: Semi-supervised Scmerge2 +summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication." +description: | + When cell type information is known (e.g. results from cell type classification using a reference), + scMerge2 can use this information to construct pseudo-replicates and identify mutual nearest groups with the cellTypes input. + scMerge works by integrating multiple single-cell RNA-seq datasets while correcting for batch effects and preserving biological signals. 
+  It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets.
+  Then, it uses a factor analysis model on these SEGs to estimate and remove unwanted variation.
+  To improve accuracy, scMerge creates pseudo-replicates which serve as anchors for alignment.
+  Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis.
+references:
+  doi:
+    - 10.1073/pnas.1820006116
+links:
+  documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html
+  repository: https://github.com/SydneyBioX/scMerge
+info:
+  method_types: [embedding]  # script.R only stores obsm$X_emb
+  preferred_normalization: log_cpm
+resources:
+  - type: r_script
+    path: script.R
+engines:
+  - type: docker
+    image: openproblems/base_r:1
+    setup:
+      - type: apt
+        packages: cmake
+      - type: r
+        cran:
+          - Matrix
+        bioc:
+          - scMerge  # R package names are case-sensitive; script.R loads scMerge
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/methods/semisupervised_scmerge2/script.R b/src/methods/semisupervised_scmerge2/script.R
new file mode 100644
index 00000000..13f9efd8
--- /dev/null
+++ b/src/methods/semisupervised_scmerge2/script.R
@@ -0,0 +1,81 @@
+library(anndata)
+library(scMerge)
+library(Matrix)
+library(stats)
+
+## VIASH START
+par <- list(
+  input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
+  output = "output.h5ad"
+)
+meta <- list(
+  name = "semisupervised_scmerge2"
+)
+## VIASH END
+
+cat("Reading input files\n")
+adata <- anndata::read_h5ad(par$input)
+
+# Run scMerge2 with cell type labels on an AnnData object.
+#
+# adata:   AnnData providing the `counts` and `normalized` layers and the
+#          `batch` and `cell_type` columns of `obs`.
+# top_n:   maximum number of genes, ranked by decreasing `segIdx`, that are
+#          passed to scMerge2() as negative controls (`ctl`).
+# verbose: forwarded to scMerge2().
+#
+# Returns the value of scMerge2(); the caller reads its `newY` element.
+anndataToSemiSupervisedScMerge2 <- function(adata, top_n = 1000, verbose = TRUE) {
+  gene_ids <- as.character(adata$var_names)
+  cell_ids <- as.character(adata$obs_names)
+
+  # Transpose so that rows are genes (var) and columns are cells (obs).
+  counts <- t(as.matrix(adata$layers[["counts"]]))
+  dimnames(counts) <- list(gene_ids, cell_ids)
+
+  # Rank genes by their stability index and keep the top `top_n` as controls.
+  seg_df <- scSEGIndex(exprs_mat = counts)
+  seg_df <- seg_df[order(seg_df$segIdx, decreasing = TRUE), , drop = FALSE]
+  ctl <- rownames(seg_df)[seq_len(min(top_n, nrow(seg_df)))]
+
+  exprsMat <- t(as.matrix(adata$layers[["normalized"]]))
+  dimnames(exprsMat) <- list(gene_ids, cell_ids)
+
+  scMerge2(
+    exprsMat = exprsMat,
+    batch = as.character(adata$obs$batch),
+    cellTypes = as.character(adata$obs$cell_type),
+    ctl = ctl,
+    verbose = verbose
+  )
+}
+
+cat("Run semi-supervised scMerge2\n")
+scMerge2_res <- anndataToSemiSupervisedScMerge2(adata, top_n = 1000, verbose = TRUE)
+
+cat("Store output\n")
+corrected_mat <- scMerge2_res$newY
+# `rank.` makes prcomp() return at most 10 components and caps the number at
+# what the data supports, so inputs with fewer than 10 cells or genes no
+# longer fail on a hard-coded `[, 1:10]` subset.
+embedding <- prcomp(t(corrected_mat), rank. = 10L)$x
+rownames(embedding) <- colnames(corrected_mat)
+
+output <- anndata::AnnData(
+  X = NULL,
+  obs = adata$obs[, c()],
+  var = NULL,
+  obsm = list(
+    # Index rows by cell name so they line up with the input cells.
+    X_emb = embedding[as.character(adata$obs_names), , drop = FALSE]
+  ),
+  uns = list(
+    dataset_id = adata$uns[["dataset_id"]],
+    normalization_id = adata$uns[["normalization_id"]],
+    method_id = meta$name
+  ),
+  shape = adata$shape
+)
+
+cat("Write output AnnData to file\n")
+output$write_h5ad(par[["output"]], compression = "gzip")
diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml
new file mode 100644
index 00000000..9bff630d
--- /dev/null
+++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml
@@ -0,0 +1,38 @@
+__merge__: ../../api/comp_method.yaml
+name: unsupervised_scmerge2
+label: unsupervised Scmerge2
+summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication."
+description: |
+  scMerge works by integrating multiple single-cell RNA-seq datasets while correcting for batch effects and preserving biological signals.
+  It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets.
+  Then, it uses a factor analysis model on these SEGs to estimate and remove unwanted variation.
+  To improve accuracy, scMerge creates pseudo-replicates which serve as anchors for alignment.
+  Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis.
+references:
+  doi:
+    - 10.1073/pnas.1820006116
+links:
+  documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html
+  repository: https://github.com/SydneyBioX/scMerge
+info:
+  method_types: [embedding]  # script.R only stores obsm$X_emb
+  preferred_normalization: log_cpm
+resources:
+  - type: r_script
+    path: script.R
+engines:
+  - type: docker
+    image: openproblems/base_r:1
+    setup:
+      - type: apt
+        packages: cmake
+      - type: r
+        cran:
+          - Matrix
+        bioc:
+          - scMerge  # R package names are case-sensitive; script.R loads scMerge
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/methods/unsupervised_scmerge2/script.R b/src/methods/unsupervised_scmerge2/script.R
new file mode 100644
index 00000000..503a1d6b
--- /dev/null
+++ b/src/methods/unsupervised_scmerge2/script.R
@@ -0,0 +1,81 @@
+library(anndata)
+library(scMerge)
+library(Matrix)
+library(stats)
+
+## VIASH START
+par <- list(
+  input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
+  output = "output.h5ad"
+)
+meta <- list(
+  name = "unsupervised_scmerge2"
+)
+## VIASH END
+
+cat("Reading input files\n")
+adata <- anndata::read_h5ad(par$input)
+
+# Run scMerge2 without cell type labels on an AnnData object.
+#
+# adata:   AnnData providing the `counts` and `normalized` layers and the
+#          `batch` column of `obs`.
+# top_n:   maximum number of genes, ranked by decreasing `segIdx`, that are
+#          passed to scMerge2() as negative controls (`ctl`).
+# verbose: forwarded to scMerge2().
+#
+# Returns the value of scMerge2(); the caller reads its `newY` element.
+anndataToUnsupervisedScMerge2 <- function(adata, top_n = 1000, verbose = TRUE) {
+  gene_ids <- as.character(adata$var_names)
+  cell_ids <- as.character(adata$obs_names)
+
+  # Transpose so that rows are genes (var) and columns are cells (obs).
+  counts <- t(as.matrix(adata$layers[["counts"]]))
+  dimnames(counts) <- list(gene_ids, cell_ids)
+
+  # Rank genes by their stability index and keep the top `top_n` as controls.
+  seg_df <- scSEGIndex(exprs_mat = counts)
+  seg_df <- seg_df[order(seg_df$segIdx, decreasing = TRUE), , drop = FALSE]
+  ctl <- rownames(seg_df)[seq_len(min(top_n, nrow(seg_df)))]
+
+  exprsMat <- t(as.matrix(adata$layers[["normalized"]]))
+  dimnames(exprsMat) <- list(gene_ids, cell_ids)
+
+  # No `cellTypes` argument is passed: `obs$cell_type` is not read here.
+  scMerge2(
+    exprsMat = exprsMat,
+    batch = as.character(adata$obs$batch),
+    ctl = ctl,
+    verbose = verbose
+  )
+}
+
+cat("Run unsupervised scMerge2\n")
+scMerge2_res <- anndataToUnsupervisedScMerge2(adata, top_n = 1000L, verbose = TRUE)
+
+cat("Store output\n")
+corrected_mat <- scMerge2_res$newY
+# `rank.` makes prcomp() return at most 10 components and caps the number at
+# what the data supports, so inputs with fewer than 10 cells or genes no
+# longer fail on a hard-coded `[, 1:10]` subset.
+embedding <- prcomp(t(corrected_mat), rank. = 10L)$x
+rownames(embedding) <- colnames(corrected_mat)
+
+output <- anndata::AnnData(
+  X = NULL,
+  obs = adata$obs[, c()],
+  var = NULL,
+  obsm = list(
+    # Index rows by cell name so they line up with the input cells.
+    X_emb = embedding[as.character(adata$obs_names), , drop = FALSE]
+  ),
+  uns = list(
+    dataset_id = adata$uns[["dataset_id"]],
+    normalization_id = adata$uns[["normalization_id"]],
+    method_id = meta$name
+  ),
+  shape = adata$shape
+)
+
+cat("Write output AnnData to file\n")
+output$write_h5ad(par[["output"]], compression = "gzip")