Skip to content
37 changes: 37 additions & 0 deletions src/methods/semisupervised_scmerge2/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
__merge__: ../../api/comp_method.yaml
name: semisupervised_scmerge2
label: Semi-supervised Scmerge2
summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication."
description: |
When cell type information are known (e.g. results from cell type classification using reference),
scMerge2 can use this information to construct pseudo-replicates and identify mutual nearest groups with cellTypes input.
scMerge works by integrating multiple single-cell RNA-seq datasets while correcting for batch effects and preserving biological signals.
It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets.
Then, it uses a factor analysis model on these SEGs to estimate and remove unwanted variation.
To improve accuracy, scMerge creates pseudo-replicates which serve as anchors for alignment.
Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis.
references:
doi:
- 10.1073/pnas.1820006116
links:
documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html
repository: https://github.com/SydneyBioX/scMerge
info:
preferred_normalization: log_cpm
resources:
- type: r_script
path: script.R
engines:
- type: docker
image: openproblems/base_r:1
setup:
- type: apt
packages: cmake
- type: r
bioc:
- scmerge
runners:
- type: executable
- type: nextflow
directives:
label: [midtime,midmem,midcpu]
99 changes: 99 additions & 0 deletions src/methods/semisupervised_scmerge2/script.R
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comments from unsupervised scMerge2 apply here as well

Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
library(anndata)
library(scMerge)

## VIASH START
par <- list(
input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
output = "output.h5ad"
)
meta <- list(
name = "semisupervised_scmerge2"
)
## VIASH END

cat("Reading input files\n")
adata <- anndata::read_h5ad(par$input)
adata$obs["batch"] <- sub("\\+", "plus", adata$obs[["batch"]]) # Replace "+"" characters in batch names

anndataToSemiSupervisedScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = FALSE) {
exprsMat_all <- t(as.matrix(adata$layers[[layer]]))
batch_all <- as.character(adata$obs$batch)
celltypes_all <- as.character(adata$obs$cell_type)

valid_cells <- !is.na(batch_all)
exprsMat <- exprsMat_all[, valid_cells, drop = FALSE]
batch <- batch_all[valid_cells]
cellTypes <- celltypes_all[valid_cells]

# Check overlap with human/mouse scSEG lists
gene_ids <- rownames(exprsMat)
species <- NULL
best_match <- 0

for (organism in names(seg_list)) {
scseg_name <- paste0(organism, "_scSEG")
seg_genes <- seg_list[[organism]][[scseg_name]]
overlap <- length(intersect(gene_ids, seg_genes))

if (overlap > best_match) {
best_match <- overlap
species <- organism
}
}

if (is.null(species) || best_match == 0) {
stop("No match found between gene IDs in exprsMat and scSEG lists for human or mouse. ",
"Please ensure you're using Ensembl IDs for human or mouse, or provide a custom SEG list.")
}

message("Detected species: ", species, " (matched ", best_match, " genes)")

ctl <- seg_list[[species]][[paste0(species, "_scSEG")]]

scMerge2_res <- scMerge2(
exprsMat = exprsMat,
batch = batch,
cellTypes = cellTypes,
ctl = ctl,
verbose = verbose
)

return(scMerge2_res)
}

data("segList_ensemblGeneID")

cat("Run semi-supervised scMerge2\n")

scMerge2_res <- anndataToSemiSupervisedScMerge2(
adata = adata,
seg_list = segList_ensemblGeneID,
layer = "normalized",
verbose = TRUE
)


cat("Store output\n")
corrected_mat <- scMerge2_res$newY

embedding <- prcomp(t(corrected_mat))$x[, 1:10]

rownames(embedding) <- colnames(corrected_mat)

output <- anndata::AnnData(
X = NULL,
obs = adata$obs[, c()],
var = NULL,
obsm = list(
X_emb = embedding[rownames(adata), , drop = FALSE] # match input cells
),
uns = list(
dataset_id = adata$uns[["dataset_id"]],
normalization_id = adata$uns[["normalization_id"]],
method_id = meta$name
),
shape = adata$shape
)

cat("Write output AnnData to file\n")
output$write_h5ad(par[["output"]], compression = "gzip")
35 changes: 35 additions & 0 deletions src/methods/unsupervised_scmerge2/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
__merge__: ../../api/comp_method.yaml
name: unsupervised_scmerge2
label: unsupervised Scmerge2
summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication."
description: |
scMerge works by integrating multiple single-cell RNA-seq datasets while correcting for batch effects and preserving biological signals.
It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets.
Then, it uses a factor analysis model on these SEGs to estimate and remove unwanted variation.
To improve accuracy, scMerge creates pseudo-replicates which serve as anchors for alignment.
Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis.
references:
doi:
- 10.1073/pnas.1820006116
links:
documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html
repository: https://github.com/SydneyBioX/scMerge
info:
preferred_normalization: log_cpm
resources:
- type: r_script
path: script.R
engines:
- type: docker
image: openproblems/base_r:1
setup:
- type: apt
packages: cmake
- type: r
bioc:
- scmerge
runners:
- type: executable
- type: nextflow
directives:
label: [midtime,midmem,midcpu]
96 changes: 96 additions & 0 deletions src/methods/unsupervised_scmerge2/script.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
cat(">> Load dependencies\n")
requireNamespace("anndata", quietly = TRUE)
library(scMerge)

## VIASH START
par <- list(
input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
output = "output.h5ad"
)
meta <- list(
name = "unsupervised_scmerge2"
)
## VIASH END

cat("Reading input files\n")
adata <- anndata::read_h5ad(par$input)
adata$obs["batch"] <- sub("\\+", "plus", adata$obs[["batch"]]) # Replace "+"" characters in batch names

anndataToScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = FALSE) {
exprsMat_all <- t(as.matrix(adata$layers[[layer]]))
batch_all <- as.character(adata$obs$batch)

valid_cells <- !is.na(batch_all)
exprsMat <- exprsMat_all[, valid_cells, drop = FALSE]
batch <- batch_all[valid_cells]

# Check overlap with human/mouse scSEG lists
gene_ids <- rownames(exprsMat)
species <- NULL
best_match <- 0

for (organism in names(seg_list)) {
scseg_name <- paste0(organism, "_scSEG")
seg_genes <- seg_list[[organism]][[scseg_name]]
overlap <- length(intersect(gene_ids, seg_genes))

if (overlap > best_match) {
best_match <- overlap
species <- organism
}
}

if (is.null(species) || best_match == 0) {
stop("No match found between gene IDs in exprsMat and scSEG lists for human or mouse. ",
"Please ensure you're using Ensembl IDs for human or mouse, or provide a custom SEG list.")
}

message("Detected species: ", species, " (matched ", best_match, " genes)")

ctl <- seg_list[[species]][[paste0(species, "_scSEG")]]

scMerge2_res <- scMerge2(
exprsMat = exprsMat,
batch = batch,
ctl = ctl,
verbose = verbose
)

return(scMerge2_res)
}

data("segList_ensemblGeneID")

cat("Run scMerge2\n")

scMerge2_res <- anndataToScMerge2(
adata = adata,
seg_list = segList_ensemblGeneID,
layer = "normalized",
verbose = TRUE
)


cat("Store output\n")
corrected_mat <- scMerge2_res$newY

embedding <- prcomp(t(corrected_mat))$x[, 1:10]

rownames(embedding) <- colnames(corrected_mat)

output <- anndata::AnnData(
X = NULL,
obs = adata$obs[, c()],
var = NULL,
obsm = list(
X_emb = embedding[rownames(adata), , drop = FALSE] # match input cells
),
uns = list(
dataset_id = adata$uns[["dataset_id"]],
normalization_id = adata$uns[["normalization_id"]],
method_id = meta$name
),
shape = adata$shape
)
cat("Write output AnnData to file\n")
output$write_h5ad(par[["output"]], compression = "gzip")