diff --git a/CHANGELOG.md b/CHANGELOG.md index 859869e4..5c0af83f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,8 @@ A major update to the OpenProblems framework, switching from a Python-based fram * Added scGPT fine-tuned (PR #17). +* Added ComBat-Seq method (PR #55). + ## Major changes diff --git a/src/methods/combat-seq/config.vsh.yaml b/src/methods/combat-seq/config.vsh.yaml new file mode 100644 index 00000000..346c238d --- /dev/null +++ b/src/methods/combat-seq/config.vsh.yaml @@ -0,0 +1,52 @@ +__merge__: ../../api/comp_method.yaml +name: combat_seq +label: ComBat-Seq +summary: Adjusting batch effects in RNA-Seq expression data using empirical Bayes + methods +description: | + ComBat-Seq extends the ComBat method for batch correction in RNA-Seq data. + While ComBat assumes normally distributed data, ComBat-Seq uses a negative + binomial distribution to model the data. While initially developed for + RNA-Seq data, ComBat-Seq can be applied to single-cell RNA-Seq data as well. + + The method is implemented in Python as a part of the inmoose package. It is + based on the original R implementation, distributed through the sva package. + +references: + doi: + - 10.1093/nargab/lqaa078 + - 10.1186/s12859-023-05578-5 + +links: + documentation: https://inmoose.readthedocs.io/en/stable/pycombatseq.html + repository: https://github.com/epigenelabs/inmoose + +# Metadata for your component +info: + # Which normalisation method this component prefers to use (required). + preferred_normalization: counts + method_types: [feature] + +# Resources required to run the component +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . 
+    setup:
+      - type: python
+        pip: inmoose
+
+runners:
+  # This platform allows running the component natively
+  - type: executable
+  # Allows turning the component into a Nextflow module / pipeline.
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/methods/combat-seq/script.py b/src/methods/combat-seq/script.py
new file mode 100644
index 00000000..9826adfd
--- /dev/null
+++ b/src/methods/combat-seq/script.py
@@ -0,0 +1,61 @@
+# ComBat-Seq batch correction component.
+#
+# Reads an AnnData file, corrects its count matrix for the "batch" column of
+# obs with inmoose's pycombat_seq, and writes the result to the
+# "corrected_counts" layer of a new AnnData file.
+
+import sys
+
+import anndata as ad
+import numpy as np
+from inmoose.pycombat import pycombat_seq
+from scipy.sparse import csr_matrix
+
+# VIASH START
+# Note: this section is auto-generated by viash at runtime. To edit it, make changes
+# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
+par = {"input": "resources_test/.../input.h5ad", "output": "output.h5ad"}
+# "resources_dir" is read right after this section; the placeholder points at the
+# directory of read_anndata_partial.py so the script can also be started directly
+# (presumably from the repository root -- adjust to your checkout).
+meta = {"name": "combat_seq", "resources_dir": "src/utils"}
+# VIASH END
+
+# read_anndata_partial.py is listed as a resource in config.vsh.yaml; it is imported
+# from the resources directory, which therefore has to be on sys.path first.
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+print("Read input", flush=True)
+# ComBat-Seq models the data with a negative binomial distribution, so it is fed
+# the raw counts (preferred_normalization: counts in config.vsh.yaml) rather than
+# the normalized layer.
+# NOTE(review): assumes the input stores raw counts under layers/counts -- confirm
+# against the dataset API file.
+adata = read_anndata(
+    par["input"], X="layers/counts", obs="obs", var="var", uns="uns"
+)
+
+print("Run Combat-Seq", flush=True)
+# The matrix is transposed before the call and the result is transposed back when
+# stored (pycombat_seq presumably expects features x samples -- see inmoose docs).
+counts = adata.T.to_df().astype(np.double).values
+corrected_counts = pycombat_seq(counts, adata.obs["batch"])
+
+print("Store output", flush=True)
+output = ad.AnnData(
+    # Empty column selections: only the obs/var indices are carried over.
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "normalization_id": adata.uns["normalization_id"],
+        "method_id": meta["name"],
+    },
+    layers={
+        "corrected_counts": csr_matrix(corrected_counts.T),
+    },
+)
+
+print("Store outputs", flush=True)
+output.write_h5ad(par["output"], compression="gzip")