add combat-seq method

EpigeneMax · EpigeneMax · commit 331114a31f5b · 2025-08-29T11:26:07.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -45,6 +45,8 @@ A major update to the OpenProblems framework, switching from a Python-based fram
 
 * Added scGPT fine-tuned (PR #17).
 
+* Added ComBat-Seq method (PR #55).
+
 
 ## Major changes
 
diff --git a/src/methods/combat-seq/config.vsh.yaml b/src/methods/combat-seq/config.vsh.yaml
@@ -0,0 +1,51 @@
+__merge__: ../../api/comp_method.yaml
+name: combat_seq
+label: ComBat-Seq
+summary: Adjusting batch effects in RNA-Seq expression data using empirical Bayes
+  methods
+description: |
+  ComBat-Seq extends the ComBat method for batch correction to RNA-Seq data.
+  Whereas ComBat assumes normally distributed data, ComBat-Seq models the data
+  with a negative binomial distribution. Although initially developed for
+  RNA-Seq data, ComBat-Seq can be applied to single-cell RNA-Seq data as well.
+
+  The method is implemented in Python as part of the inmoose package. It is
+  based on the original R implementation, distributed through the sva package.
+
+references:
+  doi:
+    - 10.1093/nargab/lqaa078
+    - 10.1186/s12859-023-05578-5
+
+links:
+  documentation: https://inmoose.readthedocs.io/en/stable/pycombatseq.html
+  repository: https://github.com/epigenelabs/inmoose
+
+# Metadata for your component
+info:
+  # Which normalisation method this component prefers to use (required).
+  preferred_normalization: counts
+
+# Resources required to run the component
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/read_anndata_partial.py
+
+engines:
+  # Specifications for the Docker image for this component.
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    # Add custom dependencies here (optional). For more information, see
+    # https://viash.io/reference/config/engines/docker/#setup .
+    setup:
+      - type: python
+        pip: inmoose
+
+runners:
+  # This runner allows running the component natively
+  - type: executable
+  # Allows turning the component into a Nextflow module / pipeline.
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/methods/combat-seq/script.py b/src/methods/combat-seq/script.py
@@ -0,0 +1,50 @@
+"""Batch-correct raw counts with ComBat-Seq (inmoose) and write an AnnData file."""
+
+import sys
+
+import anndata as ad
+import numpy as np
+from inmoose.pycombat import pycombat_seq
+from scipy.sparse import csr_matrix
+
+# VIASH START
+# Note: this section is auto-generated by viash at runtime. To edit it, make changes
+# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
+par = {"input": "resources_test/.../input.h5ad", "output": "output.h5ad"}
+# `resources_dir` is read below to locate read_anndata_partial.py (src/utils in
+# config.vsh.yaml); `name` mirrors the component name declared there.
+meta = {"name": "combat_seq", "resources_dir": "src/utils"}
+# VIASH END
+
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+print("Read input", flush=True)
+# ComBat-Seq fits a negative binomial model, so load the raw counts layer (this
+# matches `preferred_normalization: counts` in config.vsh.yaml).
+adata = read_anndata(
+    par["input"], X="layers/counts", obs="obs", var="var", uns="uns"
+)
+
+print("Run Combat-Seq", flush=True)
+# pycombat_seq expects a dense genes x samples matrix, hence the transpose.
+counts = adata.T.to_df().astype(np.double).values
+corrected_counts = pycombat_seq(counts, adata.obs["batch"])
+
+print("Store output", flush=True)
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "normalization_id": adata.uns["normalization_id"],
+        "method_id": meta["name"],
+    },
+    layers={
+        # Transpose back to cells x genes so the layer lines up with obs/var.
+        "corrected_counts": csr_matrix(corrected_counts.T),
+    },
+)
+
+print("Store outputs", flush=True)
+output.write_h5ad(par["output"], compression="gzip")