2 changes: 1 addition & 1 deletion _viash.yaml
@@ -1,4 +1,4 @@
viash_version: 0.9.0
viash_version: 0.9.4

name: task_batch_integration
organization: openproblems-bio
41 changes: 41 additions & 0 deletions src/methods/scgpt_czbenchmarks/config.vsh.yaml
@@ -0,0 +1,41 @@
__merge__: ../../api/base_method.yaml

name: scgpt_czbenchmarks
label: scGPT (CZ Benchmarks)
summary: "A foundation model for single-cell biology (CZ Benchmarks implementation)"
description: |
  scGPT is a foundation model for single-cell biology based on a generative
  pre-trained transformer and trained on a repository of over 33 million cells.

  Here, we use zero-shot output from a pre-trained model to get an integrated
  embedding for the batch integration task.
references:
  doi:
    - 10.1038/s41592-024-02201-0
links:
  documentation: https://scgpt.readthedocs.io/en/latest/
  repository: https://github.com/chanzuckerberg/cz-benchmarks/tree/main/docker/scgpt

info:
  method_types: [embedding]
  preferred_normalization: counts

resources:
  - type: python_script
    path: script.py
  - path: /src/utils/read_anndata_partial.py
  - path: /src/utils/exit_codes.py

engines:
  - type: docker
    image: public.ecr.aws/czi-virtual-cells/cz-benchmarks-models-public:scgpt
    setup:
      - type: apt
        packages:
          - python-is-python3

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [hightime, midmem, midcpu, gpu]
87 changes: 87 additions & 0 deletions src/methods/scgpt_czbenchmarks/script.py
@@ -0,0 +1,87 @@
import sys
import os

import anndata as ad
import scgpt

from czbenchmarks.datasets.single_cell import SingleCellDataset
from czbenchmarks.datasets.types import Organism, DataType
from czbenchmarks.models.types import ModelType

## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
par = {
    "input": "resources_test/.../input.h5ad",
    "output": "output.h5ad",
}
meta = {"name": "scgpt_czbenchmarks"}
## VIASH END

sys.path.append(meta["resources_dir"])
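# read_anndata_partial.py and exit_codes.py are bundled as viash resources in resources_dir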
from read_anndata_partial import read_anndata
from exit_codes import exit_non_applicable

sys.path.append("/app")
from model import ScGPT

print(f"====== scGPT version {scgpt.__version__} (czbenchmarks) ======", flush=True)

# Check organism and exit if needed
adata_uns = read_anndata(par["input"], uns="uns")

if adata_uns.uns["dataset_organism"] != "homo_sapiens":
    exit_non_applicable(
        f"scGPT can only be used with human data "
        f"(dataset_organism == \"{adata_uns.uns['dataset_organism']}\")"
    )

del adata_uns

print("\n>>> Creating input dataset..", flush=True)
dataset = SingleCellDataset(path = par["input"], organism = Organism.HUMAN)
print(dataset)
dataset.load_data()
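# scGPT expects raw counts, so use the counts layer as the main expression matrix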
dataset.adata.X = dataset.adata.layers["counts"].copy()
print(dataset.adata, flush=True)

print("\n>>> Running scGPT..", flush=True)
model = ScGPT()
# Run these steps manually instead of using model.run() to avoid reloading data
print("Validating data...", flush=True)
dataset.validate()
model.validate_dataset(dataset)
print("Data validated successfully", flush=True)

print("Downloading model weights...", flush=True)
if not os.path.exists("/weights/human"):
    os.makedirs("/weights/human")
model.download_model_weights(dataset)
print("Model weights downloaded successfully", flush=True)

print("Running model...", flush=True)
model.run_model(dataset)
print("Model ran successfully", flush=True)

embedding = dataset.get_output(ModelType.SCGPT, DataType.EMBEDDING)

print("\n>>> Storing output...", flush=True)
output = ad.AnnData(
    obs=dataset.adata.obs[[]],
    var=dataset.adata.var[[]],
    obsm={
        "X_emb": embedding,
    },
    uns={
        "dataset_id": dataset.adata.uns["dataset_id"],
        "normalization_id": dataset.adata.uns["normalization_id"],
        "method_id": meta["name"],
    },
)
print(output)

print("\n>>> Writing output to file...", flush=True)
print(f"Output H5AD file: '{par['output']}'", flush=True)
output.write_h5ad(par["output"], compression="gzip")

print("\n>>> Done!", flush=True)
1 change: 1 addition & 0 deletions src/workflows/run_benchmark/config.vsh.yaml
@@ -101,6 +101,7 @@ dependencies:
  - name: methods/scalex
  - name: methods/scanorama
  - name: methods/scanvi
  - name: methods/scgpt_czbenchmarks
  - name: methods/scgpt_finetuned
  - name: methods/scgpt_zeroshot
  - name: methods/scimilarity
1 change: 1 addition & 0 deletions src/workflows/run_benchmark/main.nf
@@ -29,6 +29,7 @@ methods = [
  scalex,
  scanorama,
  scanvi,
  scgpt_czbenchmarks,
  scgpt_finetuned.run(
    args: [model: file("s3://openproblems-work/cache/scGPT_human.zip")]
  ),