2 changes: 1 addition & 1 deletion _viash.yaml
@@ -1,4 +1,4 @@
viash_version: 0.9.0
viash_version: 0.9.4

name: task_batch_integration
organization: openproblems-bio
41 changes: 41 additions & 0 deletions src/methods/scgpt_czbenchmarks/config.vsh.yaml
@@ -0,0 +1,41 @@
__merge__: ../../api/base_method.yaml

name: scgpt_czbenchmarks
label: scGPT (CZ Benchmarks)
summary: "A foundation model for single-cell biology (CZ Benchmarks implementation)"
description: |
  scGPT is a foundation model for single-cell biology based on a generative
  pre-trained transformer and trained on a repository of over 33 million cells.

  Here, we use zero-shot output from a pre-trained model to get an integrated
  embedding for the batch integration task.
references:
  doi:
    - 10.1038/s41592-024-02201-0
links:
  documentation: https://scgpt.readthedocs.io/en/latest/
  repository: https://github.com/chanzuckerberg/cz-benchmarks/tree/main/docker/scgpt

info:
  method_types: [embedding]
  preferred_normalization: counts

resources:
  - type: python_script
    path: script.py
  - path: /src/utils/read_anndata_partial.py
  - path: /src/utils/exit_codes.py

engines:
  - type: docker
    image: public.ecr.aws/czi-virtual-cells/cz-benchmarks-models-public:scgpt
    setup:
      - type: apt
        packages:
          - python-is-python3

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [hightime, midmem, midcpu, gpu]
87 changes: 87 additions & 0 deletions src/methods/scgpt_czbenchmarks/script.py
@@ -0,0 +1,87 @@
import sys
import os

import anndata as ad
import scgpt

from czbenchmarks.datasets.single_cell import SingleCellDataset
from czbenchmarks.datasets.types import Organism, DataType
from czbenchmarks.models.types import ModelType

## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
par = {
    "input": "resources_test/.../input.h5ad",
    "output": "output.h5ad",
}
meta = {"name": "scgpt_czbenchmarks"}
## VIASH END

sys.path.append(meta["resources_dir"])
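# read_anndata_partial.py and exit_codes.py are bundled as viash resources in resources_dir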
from read_anndata_partial import read_anndata
from exit_codes import exit_non_applicable

sys.path.append("/app")
from model import ScGPT

print(f"====== scGPT version {scgpt.__version__} (czbenchmarks) ======", flush=True)

# Check organism and exit if needed
adata_uns = read_anndata(par["input"], uns="uns")

if adata_uns.uns["dataset_organism"] != "homo_sapiens":
    exit_non_applicable(
        f"scGPT can only be used with human data "
        f"(dataset_organism == \"{adata_uns.uns['dataset_organism']}\")"
    )

del adata_uns

print("\n>>> Creating input dataset..", flush=True)
dataset = SingleCellDataset(path = par["input"], organism = Organism.HUMAN)
print(dataset)
dataset.load_data()
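# scGPT expects raw counts, so use the counts layer as the main expression matrix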
dataset.adata.X = dataset.adata.layers["counts"].copy()
print(dataset.adata, flush=True)

print("\n>>> Running scGPT..", flush=True)
model = ScGPT()
# Run these steps manually instead of using model.run() to avoid reloading data
print("Validating data...", flush=True)
dataset.validate()
model.validate_dataset(dataset)
print("Data validated successfully", flush=True)

print("Downloading model weights...", flush=True)
if not os.path.exists("/weights/human"):
    os.makedirs("/weights/human")
model.download_model_weights(dataset)
print("Model weights downloaded successfully", flush=True)

print("Running model...", flush=True)
model.run_model(dataset)
print("Model ran successfully", flush=True)

embedding = dataset.get_output(ModelType.SCGPT, DataType.EMBEDDING)

print("\n>>> Storing output...", flush=True)
output = ad.AnnData(
    obs=dataset.adata.obs[[]],
    var=dataset.adata.var[[]],
    obsm={
        "X_emb": embedding,
    },
    uns={
        "dataset_id": dataset.adata.uns["dataset_id"],
        "normalization_id": dataset.adata.uns["normalization_id"],
        "method_id": meta["name"],
    },
)
print(output)

print("\n>>> Writing output to file...", flush=True)
print(f"Output H5AD file: '{par['output']}'", flush=True)
output.write_h5ad(par["output"], compression="gzip")

print("\n>>> Done!", flush=True)
1 change: 1 addition & 0 deletions src/workflows/run_benchmark/config.vsh.yaml
@@ -101,6 +101,7 @@ dependencies:
  - name: methods/scalex
  - name: methods/scanorama
  - name: methods/scanvi
  - name: methods/scgpt_czbenchmarks
  - name: methods/scgpt_finetuned
  - name: methods/scgpt_zeroshot
  - name: methods/scimilarity
1 change: 1 addition & 0 deletions src/workflows/run_benchmark/main.nf
@@ -29,6 +29,7 @@ methods = [
  scalex,
  scanorama,
  scanvi,
  scgpt_czbenchmarks,
  scgpt_finetuned.run(
    args: [model: file("s3://openproblems-work/cache/scGPT_human.zip")]
  ),