diff --git a/demos/common.smk b/demos/common.smk
index 5929ad96..1ff89474 100644
--- a/demos/common.smk
+++ b/demos/common.smk
@@ -1,8 +1,20 @@
+import platform
 from pathlib import Path
+import os
+
+# Treat the host as O2 only when it runs Linux, $USER is set, and the O2
+# scratch root exists; any other machine keeps using the local "data" dir.
+O2_SCRATCH_ROOT = Path("/n/scratch/users")
+O2_USER = os.environ.get("USER", "")
+IS_O2 = platform.system() == "Linux" and bool(O2_USER) and O2_SCRATCH_ROOT.is_dir()
+
+if IS_O2:
+    # Scratch layout used here: <root>/<first letter of user>/<user>/...
+    O2_SCRATCH_DIR = f"/n/scratch/users/{O2_USER[0]}/{O2_USER}/vitessce-python/demos"
 
 # Directory / file constants
 SRC_DIR = Path("src")
-DATA_DIR = Path("data")
+DATA_DIR = Path("data" if not IS_O2 else O2_SCRATCH_DIR)
 RAW_DIR = DATA_DIR / "raw"
 PROCESSED_DIR = DATA_DIR / "processed"
 
diff --git a/demos/salcher-2022/Snakefile b/demos/salcher-2022/Snakefile
new file mode 100644
index 00000000..8b4b783f
--- /dev/null
+++ b/demos/salcher-2022/Snakefile
@@ -0,0 +1,36 @@
+include: "../common.smk"
+configfile: "config.yml"
+
+# May need to get new URLs from https://cellxgene.cziscience.com/collections/edb893ee-4066-4128-9aec-5eb2b03f8287
+
+# The single-cell lung cancer atlas (LuCA) -- extended atlas
+H5AD_URL = "https://datasets.cellxgene.cziscience.com/6e5e887d-96f7-40af-908c-9b4fc5057ef9.h5ad"
+
+rule all:
+    input:
+        [ (PROCESSED_DIR / f) for f in config['output'] ]
+
+rule convert_to_zarr:
+    input:
+        (RAW_DIR / "6e5e887d-96f7-40af-908c-9b4fc5057ef9.h5ad")
+    output:
+        directory(PROCESSED_DIR / "salcher_2022_extended.h5ad.zarr")
+    params:
+        script=(SRC_DIR / "convert_to_zarr.py")
+    shell:
+        '''
+        python {params.script} \
+            -i {input} \
+            -o {output}
+        '''
+
+# Download raw h5ad file.
+rule download_adata:
+    output:
+        (RAW_DIR / "6e5e887d-96f7-40af-908c-9b4fc5057ef9.h5ad")
+    params:
+        file_url=H5AD_URL
+    shell:
+        '''
+        curl -L --fail --retry 999 --retry-delay 3 -C - -o {output} "{params.file_url}"
+        '''
diff --git a/demos/salcher-2022/config.yml b/demos/salcher-2022/config.yml
new file mode 100644
index 00000000..674fa553
--- /dev/null
+++ b/demos/salcher-2022/config.yml
@@ -0,0 +1,2 @@
+output:
+- salcher_2022_extended.h5ad.zarr
\ No newline at end of file
diff --git a/demos/salcher-2022/src/convert_to_zarr.py b/demos/salcher-2022/src/convert_to_zarr.py
new file mode 100644
index 00000000..37be86e6
--- /dev/null
+++ b/demos/salcher-2022/src/convert_to_zarr.py
@@ -0,0 +1,84 @@
+import argparse
+from anndata import read_h5ad
+import scipy
+import scipy.sparse
+import numpy as np
+import pandas as pd
+import platform
+import os
+import zarr
+import math
+
+
+def convert_h5ad_to_zarr(input_path, output_path):
+    """Convert an H5AD file to a Zarr store in which X is a dense array.
+
+    Everything except X is written with ``AnnData.write_zarr``. X is then
+    written separately as a zero-initialized Zarr array at ``/X`` with chunks
+    of shape (number of rows, 10), filled block by block from the sparse matrix.
+
+    :param str input_path: Path to the input H5AD file.
+    :param str output_path: Path to the output Zarr directory store.
+    :raises TypeError: If ``adata.X`` is not a SciPy sparse matrix.
+    """
+    adata = read_h5ad(input_path)
+    X = adata.X
+
+    # Validate before anything is written, so that an unsupported input
+    # does not leave a partially written store behind.
+    if not isinstance(X, scipy.sparse.spmatrix):
+        raise TypeError(f"Expected adata.X to be a SciPy sparse matrix, got {type(X)}")
+
+    # Clear X so that we can write it ourselves manually.
+    adata.X = None
+    adata.write_zarr(output_path)
+
+    print(output_path)
+
+    store = zarr.DirectoryStore(output_path)
+    z = zarr.zeros(shape=X.shape, chunks=(X.shape[0], 10), dtype=X.dtype, store=store, path="/X", overwrite=True)
+
+    # Copy the non-zero values over one block of at most 10000 x 10000 at a time.
+    block_shape = (10000, 10000)
+    for x_start in range(0, X.shape[0], block_shape[0]):
+        x_end = min(x_start + block_shape[0], X.shape[0])
+        for y_start in range(0, X.shape[1], block_shape[1]):
+            y_end = min(y_start + block_shape[1], X.shape[1])
+
+            X_chunk = X[x_start:x_end, y_start:y_end].tocoo(copy=False)
+            if X_chunk.nnz == 0:
+                # Nothing to write: the target array is already zero-filled.
+                continue
+            z.set_coordinate_selection(
+                # Shift block-local coordinates to global ones with array
+                # arithmetic instead of a Python-level loop over every value.
+                (X_chunk.row + x_start, X_chunk.col + y_start),
+                X_chunk.data
+            )
+
+    print("done")
+
+
+if __name__ == '__main__':
+    # Argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-i',
+        '--input',
+        type=str,
+        required=True,
+        help='Input H5AD file'
+    )
+    parser.add_argument(
+        '-o',
+        '--output',
+        type=str,
+        required=True,
+        help='Output Zarr store'
+    )
+    args = parser.parse_args()
+
+    convert_h5ad_to_zarr(
+        args.input,
+        args.output,
+    )
diff --git a/demos/sikkema-2023/Snakefile b/demos/sikkema-2023/Snakefile
new file mode 100644
index 00000000..c3551bbf
--- /dev/null
+++ b/demos/sikkema-2023/Snakefile
@@ -0,0 +1,36 @@
+include: "../common.smk"
+configfile: "config.yml"
+
+# May need to get new URLs from https://cellxgene.cziscience.com/collections/edb893ee-4066-4128-9aec-5eb2b03f8287
+
+# Full dataset for the sikkema-2023 demo. NOTE(review): the collection link above is the same one used by the salcher-2022 demo -- confirm it is correct here.
+H5AD_URL = "https://datasets.cellxgene.cziscience.com/3ab47484-a3eb-4f6a-beea-670e1a8fc1e8.h5ad"
+
+rule all:
+    input:
+        [ (PROCESSED_DIR / f) for f in config['output'] ]
+
+rule convert_to_zarr:
+    input:
+        (RAW_DIR / "3ab47484-a3eb-4f6a-beea-670e1a8fc1e8.h5ad")
+    output:
+        directory(PROCESSED_DIR / "sikkema_2023_full.h5ad.zarr")
+    params:
+        script=(SRC_DIR / "convert_to_zarr.py")
+    shell:
+        '''
+        python {params.script} \
+            -i {input} \
+            -o {output}
+        '''
+
+# Download raw h5ad file.
+rule download_adata:
+    output:
+        (RAW_DIR / "3ab47484-a3eb-4f6a-beea-670e1a8fc1e8.h5ad")
+    params:
+        file_url=H5AD_URL
+    shell:
+        '''
+        curl -L --fail --retry 999 --retry-delay 3 -C - -o {output} "{params.file_url}"
+        '''
diff --git a/demos/sikkema-2023/config.yml b/demos/sikkema-2023/config.yml
new file mode 100644
index 00000000..baeb72df
--- /dev/null
+++ b/demos/sikkema-2023/config.yml
@@ -0,0 +1,2 @@
+output:
+- sikkema_2023_full.h5ad.zarr
\ No newline at end of file
diff --git a/demos/sikkema-2023/src/convert_to_zarr.py b/demos/sikkema-2023/src/convert_to_zarr.py
new file mode 100644
index 00000000..aba2a980
--- /dev/null
+++ b/demos/sikkema-2023/src/convert_to_zarr.py
@@ -0,0 +1,49 @@
+import argparse
+from anndata import read_h5ad
+from scipy import sparse
+from vitessce.data_utils import (
+    to_uint8,
+)
+
+
+def convert_h5ad_to_zarr(input_path, output_path):
+    """Convert an H5AD file to a Zarr store for the sikkema-2023 demo.
+
+    Adds an ``X_uint8`` layer computed with ``to_uint8`` and, if X is a SciPy
+    sparse matrix, converts it to CSC before writing.
+
+    :param str input_path: Path to the input H5AD file.
+    :param str output_path: Path to the output Zarr store.
+    """
+    adata = read_h5ad(input_path)
+
+    adata.layers['X_uint8'] = to_uint8(adata.X, norm_along="var")
+
+    # Vitessce plays nicely with csc matrices
+    # TODO: automate conversion to csc in optimize_adata function
+    if isinstance(adata.X, sparse.spmatrix):
+        adata.X = adata.X.tocsc()
+    adata.write_zarr(output_path, chunks=[adata.shape[0], 10])
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-i',
+        '--input',
+        type=str,
+        required=True,
+        help='Input H5AD file'
+    )
+    parser.add_argument(
+        '-o',
+        '--output',
+        type=str,
+        required=True,
+        help='Output Zarr store'
+    )
+    args = parser.parse_args()
+    convert_h5ad_to_zarr(
+        args.input,
+        args.output
+    )
diff --git a/demos/spatialdata-2024/Snakefile b/demos/spatialdata-2024/Snakefile
new file mode 100644
index 00000000..7f4d9a16
--- /dev/null
+++ b/demos/spatialdata-2024/Snakefile
@@ -0,0 +1,35 @@
+
+include: "../common.smk"
+configfile: "config.yml"
+
+BASE_URL = "https://s3.embl.de/spatialdata/spatialdata-sandbox/{dataset}.zip"
+
+
+rule all:
+    input:
+        [ (PROCESSED_DIR / f) for f in config['output'] ]
+
+
+# Unzip the downloaded zip file and move its data.zarr into the processed directory (-o: overwrite leftovers instead of prompting).
+rule unzip_file:
+    input:
+        (RAW_DIR / "{dataset}.zip")
+    output:
+        directory(PROCESSED_DIR / "{dataset}.zarr")
+    shell:
+        """
+        mkdir -p {RAW_DIR}/{wildcards.dataset} &&\
+        unzip -o {input} -d {RAW_DIR}/{wildcards.dataset} &&\
+        mv {RAW_DIR}/{wildcards.dataset}/data.zarr {PROCESSED_DIR}/{wildcards.dataset}.zarr
+        """
+
+# Download the .zip file for one dataset from BASE_URL (--fail: do not save an HTTP error page as the zip).
+rule download_data:
+    output:
+        (RAW_DIR / "{dataset}.zip")
+    params:
+        file_url=BASE_URL
+    shell:
+        '''
+        curl -L --fail -o {output} "{params.file_url}"
+        '''
\ No newline at end of file
diff --git a/demos/spatialdata-2024/config.yml b/demos/spatialdata-2024/config.yml
new file mode 100644
index 00000000..4f75ebc2
--- /dev/null
+++ b/demos/spatialdata-2024/config.yml
@@ -0,0 +1,10 @@
+output:
+- toy.zarr
+- visium_hd_3.0.0_io.zarr
+- visium_associated_xenium_io.zarr
+- xenium_rep1_io.zarr
+- xenium_rep2_io.zarr
+- mcmicro_io.zarr
+- merfish.zarr
+- mibitof.zarr
+- steinbock_io.zarr
\ No newline at end of file