Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion demos/common.smk
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
import platform
from pathlib import Path
import os

# Check if this is running on O2.
# NOTE(review): any Linux host is treated as O2 here; confirm that this is
# the intended detection before running on other Linux machines.
IS_O2 = (platform.system() == "Linux")

if IS_O2:
    # Function-scope style import: only needed on the O2 code path.
    import getpass

    # Prefer $USER (the original behaviour); fall back to getpass when the
    # variable is missing or empty so that the path below can still be built.
    O2_USER = os.environ.get("USER") or getpass.getuser()
    # Scratch layout: /n/scratch/users/<first letter of user>/<user>/...
    O2_SCRATCH_DIR = f"/n/scratch/users/{O2_USER[0]}/{O2_USER}/vitessce-python/demos"

# Directory / file constants
SRC_DIR = Path("src")
# Data lives on the O2 scratch volume when on O2, otherwise in ./data.
DATA_DIR = Path(O2_SCRATCH_DIR) if IS_O2 else Path("data")
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"

Expand Down
36 changes: 36 additions & 0 deletions demos/salcher-2022/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
include: "../common.smk"
configfile: "config.yml"

# May need to get new URLs from https://cellxgene.cziscience.com/collections/edb893ee-4066-4128-9aec-5eb2b03f8287

# The single-cell lung cancer atlas (LuCA) -- extended atlas
H5AD_URL = "https://datasets.cellxgene.cziscience.com/6e5e887d-96f7-40af-908c-9b4fc5057ef9.h5ad"

# Local path of the downloaded file, named after the last component of the
# URL so that replacing H5AD_URL is the only change needed for a new URL.
RAW_H5AD = RAW_DIR / H5AD_URL.rsplit("/", 1)[-1]

# Default target: every entry listed under `output` in config.yml.
rule all:
    input:
        [ (PROCESSED_DIR / f) for f in config['output'] ]

# Convert the downloaded h5ad file to a Zarr store.
rule convert_to_zarr:
    input:
        RAW_H5AD
    output:
        directory(PROCESSED_DIR / "salcher_2022_extended.h5ad.zarr")
    params:
        script=(SRC_DIR / "convert_to_zarr.py")
    shell:
        '''
        python {params.script} \
            -i {input} \
            -o {output}
        '''

# Download raw h5ad file.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# error response body as the .h5ad file.
rule download_adata:
    output:
        RAW_H5AD
    params:
        file_url=H5AD_URL
    shell:
        '''
        curl -L --fail --retry 999 --retry-delay 3 -C - -o {output} "{params.file_url}"
        '''
2 changes: 2 additions & 0 deletions demos/salcher-2022/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
output:
- salcher_2022_extended.h5ad.zarr
70 changes: 70 additions & 0 deletions demos/salcher-2022/src/convert_to_zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import argparse
from anndata import read_h5ad
import scipy
import numpy as np
import pandas as pd
import platform
import os
import zarr
import math

def convert_h5ad_to_zarr(input_path, output_path):
    """Convert an H5AD file to a Zarr store, writing X as a dense array.

    Everything except ``X`` is written with ``AnnData.write_zarr``. The
    sparse ``X`` matrix is then written manually into a zero-initialised
    Zarr array at ``/X`` with chunks of shape ``(n_rows, 10)``, one
    10000 x 10000 block of the sparse matrix at a time.

    Args:
        input_path: Path of the input H5AD file.
        output_path: Path of the output Zarr directory store.

    Raises:
        TypeError: If ``X`` in the input file is not a scipy sparse
            matrix/array. Raised before anything is written.
    """
    # Local import: a bare ``import scipy`` does not by itself guarantee
    # that the ``scipy.sparse`` submodule is available as an attribute.
    from scipy import sparse

    adata = read_h5ad(input_path)

    X = adata.X
    # Validate up front so a bad input does not leave a store without X.
    if not sparse.issparse(X):
        raise TypeError(
            f"Expected adata.X to be a scipy sparse matrix, got {type(X).__name__}"
        )

    # Clear X so that we can write it ourselves manually
    adata.X = None
    adata.write_zarr(output_path)

    print(output_path)

    store = zarr.DirectoryStore(output_path)
    z = zarr.zeros(
        shape=X.shape,
        chunks=(X.shape[0], 10),
        dtype=X.dtype,
        store=store,
        path="/X",
        overwrite=True,
    )

    n_rows, n_cols = X.shape
    block_rows, block_cols = 10000, 10000

    # NOTE(review): each 10000-column block spans many 10-column Zarr chunks
    # and every chunk spans all rows, so chunks are rewritten once per row
    # block. Aligning the blocks with the chunk grid may be worth considering.
    for row_start in range(0, n_rows, block_rows):
        row_end = min(row_start + block_rows, n_rows)
        for col_start in range(0, n_cols, block_cols):
            col_end = min(col_start + block_cols, n_cols)

            block = X[row_start:row_end, col_start:col_end].tocoo(copy=False)
            if block.nnz == 0:
                # Nothing to write: the target array is already all zeros.
                continue

            # Block-local coordinates -> global coordinates (vectorised).
            z.set_coordinate_selection(
                (block.row + row_start, block.col + col_start),
                block.data,
            )

    print("done")

if __name__ == '__main__':
    # Command-line entry point: both the input and output paths are required.
    cli = argparse.ArgumentParser()
    cli.add_argument('-i', '--input', type=str, required=True, help='Input H5AD file')
    cli.add_argument('-o', '--output', type=str, required=True, help='Output Zarr store')
    options = cli.parse_args()

    convert_h5ad_to_zarr(options.input, options.output)
36 changes: 36 additions & 0 deletions demos/sikkema-2023/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
include: "../common.smk"
configfile: "config.yml"

# NOTE(review): the collection link and dataset description that were here
# described LuCA and appear to have been copied from the salcher-2022 demo.
# Confirm the correct CELLxGENE collection for the Sikkema et al. 2023 data.
# May need to get new URLs from https://cellxgene.cziscience.com/collections/edb893ee-4066-4128-9aec-5eb2b03f8287

# Sikkema et al. 2023 -- full atlas (see the output name in config.yml)
H5AD_URL = "https://datasets.cellxgene.cziscience.com/3ab47484-a3eb-4f6a-beea-670e1a8fc1e8.h5ad"

# Local path of the downloaded file, named after the last component of the
# URL so that replacing H5AD_URL is the only change needed for a new URL.
RAW_H5AD = RAW_DIR / H5AD_URL.rsplit("/", 1)[-1]

# Default target: every entry listed under `output` in config.yml.
rule all:
    input:
        [ (PROCESSED_DIR / f) for f in config['output'] ]

# Convert the downloaded h5ad file to a Zarr store.
rule convert_to_zarr:
    input:
        RAW_H5AD
    output:
        directory(PROCESSED_DIR / "sikkema_2023_full.h5ad.zarr")
    params:
        script=(SRC_DIR / "convert_to_zarr.py")
    shell:
        '''
        python {params.script} \
            -i {input} \
            -o {output}
        '''

# Download raw h5ad file.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# error response body as the .h5ad file.
rule download_adata:
    output:
        RAW_H5AD
    params:
        file_url=H5AD_URL
    shell:
        '''
        curl -L --fail --retry 999 --retry-delay 3 -C - -o {output} "{params.file_url}"
        '''
2 changes: 2 additions & 0 deletions demos/sikkema-2023/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
output:
- sikkema_2023_full.h5ad.zarr
41 changes: 41 additions & 0 deletions demos/sikkema-2023/src/convert_to_zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import argparse
from anndata import read_h5ad
from scipy import sparse
from vitessce.data_utils import (
to_uint8,
)


def convert_h5ad_to_zarr(input_path, output_path):
    """Read an H5AD file, add a uint8 layer, and write it out as Zarr.

    The result of ``to_uint8(adata.X, norm_along="var")`` is stored in
    ``adata.layers['X_uint8']``. If ``X`` is a scipy sparse matrix it is
    converted to CSC before writing. The store is written with chunks of
    ``[n_rows, 10]``.

    Args:
        input_path: Path of the input H5AD file.
        output_path: Path of the output Zarr store.
    """
    adata = read_h5ad(input_path)

    uint8_layer = to_uint8(adata.X, norm_along="var")
    adata.layers['X_uint8'] = uint8_layer

    # Vitessce plays nicely with csc matrices
    # TODO: automate conversion to csc in optimize_adata function
    matrix = adata.X
    if isinstance(matrix, sparse.spmatrix):
        adata.X = matrix.tocsc()

    n_rows = adata.shape[0]
    adata.write_zarr(output_path, chunks=[n_rows, 10])


if __name__ == '__main__':
    # Command-line entry point: both the input and output paths are required.
    cli = argparse.ArgumentParser()
    cli.add_argument('-i', '--input', type=str, required=True, help='Input H5AD file')
    cli.add_argument('-o', '--output', type=str, required=True, help='Output Zarr store')
    options = cli.parse_args()

    convert_h5ad_to_zarr(options.input, options.output)
35 changes: 35 additions & 0 deletions demos/spatialdata-2024/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@

include: "../common.smk"
configfile: "config.yml"

# URL template; {dataset} is filled in from the rule's wildcard.
BASE_URL = "https://s3.embl.de/spatialdata/spatialdata-sandbox/{dataset}.zip"


# Default target: every entry listed under `output` in config.yml.
rule all:
    input:
        [ (PROCESSED_DIR / f) for f in config['output'] ]


# Unzip the downloaded zip files
# -o overwrites without prompting, so leftovers from an earlier partial
# extraction cannot stall a non-interactive job.
rule unzip_file:
    input:
        (RAW_DIR / "{dataset}.zip")
    output:
        directory(PROCESSED_DIR / "{dataset}.zarr")
    shell:
        """
        mkdir -p {RAW_DIR}/{wildcards.dataset} &&\
        unzip -o {input} -d {RAW_DIR}/{wildcards.dataset} &&\
        mv {RAW_DIR}/{wildcards.dataset}/data.zarr {PROCESSED_DIR}/{wildcards.dataset}.zarr
        """

# Download the .zip archive for the requested dataset.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# error response body as the .zip file.
rule download_data:
    output:
        (RAW_DIR / "{dataset}.zip")
    params:
        file_url=BASE_URL
    shell:
        '''
        curl -L --fail -o {output} "{params.file_url}"
        '''
10 changes: 10 additions & 0 deletions demos/spatialdata-2024/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
output:
- toy.zarr
- visium_hd_3.0.0_io.zarr
- visium_associated_xenium_io.zarr
- xenium_rep1_io.zarr
- xenium_rep2_io.zarr
- mcmicro_io.zarr
- merfish.zarr
- mibitof.zarr
- steinbock_io.zarr