Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified docs/images/spatialxe-metromap.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
37 changes: 37 additions & 0 deletions modules/local/baysor/preprocess/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Filter a Xenium transcripts parquet with the preprocess_transcripts.py template.
process BAYSOR_PREPROCESS_TRANSCRIPTS {
    tag "$meta.id"
    label 'process_low'

    container "ghcr.io/scverse/spatialdata:spatialdata0.3.0_spatialdata-io0.1.7_spatialdata-plot0.2.9"

    input:
    // meta map plus the transcripts parquet to filter
    tuple val(meta), path(transcripts)
    // filtering thresholds (see meta.yml for their meaning and defaults)
    val(min_qv)
    val(max_x)
    val(min_x)
    val(max_y)
    val(min_y)

    output:
    tuple val(meta), path("*.parquet"), emit: transcripts_parquet
    path("versions.yml")              , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // The module only ships a container, so refuse to run under conda/mamba profiles
    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
        error "BAYSOR_PREPROCESS_TRANSCRIPTS module does not support Conda. Please use Docker / Singularity / Podman instead."
    }

    template 'preprocess_transcripts.py'

    stub:
    """
    touch ${transcripts}.parquet

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        baysor_preprocess_transcripts: "1.0.0"
    END_VERSIONS
    """
}
70 changes: 70 additions & 0 deletions modules/local/baysor/preprocess/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
name: "baysor_preprocess"
description: Filter the transcripts.parquet file based on the specified thresholds
keywords:
- baysor
- transcripts
- filter_transcripts
tools:
- "baysor":
description: "Baysor is a tool that segments cells using spatial gene expression maps. Optionally, segmentation masks can be given as additional input."
homepage: "https://kharchenkolab.github.io/Baysor/dev/"
documentation: "https://kharchenkolab.github.io/Baysor/dev/"
tool_dev_url: "https://github.com/kharchenkolab/Baysor"
doi: "https://doi.org/10.1038/s41587-021-01044-w"
licence: ["MIT license"]
identifier:

input:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test' ]
- transcripts:
type: file
description: transcripts.parquet file from the xenium bundle
pattern: "*.parquet"

- min_qv:
type: float
description: minimum Q-Score to pass filtering (default - 20.0)
- max_x:
type: float
description: only keep transcripts whose x-coordinate is less than specified limit
if no limit is specified, the default value will retain all
transcripts since Xenium slide is <24000 microns in x and y (default - 24000.0)
- min_x:
type: float
description: only keep transcripts whose x-coordinate is greater than specified limit
if no limit is specified, the default minimum value will be 0.0
- max_y:
type: float
description: only keep transcripts whose y-coordinate is less than specified limit
if no limit is specified, the default value will retain all
transcripts since Xenium slide is <24000 microns in x and y (default - 24000.0)
- min_y:
type: float
description: only keep transcripts whose y-coordinate is greater than specified limit
if no limit is specified, the default minimum value will be 0.0

output:
- - transcripts_parquet:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test' ]
- "*.parquet":
type: file
description: filtered transcripts.parquet
pattern: "*filtered_transcripts.parquet"

- - versions:
type: file
description: File containing software versions
pattern: "versions.yml"

authors:
- "@khersameesh24"
maintainers:
- "@khersameesh24"
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env python3

import pandas as pd


def filter_transcripts (
    transcripts: str,
    min_qv: float = 20.0,
    min_x: float = 0.0,
    max_x: float = 24000.0,
    min_y: float = 0.0,
    max_y: float = 24000.0
) -> None:
    """
    Filter transcripts based on the specified thresholds and write them to parquet.

    Rows are kept when their quality score and coordinates fall inside the
    (inclusive) thresholds and their feature name is not a negative control.
    Cell-free transcripts (cell_id == -1) are relabelled with cell_id 0.
    The result is written to the current working directory as
    X<min_x>-<max_x>_Y<min_y>-<max_y>_filtered_transcripts.parquet.

    Args:
        transcripts - path to transcripts parquet
        ----------------------------------- filters --------------------------------------------
        min_qv      - minimum Q-Score to pass filtering (default: 20.0)
        min_x       - only keep transcripts whose x-coordinate is greater than specified limit
                      if no limit is specified, the default minimum value will be 0.0
        max_x       - only keep transcripts whose x-coordinate is less than specified limit
                      if no limit is specified, the default value will retain all
                      transcripts since Xenium slide is <24000 microns in x and y (default: 24000.0)
        min_y       - only keep transcripts whose y-coordinate is greater than specified limit
                      if no limit is specified, the default minimum value will be 0.0
        max_y       - only keep transcripts whose y-coordinate is less than specified limit
                      if no limit is specified, the default value will retain all
                      transcripts since Xenium slide is <24000 microns in x and y (default: 24000.0)

    Returns:
        None - the filtered table is only written to disk.
    """
    df = pd.read_parquet(transcripts, engine="pyarrow")

    # feature-name prefixes that mark negative controls
    control_prefixes = (
        "NegControlProbe_",
        "antisense_",
        "NegControlCodeword_",
        "BLANK_",
    )

    # filter transcripts df with thresholds, ignore negative controls
    keep = (
        (df["qv"] >= min_qv)
        & df["x_location"].between(min_x, max_x)
        & df["y_location"].between(min_y, max_y)
        & ~df["feature_name"].str.startswith(control_prefixes)
    )

    # explicit copy: the assignment below must not target a view of df
    filtered_df = df.loc[keep].copy()

    # change cell_id of cell-free transcripts from -1 to 0
    filtered_df.loc[filtered_df["cell_id"] == -1, "cell_id"] = 0

    # Output filtered transcripts to parquet
    filtered_df.to_parquet(
        f"X{min_x}-{max_x}_Y{min_y}-{max_y}_filtered_transcripts.parquet",
        index=False,
    )


def generate_version_yml() -> None:
    """
    Write versions.yml for this module into the current working directory.

    The tool entry is nested under the process name so the file is a
    process -> {tool: version} mapping, using the same key as the module stub.
    The doubled backslashes are intentional: this file is a Nextflow template,
    which unescapes them once before Python sees the newline escape.
    """
    with open("versions.yml", "w") as yml:
        yml.write('"${task.process}":\\n')
        yml.write('    baysor_preprocess_transcripts: "1.0.0"\\n')


if __name__ == "__main__":

    def _threshold(raw: str, default: float) -> float:
        """Parse a threshold rendered by Nextflow, using the default when it is unset."""
        text = raw.strip()
        if text in ("", "null", "[]"):
            return default
        return float(text)

    # The placeholders below are substituted by Nextflow when the template is rendered
    transcripts: str = "${transcripts}"

    # Forward the thresholds declared as process inputs instead of silently
    # falling back to the function defaults
    filter_transcripts (
        transcripts=transcripts,
        min_qv=_threshold("${min_qv}", 20.0),
        min_x=_threshold("${min_x}", 0.0),
        max_x=_threshold("${max_x}", 24000.0),
        min_y=_threshold("${min_y}", 0.0),
        max_y=_threshold("${max_y}", 24000.0),
    )

    generate_version_yml()
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def convert_parquet (
extension=extension
)

#Output version information
#Output versions.yml
with open("versions.yml", "w") as f:
f.write('"${task.process}":\\n')
f.write(f'spatialconverter: "v0.0.1"\\n')
f.write('spatialconverter: "v0.0.1"\\n')
9 changes: 6 additions & 3 deletions modules/local/spatialdata/write/templates/write.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def main():
outputfolder = "${outputfolder}"
segmented_object = "${segmented_object}"

cells_as_circles=True
cells_boundaries=False
nucleus_boundaries=False
cells_labels=False
Expand All @@ -22,20 +23,22 @@ def main():
if ( segmented_object == 'cells' ):
cells_boundaries=True
cells_labels=True
if ( segmented_object == 'nuclei' ):
elif ( segmented_object == 'nuclei' ):
nucleus_boundaries=True
nucleus_labels=True
if ( segmented_object == 'cells_and_nuclei' ):
elif ( segmented_object == 'cells_and_nuclei' ):
cells_boundaries=True
nucleus_boundaries=True
cells_labels=True
nucleus_labels=True
else:
cells_as_circles=False

format = "${params.format}"
if ( format == "xenium" ):
sd_xenium_obj = xenium(
input_path,
cells_as_circles=True,
cells_as_circles=cells_as_circles,
cells_boundaries=cells_boundaries,
nucleus_boundaries=nucleus_boundaries,
cells_labels=cells_labels,
Expand Down
32 changes: 32 additions & 0 deletions modules/local/utility/segger2xr/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Rewrite a Segger transcripts parquet with the segger2xr.py template.
process SEGGER2XR {
    tag "$meta.id"
    label 'process_low'

    container "ghcr.io/scverse/spatialdata:spatialdata0.3.0_spatialdata-io0.1.7_spatialdata-plot0.2.9"

    input:
    tuple val(meta), path(transcripts)

    output:
    tuple val(meta), path("transcripts.parquet"), emit: transcripts_parquet
    path("versions.yml")                        , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // The module only ships a container, so refuse to run under conda/mamba profiles
    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
        error "SEGGER2XR module does not support Conda. Please use Docker / Singularity / Podman instead."
    }

    template 'segger2xr.py'

    stub:
    // Create exactly the file declared in the output block and report the
    // same version string the template writes
    """
    touch transcripts.parquet

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        segger2xr: "v0.0.1"
    END_VERSIONS
    """
}
61 changes: 61 additions & 0 deletions modules/local/utility/segger2xr/templates/segger2xr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env python3

import pandas as pd
from pathlib import Path
from typing import List

# Expected columns in transcripts.parquet
REQUIRED_COLUMNS: List[str] = [
    "transcript_id",
    "cell_id",
    "overlaps_nucleus",
    "feature_name",
    "x_location",
    "y_location",
    "z_location",
    "qv",
    "segger_id",
]


def refine_transcripts(parquet_path: str) -> pd.DataFrame:
    """
    Replace the cell_id column with segger_id

    The values of segger_id overwrite cell_id in place, so cell_id keeps its
    column position wherever segger_id sits in the table; the segger_id
    column is then removed.

    Args:
        parquet_path - path to the transcripts parquet to refine

    Returns:
        DataFrame with segger_id values stored under cell_id and no segger_id column

    Raises:
        FileNotFoundError - parquet_path does not exist
        ValueError        - one or more of REQUIRED_COLUMNS is missing
    """
    parquet_file = Path(parquet_path)
    if not parquet_file.exists():
        raise FileNotFoundError(f"File not found: {parquet_path}")

    # Read parquet file
    df = pd.read_parquet(parquet_file, engine="pyarrow")

    # Validate required columns
    missing_cols = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Overwrite in place rather than drop + insert: a positional insert breaks
    # when segger_id precedes cell_id, because removing it shifts the index
    df["cell_id"] = df["segger_id"]

    return df.drop(columns=["segger_id"])


def main(input_file: str) -> None:
    """Refine the transcripts found at input_file and save them as transcripts.parquet."""
    refine_transcripts(input_file).to_parquet(
        "transcripts.parquet",
        engine="pyarrow",
    )


if __name__ == "__main__":

    # The placeholder is substituted by Nextflow when the template is rendered
    transcripts = "${transcripts}"

    main(input_file=transcripts)

    # Output versions.yml; the tool entry is nested under the process name so
    # the file is a process -> {tool: version} mapping. The doubled backslashes
    # are unescaped once by the Nextflow template engine.
    with open("versions.yml", "w") as f:
        f.write('"${task.process}":\\n')
        f.write('    segger2xr: "v0.0.1"\\n')
34 changes: 34 additions & 0 deletions modules/local/utility/split_transcripts/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Run the split_transcripts.py template on a transcripts file to produce splits.csv.
process SPLIT_TRANSCRIPTS {
    tag "$meta.id"
    label 'process_low'

    container "ghcr.io/scverse/spatialdata:spatialdata0.3.0_spatialdata-io0.1.7_spatialdata-plot0.2.9"

    input:
    tuple val(meta), path(transcripts)
    val(x_bins)
    val(y_bins)

    output:
    tuple val(meta), path("splits.csv"), emit: splits_csv
    path("versions.yml")               , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // The module only ships a container, so refuse to run under conda/mamba profiles
    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
        error "SPLIT_TRANSCRIPTS module does not support Conda. Please use Docker / Singularity / Podman instead."
    }

    template 'split_transcripts.py'

    stub:
    // Create exactly the file declared in the output block so stub runs succeed
    """
    touch splits.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        baysor_split_parquet: "1.0.0"
    END_VERSIONS
    """
}
Loading