Skip to content

Commit b4e4926

Browse files
Merge pull request nf-core#95 from khersameesh24/dev
choice of segmentation, changes to baysor, updated metromap
2 parents 488989d + 331ce5c commit b4e4926

22 files changed

Lines changed: 868 additions & 93 deletions

File tree

docs/images/spatialxe-metromap.png

100 KB
Loading
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
process BAYSOR_PREPROCESS_TRANSCRIPTS {
    // Runs the `preprocess_transcripts.py` template on a transcripts file and
    // emits every parquet file it produces together with a versions.yml.
    tag "$meta.id"
    label 'process_low'

    container "ghcr.io/scverse/spatialdata:spatialdata0.3.0_spatialdata-io0.1.7_spatialdata-plot0.2.9"

    input:
    tuple val(meta), path(transcripts)
    val(min_qv)
    val(max_x)
    val(min_x)
    val(max_y)
    val(min_y)

    output:
    tuple val(meta), path("*.parquet"), emit: transcripts_parquet
    path("versions.yml")              , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Only a container image is declared for this module, so conda/mamba
    // profiles are rejected before the template is launched.
    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
        error "BAYSOR_PREPROCESS_TRANSCRIPTS module does not support Conda. Please use Docker / Singularity / Podman instead."
    }

    template 'preprocess_transcripts.py'

    stub:
    """
    touch ${transcripts}.parquet
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        baysor_preprocess_transcripts: "1.0.0"
    END_VERSIONS
    """
}
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
name: "baysor_preprocess"
2+
description: Filter transcripts.parquet file based on the specified thresholds
3+
keywords:
4+
- baysor
5+
- transcripts
6+
- filter_transcripts
7+
tools:
8+
- "baysor":
9+
description: "Baysor is a tool that segments cells using spatial gene expression maps. Optionally, segmentation masks can be given as additional input."
10+
homepage: "https://kharchenkolab.github.io/Baysor/dev/"
11+
documentation: "https://kharchenkolab.github.io/Baysor/dev/"
12+
tool_dev_url: "https://github.com/kharchenkolab/Baysor"
13+
doi: "https://doi.org/10.1038/s41587-021-01044-w"
14+
licence: ["MIT license"]
15+
identifier:
16+
17+
input:
18+
- - meta:
19+
type: map
20+
description: |
21+
Groovy Map containing sample information
22+
e.g. [ id:'test' ]
23+
- transcripts:
24+
type: file
25+
description: transcripts.parquet file from the xenium bundle
26+
pattern: "*.parquet"
27+
28+
- min_qv:
29+
type: float
30+
description: minimum Q-Score to pass filtering (default - 20.0)
31+
- max_x:
32+
type: float
33+
description: Only keep transcripts whose x-coordinate is less than specified limit
34+
if no limit is specified, the default value will retain all
35+
transcripts since Xenium slide is <24000 microns in x and y (default - 24000.0)
36+
- min_x:
37+
type: float
38+
description: only keep transcripts whose x-coordinate is greater than specified limit
39+
if no limit is specified, the default minimum value will be 0.0
40+
- max_y:
41+
type: float
42+
description: only keep transcripts whose y-coordinate is less than specified limit
43+
if no limit is specified, the default value will retain all
44+
transcripts since Xenium slide is <24000 microns in x and y (default - 24000.0)
45+
- min_y:
46+
type: float
47+
description: only keep transcripts whose y-coordinate is greater than specified limit
48+
if no limit is specified, the default minimum value will be 0.0
49+
50+
output:
51+
- - transcripts_parquet:
52+
- meta:
53+
type: map
54+
description: |
55+
Groovy Map containing sample information
56+
e.g. [ id:'test' ]
57+
- "*.parquet":
58+
type: file
59+
description: filtered transcripts.parquet
60+
pattern: "filtered_transcripts.parquet"
61+
62+
- - versions:
63+
type: file
64+
description: File containing software versions
65+
pattern: "versions.yml"
66+
67+
authors:
68+
- "@khersameesh24"
69+
maintainers:
70+
- "@khersameesh24"
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#!/usr/bin/env python3
2+
3+
import pandas as pd
4+
5+
6+
def filter_transcripts(
    transcripts: str,
    min_qv: float = 20.0,
    min_x: float = 0.0,
    max_x: float = 24000.0,
    min_y: float = 0.0,
    max_y: float = 24000.0
) -> None:
    """
    Filter transcripts based on the specified thresholds and write them to parquet

    A row is kept when its qv is at least min_qv, its x/y location lies inside the
    inclusive [min_x, max_x] / [min_y, max_y] window and its feature_name does not
    start with one of the negative-control prefixes. In the kept rows a cell_id
    of -1 is rewritten to 0. The result is written to the current working directory
    as X<min_x>-<max_x>_Y<min_y>-<max_y>_filtered_transcripts.parquet.

    Args:
        transcripts - path to transcripts parquet
        ----------------------------------- filters --------------------------------------------
        min_qv      - minimum Q-Score to pass filtering (default: 20.0)
        min_x       - only keep transcripts whose x-coordinate is greater than specified limit
                      if no limit is specified, the default minimum value will be 0.0
        max_x       - only keep transcripts whose x-coordinate is less than specified limit
                      if no limit is specified, the default value will retain all
                      transcripts since Xenium slide is <24000 microns in x and y (default: 24000.0)
        min_y       - only keep transcripts whose y-coordinate is greater than specified limit
                      if no limit is specified, the default minimum value will be 0.0
        max_y       - only keep transcripts whose y-coordinate is less than specified limit
                      if no limit is specified, the default value will retain all
                      transcripts since Xenium slide is <24000 microns in x and y (default: 24000.0)
    """
    df = pd.read_parquet(transcripts, engine="pyarrow")

    # feature_name prefixes that mark negative controls / blanks
    control_prefixes = (
        "NegControlProbe_",
        "antisense_",
        "NegControlCodeword_",
        "BLANK_",
    )
    # na=False keeps the mask strictly boolean, so rows with a missing
    # feature_name do not break the negation below (they are retained)
    is_control = df["feature_name"].str.startswith(control_prefixes, na=False)

    # filter transcripts df with thresholds, ignore negative controls
    keep = (
        (df["qv"] >= min_qv)
        & df["x_location"].between(min_x, max_x)
        & df["y_location"].between(min_y, max_y)
        & ~is_control
    )
    # explicit copy: the frame is modified below and must not alias df
    filtered_df = df.loc[keep].copy()

    # change cell_id of cell-free transcripts from -1 to 0
    filtered_df.loc[filtered_df["cell_id"] == -1, "cell_id"] = 0

    # Output filtered transcripts to parquet
    filtered_df.to_parquet(
        f"X{min_x}-{max_x}_Y{min_y}-{max_y}_filtered_transcripts.parquet",
        index=False
    )
56+
57+
58+
def generate_version_yml() -> None:
    """
    Write versions.yml for this process into the current working directory

    The tool entry is nested (indented) under the process-name key and uses the
    same key as the process stub block, so real and stub runs report the same
    entry.
    """
    # newlines stay double-escaped like in the other templates of this
    # pipeline; this file is rendered as a Nextflow template before it runs
    with open("versions.yml", "w") as yml:
        yml.write('"${task.process}":\\n')
        yml.write('    baysor_preprocess_transcripts: "1.0.0"\\n')
64+
65+
66+
if __name__ == "__main__":

    def _as_float(raw: str, default: float) -> float:
        """Parse a substituted threshold, falling back to default when unset."""
        raw = raw.strip()
        # NOTE(review): an unset Nextflow value is presumably rendered as an
        # empty string or "null" - confirm against the calling workflow
        if raw in ("", "null", "None"):
            return default
        return float(raw)

    # the placeholders below are the inputs declared by the process that
    # renders this template
    transcripts: str = "${transcripts}"

    # forward the thresholds declared as process inputs instead of silently
    # running with the function defaults
    filter_transcripts(
        transcripts=transcripts,
        min_qv=_as_float("${min_qv}", 20.0),
        min_x=_as_float("${min_x}", 0.0),
        max_x=_as_float("${max_x}", 24000.0),
        min_y=_as_float("${min_y}", 0.0),
        max_y=_as_float("${max_y}", 24000.0),
    )

    generate_version_yml()

modules/local/spatialconverter/parquet_to_csv/templates/parquet_to_csv.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def convert_parquet (
3232
extension=extension
3333
)
3434

35-
#Output version information
35+
#Output versions.yml
3636
with open("versions.yml", "w") as f:
3737
f.write('"${task.process}":\\n')
38-
f.write(f'spatialconverter: "v0.0.1"\\n')
38+
f.write('spatialconverter: "v0.0.1"\\n')

modules/local/spatialdata/write/templates/write.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ def main():
1414
outputfolder = "${outputfolder}"
1515
segmented_object = "${segmented_object}"
1616

17+
cells_as_circles=True
1718
cells_boundaries=False
1819
nucleus_boundaries=False
1920
cells_labels=False
@@ -22,20 +23,22 @@ def main():
2223
if ( segmented_object == 'cells' ):
2324
cells_boundaries=True
2425
cells_labels=True
25-
if ( segmented_object == 'nuclei' ):
26+
elif ( segmented_object == 'nuclei' ):
2627
nucleus_boundaries=True
2728
nucleus_labels=True
28-
if ( segmented_object == 'cells_and_nuclei' ):
29+
elif ( segmented_object == 'cells_and_nuclei' ):
2930
cells_boundaries=True
3031
nucleus_boundaries=True
3132
cells_labels=True
3233
nucleus_labels=True
34+
else:
35+
cells_as_circles=False
3336

3437
format = "${params.format}"
3538
if ( format == "xenium" ):
3639
sd_xenium_obj = xenium(
3740
input_path,
38-
cells_as_circles=True,
41+
cells_as_circles=cells_as_circles,
3942
cells_boundaries=cells_boundaries,
4043
nucleus_boundaries=nucleus_boundaries,
4144
cells_labels=cells_labels,
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
process SEGGER2XR {
    // Runs the `segger2xr.py` template on a transcripts file and emits the
    // rewritten `transcripts.parquet` together with a versions.yml.
    tag "$meta.id"
    label 'process_low'

    container "ghcr.io/scverse/spatialdata:spatialdata0.3.0_spatialdata-io0.1.7_spatialdata-plot0.2.9"

    input:
    tuple val(meta), path(transcripts)

    output:
    tuple val(meta), path("transcripts.parquet"), emit: transcripts_parquet
    path("versions.yml")                        , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Only a container image is declared for this module, so conda/mamba
    // profiles are rejected before the template is launched.
    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
        error "SEGGER2XR module does not support Conda. Please use Docker / Singularity / Podman instead."
    }

    template 'segger2xr.py'

    stub:
    // The stub must create exactly the declared output file name and report
    // the same version string as the real template.
    """
    touch transcripts.parquet
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        segger2xr: "v0.0.1"
    END_VERSIONS
    """
}
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#!/usr/bin/env python3
2+
3+
import pandas as pd
4+
from pathlib import Path
5+
from typing import List
6+
7+
# Expected columns in transcripts.parquet
REQUIRED_COLUMNS: List[str] = [
    "transcript_id",
    "cell_id",
    "overlaps_nucleus",
    "feature_name",
    "x_location",
    "y_location",
    "z_location",
    "qv",
    "segger_id"
]


def refine_transcripts(parquet_path: str) -> pd.DataFrame:
    """
    Replace the cell_id column with segger_id

    The values of segger_id overwrite cell_id in place, so cell_id keeps its
    position among the remaining columns, and the segger_id column is removed.

    Args:
        parquet_path - path to the transcripts parquet file

    Returns:
        the transcripts table with the rewritten cell_id column

    Raises:
        FileNotFoundError - parquet_path does not exist
        ValueError        - one or more of REQUIRED_COLUMNS is missing
    """
    parquet_file = Path(parquet_path)
    if not parquet_file.exists():
        raise FileNotFoundError(f"File not found: {parquet_path}")

    # Read parquet file
    df = pd.read_parquet(parquet_file, engine="pyarrow")

    # Validate required columns
    missing_cols = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Assigning to the existing column keeps its position regardless of where
    # segger_id was stored; inserting at the old index after removing two
    # columns fails when cell_id is the last column.
    df["cell_id"] = df.pop("segger_id")

    return df
45+
46+
47+
def main(input_file: str) -> None:
    """
    Refine the transcripts in input_file and save them as transcripts.parquet

    The output file is written to the current working directory.
    """
    refine_transcripts(input_file).to_parquet(
        "transcripts.parquet",
        engine="pyarrow",
    )
50+
51+
52+
if __name__ == "__main__":

    # placeholder for the process input; it is filled in when the template
    # is rendered
    transcripts = "${transcripts}"

    main(input_file=transcripts)

    # Output versions.yml; the tool entry is nested (indented) under the
    # process-name key so the file parses as a map of tool versions per process
    with open("versions.yml", "w") as f:
        f.write('"${task.process}":\\n')
        f.write('    segger2xr: "v0.0.1"\\n')
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
process SPLIT_TRANSCRIPTS {
    // Runs the `split_transcripts.py` template on a transcripts file and emits
    // the resulting `splits.csv` together with a versions.yml.
    tag "$meta.id"
    label 'process_low'

    container "ghcr.io/scverse/spatialdata:spatialdata0.3.0_spatialdata-io0.1.7_spatialdata-plot0.2.9"

    input:
    tuple val(meta), path(transcripts)
    val(x_bins)
    val(y_bins)

    output:
    tuple val(meta), path("splits.csv"), emit: splits_csv
    path("versions.yml")               , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Only a container image is declared for this module, so conda/mamba
    // profiles are rejected before the template is launched.
    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
        error "SPLIT_TRANSCRIPTS module does not support Conda. Please use Docker / Singularity / Podman instead."
    }

    template 'split_transcripts.py'

    stub:
    // The stub has to create the declared output (`splits.csv`); otherwise a
    // stub run fails on a missing output file.
    """
    touch splits.csv
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        baysor_split_parquet: "1.0.0"
    END_VERSIONS
    """
}

0 commit comments

Comments
 (0)