khersameesh24
diff --git a/‎docs/images/spatialxe-metromap.png‎
100 KB b/‎docs/images/spatialxe-metromap.png‎
100 KB
diff --git a/‎modules/local/baysor/preprocess/main.nf‎
Lines changed: 37 additions & 0 deletions b/‎modules/local/baysor/preprocess/main.nf‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎modules/local/baysor/preprocess/meta.yml‎
Lines changed: 70 additions & 0 deletions b/‎modules/local/baysor/preprocess/meta.yml‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎modules/local/baysor/preprocess/templates/preprocess_transcripts.py‎
Lines changed: 74 additions & 0 deletions b/‎modules/local/baysor/preprocess/templates/preprocess_transcripts.py‎
Lines changed: 74 additions & 0 deletions
diff --git a/‎modules/local/spatialconverter/parquet_to_csv/templates/parquet_to_csv.py‎
Lines changed: 2 additions & 2 deletions b/‎modules/local/spatialconverter/parquet_to_csv/templates/parquet_to_csv.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎modules/local/spatialdata/write/templates/write.py‎
Lines changed: 6 additions & 3 deletions b/‎modules/local/spatialdata/write/templates/write.py‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎modules/local/utility/segger2xr/main.nf‎
Lines changed: 32 additions & 0 deletions b/‎modules/local/utility/segger2xr/main.nf‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎modules/local/utility/segger2xr/templates/segger2xr.py‎
Lines changed: 61 additions & 0 deletions b/‎modules/local/utility/segger2xr/templates/segger2xr.py‎
Lines changed: 61 additions & 0 deletions
diff --git a/‎modules/local/utility/split_transcripts/main.nf‎
Lines changed: 34 additions & 0 deletions b/‎modules/local/utility/split_transcripts/main.nf‎
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,37 @@
+// Filters a transcripts parquet by rendering and running the preprocess_transcripts.py template.
+process BAYSOR_PREPROCESS_TRANSCRIPTS {
+    tag "$meta.id"
+    label 'process_low'
+
+    container "ghcr.io/scverse/spatialdata:spatialdata0.3.0_spatialdata-io0.1.7_spatialdata-plot0.2.9"
+
+    input:
+    tuple val(meta), path(transcripts)
+    // filter thresholds, described in this module's meta.yml
+    val(min_qv)
+    val(max_x)
+    val(min_x)
+    val(max_y)
+    val(min_y)
+
+    output:
+    tuple val(meta), path("*.parquet"), emit: transcripts_parquet
+    path("versions.yml")              , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // only a container is declared for this module, so conda / mamba profiles are rejected
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        error "BAYSOR_PREPROCESS_TRANSCRIPTS module does not support Conda. Please use Docker / Singularity / Podman instead."
+    }
+
+    template 'preprocess_transcripts.py'
+
+    stub:
+    """
+    touch ${transcripts}.parquet
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        baysor_preprocess_transcripts: "1.0.0"
+    END_VERSIONS
+    """
+}
@@ -0,0 +1,70 @@
+name: "baysor_preprocess"
+description: Filter the transcripts.parquet file based on the specified thresholds
+keywords:
+  - baysor
+  - transcripts
+  - filter_transcripts
+tools:
+  - "baysor":
+    description: "Baysor is a tool that segments cells using spatial gene expression maps. Optionally, segmentation masks can be given as additional input."
+    homepage: "https://kharchenkolab.github.io/Baysor/dev/"
+    documentation: "https://kharchenkolab.github.io/Baysor/dev/"
+    tool_dev_url: "https://github.com/kharchenkolab/Baysor"
+    doi: "https://doi.org/10.1038/s41587-021-01044-w"
+    licence: ["MIT license"]
+    identifier:
+
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test' ]
+    - transcripts:
+        type: file
+        description: transcripts.parquet file from the xenium bundle
+        pattern: "*.parquet"
+
+  - min_qv:
+      type: float
+      description: minimum Q-Score to pass filtering (default - 20.0)
+  - max_x:
+      type: float
+      description: Only keep transcripts whose x-coordinate is less than specified limit
+        if no limit is specified, the default value will retain all
+        transcripts since Xenium slide is <24000 microns in x and y (default - 24000.0)
+  - min_x:
+      type: float
+      description: only keep transcripts whose x-coordinate is greater than specified limit
+        if no limit is specified, the default minimum value will be 0.0
+  - max_y:
+      type: float
+      description: only keep transcripts whose y-coordinate is less than specified limit
+        if no limit is specified, the default value will retain all
+        transcripts since Xenium slide is <24000 microns in x and y (default - 24000.0)
+  - min_y:
+      type: float
+      description: only keep transcripts whose y-coordinate is greater than specified limit
+        if no limit is specified, the default minimum value will be 0.0
+
+output:
+  - - transcripts_parquet:
+        - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test' ]
+        - "*.parquet":
+          type: file
+          description: filtered transcripts.parquet
+          pattern: "*filtered_transcripts.parquet"
+
+  - - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@khersameesh24"
+maintainers:
+  - "@khersameesh24"
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+
+
+def filter_transcripts (
+    transcripts: str,
+    min_qv: float = 20.0,
+    min_x: float = 0.0,
+    max_x: float = 24000.0,
+    min_y: float = 0.0,
+    max_y: float = 24000.0
+) -> None:
+    """
+    Filter transcripts based on the specified thresholds
+
+    Args:
+    transcripts - path to transcripts parquet
+    ----------------------------------- filters --------------------------------------------
+    min_qv - minimum Q-Score to pass filtering (default: 20.0)
+    min_x  - only keep transcripts whose x-coordinate is greater than specified limit
+             if no limit is specified, the default minimum value will be 0.0
+    max_x  - only keep transcripts whose x-coordinate is less than specified limit
+             if no limit is specified, the default value will retain all
+             transcripts since Xenium slide is <24000 microns in x and y (default: 24000.0)
+    min_y  - only keep transcripts whose y-coordinate is greater than specified limit
+             if no limit is specified, the default minimum value will be 0.0
+    max_y  - only keep transcripts whose y-coordinate is less than specified limit
+             if no limit is specified, the default value will retain all
+             transcripts since Xenium slide is <24000 microns in x and y (default: 24000.0)
+    """
+    df = pd.read_parquet(transcripts, engine = 'pyarrow')
+
+    # filter transcripts df with thresholds, ignore negative controls
+    filtered_df = df[(df["qv"] >= min_qv) &
+                                (df["x_location"] >= min_x) &
+                                (df["x_location"] <= max_x) &
+                                (df["y_location"] >= min_y) &
+                                (df["y_location"] <= max_y) &
+                                (~df["feature_name"].str.startswith("NegControlProbe_")) &
+                                (~df["feature_name"].str.startswith("antisense_")) &
+                                (~df["feature_name"].str.startswith("NegControlCodeword_")) &
+                                (~df["feature_name"].str.startswith("BLANK_"))]
+
+    # change cell_id of cell-free transcripts from -1 to 0
+    neg_cell_row = filtered_df["cell_id"] == -1
+    filtered_df.loc[neg_cell_row,"cell_id"] = 0
+
+    # Output filtered transcripts to parquet
+    filtered_df.to_parquet(
+        '_'.join(["X"+str(min_x)+"-"+str(max_x), "Y"+str(min_y)+"-"+str(max_y), "filtered_transcripts.parquet"]),
+        index=False
+    )
+
+    return None
+
+
+def generate_version_yml() -> None:
+    with open("versions.yml", "w") as yml:
+        yml.write('"${task.process}":\\n')
+        yml.write("Baysor-Preprocess Transcripts: 1.0.0'\\n")
+
+    return None
+
+
+if __name__ == "__main__":
+
+    transcripts: str = "${transcripts}"
+
+    filter_transcripts (
+        transcripts=transcripts,
+    )
+
+    generate_version_yml()
@@ -32,7 +32,7 @@ def convert_parquet (
         extension=extension
     )
 
-    #Output version information
+    #Output versions.yml
     with open("versions.yml", "w") as f:
         f.write('"${task.process}":\\n')
-        f.write(f'spatialconverter: "v0.0.1"\\n')
+        f.write('spatialconverter: "v0.0.1"\\n')
@@ -14,6 +14,7 @@ def main():
     outputfolder = "${outputfolder}"
     segmented_object = "${segmented_object}"
 
+    cells_as_circles=True
     cells_boundaries=False
     nucleus_boundaries=False
     cells_labels=False
@@ -22,20 +23,22 @@ def main():
     if ( segmented_object == 'cells' ):
         cells_boundaries=True
         cells_labels=True
-    if ( segmented_object == 'nuclei' ):
+    elif ( segmented_object == 'nuclei' ):
         nucleus_boundaries=True
         nucleus_labels=True
-    if ( segmented_object == 'cells_and_nuclei' ):
+    elif ( segmented_object == 'cells_and_nuclei' ):
         cells_boundaries=True
         nucleus_boundaries=True
         cells_labels=True
         nucleus_labels=True
+    else:
+        cells_as_circles=False
 
     format = "${params.format}"
     if ( format == "xenium" ):
         sd_xenium_obj = xenium(
             input_path,
-            cells_as_circles=True,
+            cells_as_circles=cells_as_circles,
             cells_boundaries=cells_boundaries,
             nucleus_boundaries=nucleus_boundaries,
             cells_labels=cells_labels,
 
@@ -0,0 +1,32 @@
+// Rewrites a transcripts parquet by rendering and running the segger2xr.py template.
+process SEGGER2XR {
+    tag "$meta.id"
+    label 'process_low'
+
+    container "ghcr.io/scverse/spatialdata:spatialdata0.3.0_spatialdata-io0.1.7_spatialdata-plot0.2.9"
+
+    input:
+    tuple val(meta), path(transcripts)
+
+    output:
+    tuple val(meta), path("transcripts.parquet"), emit: transcripts_parquet
+    path("versions.yml")                        , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // only a container is declared for this module, so conda / mamba profiles are rejected
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        error "SEGGER2XR module does not support Conda. Please use Docker / Singularity / Podman instead."
+    }
+
+    template 'segger2xr.py'
+
+    // the stub creates the file named in the output block and reports the
+    // same version string the template writes
+    stub:
+    """
+    touch transcripts.parquet
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        segger2xr: "v0.0.1"
+    END_VERSIONS
+    """
+}
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+from pathlib import Path
+from typing import List
+
+# Columns refine_transcripts expects in transcripts.parquet; a missing one raises ValueError
+REQUIRED_COLUMNS: List[str] = [
+    "transcript_id",
+    "cell_id",
+    "overlaps_nucleus",
+    "feature_name",
+    "x_location",
+    "y_location",
+    "z_location",
+    "qv",
+    "segger_id"
+]
+
+def refine_transcripts(parquet_path: str) -> pd.DataFrame:
+    """
+    Replace the cell_id column with segger_id
+    """
+    parquet_file = Path(parquet_path)
+    if not parquet_file.exists():
+        raise FileNotFoundError(f"File not found: {parquet_path}")
+
+    # Read parquet file
+    df = pd.read_parquet(parquet_file, engine="pyarrow")
+
+    # Validate required columns
+    missing_cols = [col for col in REQUIRED_COLUMNS if col not in df.columns]
+    if missing_cols:
+        raise ValueError(f"Missing required columns: {missing_cols}")
+
+    # get 'cell_id' column index
+    cell_id_index = df.columns.get_loc("cell_id")
+
+    # Drop 'cell_id' and insert 'segger_id' at the same position
+    df = df.drop(columns=["cell_id"])
+    segger_series = df.pop("segger_id")
+    df.insert(cell_id_index, "cell_id", segger_series)
+
+    return df
+
+
+def main(input_file: str) -> None:
+    transcripts = refine_transcripts(input_file)
+    transcripts.to_parquet("transcripts.parquet", engine="pyarrow")
+
+
+if __name__ == "__main__":
+
+    transcripts = "${transcripts}"
+
+    main(input_file=transcripts)
+
+    #Output versions.yml
+    with open("versions.yml", "w") as f:
+        f.write('"${task.process}":\\n')
+        f.write('segger2xr: "v0.0.1"\\n')
@@ -0,0 +1,34 @@
+// Produces splits.csv from a transcripts file by rendering and running the split_transcripts.py template.
+process SPLIT_TRANSCRIPTS {
+    tag "$meta.id"
+    label 'process_low'
+
+    container "ghcr.io/scverse/spatialdata:spatialdata0.3.0_spatialdata-io0.1.7_spatialdata-plot0.2.9"
+
+    input:
+    tuple val(meta), path(transcripts)
+    val(x_bins)
+    val(y_bins)
+
+    output:
+    tuple val(meta), path("splits.csv"), emit: splits_csv
+    path("versions.yml")               , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // only a container is declared for this module, so conda / mamba profiles are rejected
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        error "SPLIT_TRANSCRIPTS module does not support Conda. Please use Docker / Singularity / Podman instead."
+    }
+
+    template 'split_transcripts.py'
+
+    // the stub must create the file named in the output block, otherwise a
+    // stub run fails on the missing splits.csv
+    stub:
+    """
+    touch splits.csv
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        baysor_split_parquet: "1.0.0"
+    END_VERSIONS
+    """
+}
Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@ def convert_parquet (`
`32`	`32`	`extension=extension`
`33`	`33`	`)`
`34`	`34`
`35`		`- #Output version information`
	`35`	`+ #Output versions.yml`
`36`	`36`	`with open("versions.yml", "w") as f:`
`37`	`37`	`f.write('"${task.process}":\\n')`
`38`		`- f.write(f'spatialconverter: "v0.0.1"\\n')`
	`38`	`+ f.write('spatialconverter: "v0.0.1"\\n')`