From ea46ad0222f7ab572b30638aab418a04e9f7fcc7 Mon Sep 17 00:00:00 2001
From: an-altosian
Date: Tue, 5 May 2026 16:10:25 +0000
Subject: [PATCH 1/3] refactor(modules): convert all 19 module-level bin
 scripts to Nextflow templates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The nf-core core team requires that no module-level `bin/` directory
ships in a release (PR #139, comment r3185699629 from awgymer). Move
every per-module `resources/usr/bin/*.py` into `templates/*.py` and
invoke each via Nextflow's `template` directive in the `script:` block.

- 18 modules: mechanical conversion. Replace argparse with module-top
  Groovy interpolation (RAW_BUNDLE = "${raw_bundle}", etc.). For modules
  with task.ext.args (ficture/preprocess, segger/predict,
  segger/create_dataset, xenium_patch/stitch): inject ARGS = "${args}",
  rebuild sys.argv via shlex.split, and keep the existing argparse code
  path.
- segger/create_dataset: the NUMBA_CACHE_DIR setup moves from the shell
  prelude into Python, BEFORE the numba/torch imports (a file-level
  # ruff: noqa: E402 documents the load-order requirement).
- xenium_patch/stitch: stitch_transcripts.py and stitch_postprocess.py
  are merged into one templates/stitch.py orchestrator that calls both
  phases inline (the template directive takes a single file).

Containers, inputs, outputs, and stub blocks are unchanged across all
19 modules. Pipeline-level bin/ scripts (divide_transcripts.py and
stitch_transcripts.py, used by xenium_patch/divide and
utility/reconstruct_patches) are untouched; pipeline-level bin/ remains
nf-core compliant.
---
 modules/local/baysor/create_dataset/main.nf | 7 +-
 .../usr/bin => templates}/create_dataset.py | 29 +---
 modules/local/baysor/preprocess/main.nf | 11 +-
 .../preprocess_transcripts.py | 63 ++------
 modules/local/ficture/preprocess/main.nf | 9 +-
 .../bin => templates}/ficture_preprocess.py | 36 +++--
 modules/local/segger/create_dataset/main.nf | 14 +-
 .../bin => templates}/run_create_dataset.py | 61 ++++----
 modules/local/segger/predict/main.nf | 14 +-
 .../usr/bin => templates}/run_predict.py | 32 ++++
 modules/local/spatialdata/merge/main.nf | 8 +-
 .../bin => templates}/spatialdata_merge.py | 43 ++----
 modules/local/spatialdata/meta/main.nf | 9 +-
 .../usr/bin => templates}/spatialdata_meta.py | 32 ++--
 modules/local/spatialdata/write/main.nf | 10 +-
 .../bin => templates}/spatialdata_write.py | 40 +++--
 .../local/utility/convert_mask_uint32/main.nf | 7 +-
 .../bin => templates}/convert_mask_uint32.py | 23 +--
 .../utility/downscale_morphology/main.nf | 9 +-
 .../bin => templates}/downscale_morphology.py | 28 ++--
 modules/local/utility/extract_dapi/main.nf | 8 +-
 .../usr/bin => templates}/extract_dapi.py | 31 +---
 .../utility/extract_preview_data/main.nf | 6 +-
 .../usr/bin => templates}/extract_data.py | 29 +---
 modules/local/utility/get_coordinates/main.nf | 5 +-
 .../usr/bin => templates}/get_coordinates.py | 21 +--
 modules/local/utility/parquet_to_csv/main.nf | 7 +-
 .../usr/bin => templates}/parquet_to_csv.py | 36 +----
 modules/local/utility/resize_tif/main.nf | 9 +-
 .../usr/bin => templates}/resize_tif.py | 33 ++---
 modules/local/utility/segger2xr/main.nf | 9 +-
 .../usr/bin => templates}/segger2xr.py | 37 +----
 .../local/utility/split_transcripts/main.nf | 10 +-
 .../bin => templates}/split_transcripts.py | 46 ++----
 modules/local/utility/upscale_mask/main.nf | 8 +-
 .../usr/bin => templates}/upscale_mask.py | 24 +--
 modules/local/xenium_patch/stitch/main.nf | 12 +-
 .../resources/usr/bin/stitch_postprocess.py | 98 ------------
 .../stitch.py} | 140 ++++++++++++++++--
 39 files changed, 393 insertions(+), 661 deletions(-)
 rename modules/local/baysor/create_dataset/{resources/usr/bin => templates}/create_dataset.py (76%)
 mode change 100755 => 100644
 rename modules/local/baysor/preprocess/{resources/usr/bin => templates}/preprocess_transcripts.py (66%)
 mode change 100755 => 100644
 rename modules/local/ficture/preprocess/{resources/usr/bin => templates}/ficture_preprocess.py (83%)
 mode change 100755 => 100644
 rename modules/local/segger/create_dataset/{resources/usr/bin => templates}/run_create_dataset.py (87%)
 mode change 100755 => 100644
 rename modules/local/segger/predict/{resources/usr/bin => templates}/run_predict.py (83%)
 mode change 100755 => 100644
 rename modules/local/spatialdata/merge/{resources/usr/bin => templates}/spatialdata_merge.py (55%)
 mode change 100755 => 100644
 rename modules/local/spatialdata/meta/{resources/usr/bin => templates}/spatialdata_meta.py (78%)
 mode change 100755 => 100644
 rename modules/local/spatialdata/write/{resources/usr/bin => templates}/spatialdata_write.py (78%)
 mode change 100755 => 100644
 rename modules/local/utility/convert_mask_uint32/{resources/usr/bin => templates}/convert_mask_uint32.py (60%)
 mode change 100755 => 100644
 rename modules/local/utility/downscale_morphology/{resources/usr/bin => templates}/downscale_morphology.py (79%)
 mode change 100755 => 100644
 rename modules/local/utility/extract_dapi/{resources/usr/bin => templates}/extract_dapi.py (60%)
 mode change 100755 => 100644
 rename modules/local/utility/extract_preview_data/{resources/usr/bin => templates}/extract_data.py (90%)
 mode change 100755 => 100644
 rename modules/local/utility/get_coordinates/{resources/usr/bin => templates}/get_coordinates.py (71%)
 mode change 100755 => 100644
 rename modules/local/utility/parquet_to_csv/{resources/usr/bin => templates}/parquet_to_csv.py (57%)
 mode change 100755 => 100644
 rename modules/local/utility/resize_tif/{resources/usr/bin => templates}/resize_tif.py (80%)
 mode change 100755 => 100644
 rename modules/local/utility/segger2xr/{resources/usr/bin => templates}/segger2xr.py (90%)
 mode change 100755 => 100644
 rename modules/local/utility/split_transcripts/{resources/usr/bin => templates}/split_transcripts.py (68%)
 mode change 100755 => 100644
 rename modules/local/utility/upscale_mask/{resources/usr/bin => templates}/upscale_mask.py (71%)
 mode change 100755 => 100644
 delete mode 100755 modules/local/xenium_patch/stitch/resources/usr/bin/stitch_postprocess.py
 rename modules/local/xenium_patch/stitch/{resources/usr/bin/stitch_transcripts.py => templates/stitch.py} (85%)
 mode change 100755 => 100644

diff --git a/modules/local/baysor/create_dataset/main.nf b/modules/local/baysor/create_dataset/main.nf
index d4e0043e..91aa67a0 100644
--- a/modules/local/baysor/create_dataset/main.nf
+++ b/modules/local/baysor/create_dataset/main.nf
@@ -23,12 +23,7 @@ process BAYSOR_CREATE_DATASET {
 
     prefix = task.ext.prefix ?: "${meta.id}"
 
-    """
-    create_dataset.py \\
-        --transcripts ${transcripts} \\
-        --sample-fraction ${sample_fraction} \\
-        --prefix ${prefix}
-    """
+    template 'create_dataset.py'
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
diff --git a/modules/local/baysor/create_dataset/resources/usr/bin/create_dataset.py b/modules/local/baysor/create_dataset/templates/create_dataset.py
old mode 100755
new mode 100644
similarity index 76%
rename from modules/local/baysor/create_dataset/resources/usr/bin/create_dataset.py
rename to modules/local/baysor/create_dataset/templates/create_dataset.py
index 4e5a263a..bbf05320
--- a/modules/local/baysor/create_dataset/resources/usr/bin/create_dataset.py
+++ b/modules/local/baysor/create_dataset/templates/create_dataset.py
@@ -6,12 +6,16 @@
 writing the result to a new CSV file.
 """
 
-import argparse
 import csv
 import os
 import random
 from pathlib import Path
 
+# Nextflow-injected variables
+TRANSCRIPTS = "${transcripts}"
+SAMPLE_FRACTION = "${sample_fraction}"
+PREFIX = "${prefix}"
+
 
 class BaysorPreview():
     """
@@ -62,31 +66,14 @@ def main() -> None:
     """
     Run create dataset as nf module
     """
-    parser = argparse.ArgumentParser(
-        description="Create sampled dataset for Baysor preview"
-    )
-    parser.add_argument(
-        "--transcripts", required=True,
-        help="Path to transcripts CSV file"
-    )
-    parser.add_argument(
-        "--sample-fraction", required=True, type=float,
-        help="Fraction of rows to sample"
-    )
-    parser.add_argument(
-        "--prefix", required=True,
-        help="Output directory prefix"
-    )
-    args = parser.parse_args()
-
     sampled_transcripts = "sampled_transcripts.csv"
 
     # generate dataset
     BaysorPreview.generate_dataset(
-        transcripts=args.transcripts,
+        transcripts=TRANSCRIPTS,
         sampled_transcripts=sampled_transcripts,
-        sample_fraction=args.sample_fraction,
-        prefix=args.prefix
+        sample_fraction=float(SAMPLE_FRACTION),
+        prefix=PREFIX,
     )
 
     return None
diff --git a/modules/local/baysor/preprocess/main.nf b/modules/local/baysor/preprocess/main.nf
index 7b3c6ac8..b11479d1 100644
--- a/modules/local/baysor/preprocess/main.nf
+++ b/modules/local/baysor/preprocess/main.nf
@@ -29,16 +29,7 @@ process BAYSOR_PREPROCESS_TRANSCRIPTS {
 
     prefix = task.ext.prefix ?: "${meta.id}"
 
-    """
-    preprocess_transcripts.py \\
-        --transcripts ${transcripts} \\
-        --prefix ${prefix} \\
-        --min-qv ${min_qv} \\
-        --min-x ${min_x} \\
-        --max-x ${max_x} \\
-        --min-y ${min_y} \\
-        --max-y ${max_y}
-    """
+    template 'preprocess_transcripts.py'
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
diff --git a/modules/local/baysor/preprocess/resources/usr/bin/preprocess_transcripts.py b/modules/local/baysor/preprocess/templates/preprocess_transcripts.py
old mode 100755
new mode 100644
similarity index 66%
rename from modules/local/baysor/preprocess/resources/usr/bin/preprocess_transcripts.py
rename to modules/local/baysor/preprocess/templates/preprocess_transcripts.py
index 2662f83c..25bc41dc
--- a/modules/local/baysor/preprocess/resources/usr/bin/preprocess_transcripts.py
+++ b/modules/local/baysor/preprocess/templates/preprocess_transcripts.py
@@ -6,11 +6,19 @@
 removes negative control probes, and outputs filtered CSV for Baysor
 compatibility.
 """
-import argparse
 import os
 
 import pandas as pd
 
+# Nextflow-injected variables
+TRANSCRIPTS = "${transcripts}"
+PREFIX = "${prefix}"
+MIN_QV = "${min_qv}"
+MIN_X = "${min_x}"
+MAX_X = "${max_x}"
+MIN_Y = "${min_y}"
+MAX_Y = "${max_y}"
+
 
 def filter_transcripts(
     transcripts: str,
@@ -70,53 +78,14 @@ def main() -> None:
     """
     Run preprocess transcripts as nf module.
""" - parser = argparse.ArgumentParser( - description="Preprocess Xenium transcripts for Baysor" - ) - parser.add_argument( - "--transcripts", required=True, help="Path to transcripts parquet file" - ) - parser.add_argument("--prefix", required=True, help="Output directory prefix") - parser.add_argument( - "--min-qv", - type=float, - default=20.0, - help="Minimum Q-Score threshold (default: 20.0)", - ) - parser.add_argument( - "--min-x", - type=float, - default=0.0, - help="Minimum x-coordinate threshold (default: 0.0)", - ) - parser.add_argument( - "--max-x", - type=float, - default=24000.0, - help="Maximum x-coordinate threshold (default: 24000.0)", - ) - parser.add_argument( - "--min-y", - type=float, - default=0.0, - help="Minimum y-coordinate threshold (default: 0.0)", - ) - parser.add_argument( - "--max-y", - type=float, - default=24000.0, - help="Maximum y-coordinate threshold (default: 24000.0)", - ) - args = parser.parse_args() - filter_transcripts( - transcripts=args.transcripts, - min_qv=args.min_qv, - min_x=args.min_x, - max_x=args.max_x, - min_y=args.min_y, - max_y=args.max_y, - prefix=args.prefix, + transcripts=TRANSCRIPTS, + min_qv=float(MIN_QV), + min_x=float(MIN_X), + max_x=float(MAX_X), + min_y=float(MIN_Y), + max_y=float(MAX_Y), + prefix=PREFIX, ) return None diff --git a/modules/local/ficture/preprocess/main.nf b/modules/local/ficture/preprocess/main.nf index 7ec3c081..76005c10 100644 --- a/modules/local/ficture/preprocess/main.nf +++ b/modules/local/ficture/preprocess/main.nf @@ -21,15 +21,8 @@ process FICTURE_PREPROCESS { script: def args = task.ext.args ?: '' - def features_arg = features ? "--features ${features}" : "" - """ - ficture_preprocess.py \\ - --transcripts ${transcripts} \\ - ${features_arg} \\ - --negative-control-regex '${params.negative_control_regex}' \\ - ${args} - """ + template 'ficture_preprocess.py' stub: """ diff --git a/modules/local/ficture/preprocess/resources/usr/bin/ficture_preprocess.py b/modules/local/ficture/preprocess/templates/ficture_preprocess.py old mode 100755 new mode 100644 similarity index 83% rename from modules/local/ficture/preprocess/resources/usr/bin/ficture_preprocess.py rename to modules/local/ficture/preprocess/templates/ficture_preprocess.py index 2e0c687c..92ae9721 --- a/modules/local/ficture/preprocess/resources/usr/bin/ficture_preprocess.py +++ b/modules/local/ficture/preprocess/templates/ficture_preprocess.py @@ -1,18 +1,40 @@ #!/usr/bin/env python3 """Preprocess Xenium transcripts for FICTURE analysis.""" -import argparse import gzip import logging import os import re +import shlex import sys import pandas as pd +# Nextflow-injected variables +TRANSCRIPTS = "${transcripts}" +FEATURES = "${features}" +NEGATIVE_CONTROL_REGEX = "${params.negative_control_regex}" +ARGS = "${args}" + + +def main(): + """Run FICTURE preprocessing.""" + print("[START]") + + # Re-build argv so parse_known_args() can absorb any extra flags from task.ext.args + sys.argv = [ + sys.argv[0], + "--transcripts", + TRANSCRIPTS, + "--negative-control-regex", + NEGATIVE_CONTROL_REGEX, + ] + if FEATURES: + sys.argv += ["--features", FEATURES] + sys.argv += shlex.split(ARGS) + + import argparse -def parse_args(): - """Parse command-line arguments.""" parser = argparse.ArgumentParser( description="Preprocess Xenium transcripts for FICTURE" ) @@ -25,13 +47,7 @@ def parse_args(): parser.add_argument( "--negative-control-regex", default="", help="Regex for negative control probes" ) - return parser.parse_args() - - -def main(): - """Run FICTURE 
-    args = parse_args()
-    print("[START]")
+    args, _ = parser.parse_known_args()
 
     negctrl_regex = "BLANK|NegCon"
     if args.negative_control_regex:
diff --git a/modules/local/segger/create_dataset/main.nf b/modules/local/segger/create_dataset/main.nf
index 81320eff..2b03d2c9 100644
--- a/modules/local/segger/create_dataset/main.nf
+++ b/modules/local/segger/create_dataset/main.nf
@@ -29,19 +29,7 @@ process SEGGER_CREATE_DATASET {
         error("${params.format} is an invalid platform type.")
     }
 
-    """
-    export NUMBA_CACHE_DIR=\$PWD/.numba_cache
-    mkdir -p \$NUMBA_CACHE_DIR
-
-    run_create_dataset.py \\
-        --bundle-dir ${base_dir} \\
-        --output-dir ${prefix} \\
-        --sample-type ${params.format} \\
-        --tile-width ${params.tile_width} \\
-        --tile-height ${params.tile_height} \\
-        --n-workers ${task.cpus} \\
-        ${args}
-    """
+    template 'run_create_dataset.py'
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
diff --git a/modules/local/segger/create_dataset/resources/usr/bin/run_create_dataset.py b/modules/local/segger/create_dataset/templates/run_create_dataset.py
old mode 100755
new mode 100644
similarity index 87%
rename from modules/local/segger/create_dataset/resources/usr/bin/run_create_dataset.py
rename to modules/local/segger/create_dataset/templates/run_create_dataset.py
index c73ab006..28df3cff
--- a/modules/local/segger/create_dataset/resources/usr/bin/run_create_dataset.py
+++ b/modules/local/segger/create_dataset/templates/run_create_dataset.py
@@ -11,8 +11,14 @@
 Each WORKAROUND should be removable when the upstream segger bug is fixed.
 """
 
-import argparse
+# ruff: noqa: E402 -- NUMBA_CACHE_DIR must be set before subsequent imports
+# (segger / torch transitively pull in numba). Order matters; do not reorder.
 import os
+
+os.environ["NUMBA_CACHE_DIR"] = os.path.join(os.getcwd(), ".numba_cache")
+os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
+
+import shlex
 import shutil
 import subprocess
 import sys
@@ -26,18 +32,14 @@
 
 SEGGER_CLI = "/workspace/segger_dev/src/segger/cli/create_dataset_fast.py"
 
-
-def parse_args():
-    p = argparse.ArgumentParser()
-    p.add_argument("--bundle-dir", required=True)
-    p.add_argument("--output-dir", required=True)
-    p.add_argument("--sample-type", required=True, choices=["xenium"])
-    p.add_argument("--tile-width", type=int, required=True)
-    p.add_argument("--tile-height", type=int, required=True)
-    p.add_argument("--n-workers", type=int, required=True)
-    # remaining args forwarded to segger CLI
-    args, extra = p.parse_known_args()
-    return args, extra
+# Nextflow-injected variables
+BUNDLE_DIR = "${base_dir}"
+OUTPUT_DIR = "${prefix}"
+SAMPLE_TYPE = "${params.format}"
+TILE_WIDTH = "${params.tile_width}"
+TILE_HEIGHT = "${params.tile_height}"
+N_WORKERS = "${task.cpus}"
+ARGS = "${args}"
 
 
 def prepare_bundle(bundle_dir):
@@ -110,22 +112,22 @@ def add_parquet_stats():
     print("=== End Debug ===\n")
 
 
-def run_segger_cli(args, extra):
+def run_segger_cli(output_dir, sample_type, tile_width, tile_height, n_workers, extra):
     cmd = [
         "python3",
         SEGGER_CLI,
         "--base_dir",
         "bundle_stats",
         "--data_dir",
-        args.output_dir,
+        output_dir,
         "--sample_type",
-        args.sample_type,
+        sample_type,
         "--tile_width",
-        str(args.tile_width),
+        str(tile_width),
         "--tile_height",
-        str(args.tile_height),
+        str(tile_height),
         "--n_workers",
-        str(args.n_workers),
+        str(n_workers),
         *extra,
     ]
     print(f"Running: {' '.join(cmd)}")
@@ -228,13 +230,9 @@ def fix_bd_x_nan(prefix):
 
 
 def main():
-    args, extra = parse_args()
+    extra = shlex.split(ARGS)
 
-    # Ensure numba cache dir is writable (env var should be set by caller, but belt-and-suspenders)
-    os.environ.setdefault("NUMBA_CACHE_DIR", os.path.join(os.getcwd(), ".numba_cache"))
-    os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
-
-    prepare_bundle(args.bundle_dir)
+    prepare_bundle(BUNDLE_DIR)
 
     print("Adding statistics to parquet files...")
     add_parquet_stats()
@@ -243,10 +241,17 @@ def main():
     for item in sorted(Path("bundle_stats").iterdir()):
         print(f"  {item.name}")
 
-    run_segger_cli(args, extra)
+    run_segger_cli(
+        OUTPUT_DIR,
+        SAMPLE_TYPE,
+        int(TILE_WIDTH),
+        int(TILE_HEIGHT),
+        int(N_WORKERS),
+        extra,
+    )
 
-    filter_trainable_tiles_if_needed(args.output_dir)
-    fix_bd_x_nan(args.output_dir)
+    filter_trainable_tiles_if_needed(OUTPUT_DIR)
+    fix_bd_x_nan(OUTPUT_DIR)
 
 
 if __name__ == "__main__":
diff --git a/modules/local/segger/predict/main.nf b/modules/local/segger/predict/main.nf
index 0da7a594..d8384c74 100644
--- a/modules/local/segger/predict/main.nf
+++ b/modules/local/segger/predict/main.nf
@@ -26,18 +26,8 @@ process SEGGER_PREDICT {
     def args = task.ext.args ?: ''
     prefix = task.ext.prefix ?: "${meta.id}"
 
-    """
-    run_predict.py \\
-        --models-dir ${models_dir} \\
-        --segger-data-dir ${segger_dataset} \\
-        --transcripts-file ${transcripts} \\
-        --benchmarks-dir benchmarks_dir \\
-        --batch-size ${params.batch_size_predict} \\
-        --use-cc ${params.cc_analysis} \\
-        --knn-method ${params.segger_knn_method} \\
-        --num-workers ${task.cpus} \\
-        ${args}
-    """
+
+    template 'run_predict.py'
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
diff --git a/modules/local/segger/predict/resources/usr/bin/run_predict.py b/modules/local/segger/predict/templates/run_predict.py
old mode 100755
new mode 100644
similarity index 83%
rename from modules/local/segger/predict/resources/usr/bin/run_predict.py
rename to modules/local/segger/predict/templates/run_predict.py
index 56a77ffc..ca2051a7
--- a/modules/local/segger/predict/resources/usr/bin/run_predict.py
+++ b/modules/local/segger/predict/templates/run_predict.py
@@ -12,12 +12,23 @@
 
 import argparse
 import os
+import shlex
 import subprocess
 import sys
 
 
 SEGGER_CLI = "/workspace/segger_dev/src/segger/cli/predict_fast.py"
 
+# Nextflow-injected variables
+MODELS_DIR = "${models_dir}"
+SEGGER_DATASET = "${segger_dataset}"
+TRANSCRIPTS = "${transcripts}"
+BATCH_SIZE = "${params.batch_size_predict}"
+USE_CC = "${params.cc_analysis}"
+KNN_METHOD = "${params.segger_knn_method}"
+NUM_WORKERS = "${task.cpus}"
+ARGS = "${args}"
+
 
 def parse_args():
     p = argparse.ArgumentParser()
@@ -113,6 +124,27 @@ def run_segger_cli(args, extra, gpu_ids):
 
 
 def main():
+    # Re-build argv from Nextflow-injected constants + any extra task.ext.args flags
+    sys.argv = [
+        sys.argv[0],
+        "--models-dir",
+        MODELS_DIR,
+        "--segger-data-dir",
+        SEGGER_DATASET,
+        "--transcripts-file",
+        TRANSCRIPTS,
+        "--benchmarks-dir",
+        "benchmarks_dir",
+        "--batch-size",
+        BATCH_SIZE,
+        "--use-cc",
+        USE_CC,
+        "--knn-method",
+        KNN_METHOD,
+        "--num-workers",
+        NUM_WORKERS,
+    ] + shlex.split(ARGS)
+
     args, extra = parse_args()
 
     # Limit cupy GPU memory to 80% so PyTorch has headroom for graph attention ops
diff --git a/modules/local/spatialdata/merge/main.nf b/modules/local/spatialdata/merge/main.nf
index 9a57f877..9ff4b112 100644
--- a/modules/local/spatialdata/merge/main.nf
+++ b/modules/local/spatialdata/merge/main.nf
@@ -25,13 +25,7 @@ process SPATIALDATA_MERGE {
 
     prefix = task.ext.prefix ?: "${meta.id}"
 
-    """
-    spatialdata_merge.py \\
-        --raw-bundle ${raw_bundle} \\
-        --redefined-bundle ${redefined_bundle} \\
-        --prefix ${prefix} \\
-        --output-folder ${outputfolder}
-    """
+    template 'spatialdata_merge.py'
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
diff --git a/modules/local/spatialdata/merge/resources/usr/bin/spatialdata_merge.py b/modules/local/spatialdata/merge/templates/spatialdata_merge.py
old mode 100755
new mode 100644
similarity index 55%
rename from modules/local/spatialdata/merge/resources/usr/bin/spatialdata_merge.py
rename to modules/local/spatialdata/merge/templates/spatialdata_merge.py
index 409d8c00..cd55e00c
--- a/modules/local/spatialdata/merge/resources/usr/bin/spatialdata_merge.py
+++ b/modules/local/spatialdata/merge/templates/spatialdata_merge.py
@@ -1,45 +1,36 @@
 #!/usr/bin/env python3
 """Merge two spatialdata bundles to create a layered spatialdata object."""
 
-import argparse
 import json
 import os
 import shutil
 
-import spatialdata
+import spatialdata  # noqa: F401 (kept so versions topic via `import spatialdata` is valid)
 
-
-def parse_args():
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(description="Merge two spatialdata bundles")
-    parser.add_argument("--raw-bundle", required=True, help="Path to raw spatialdata bundle")
-    parser.add_argument("--redefined-bundle", required=True, help="Path to redefined spatialdata bundle")
-    parser.add_argument("--prefix", required=True, help="Output prefix (sample ID)")
-    parser.add_argument("--output-folder", required=True, help="Output folder name")
-    return parser.parse_args()
+# Nextflow-injected variables
+RAW_BUNDLE = "${raw_bundle}"
+REDEFINED_BUNDLE = "${redefined_bundle}"
+PREFIX = "${prefix}"
+OUTPUT_FOLDER = "${outputfolder}"
 
 
 def main():
     """Run spatialdata merge."""
-    args = parse_args()
     print("[START]")
 
-    output_dir = f"spatialdata/{args.prefix}/{args.output_folder}"
+    output_dir = f"spatialdata/{PREFIX}/{OUTPUT_FOLDER}"
 
-    # Ensure the output folder exists
     if os.path.exists(output_dir):
         shutil.rmtree(output_dir)
     os.makedirs(output_dir)
 
-    # Copy the entire reference bundle as is
-    for root, _, files in os.walk(args.raw_bundle):
-        rel_path = os.path.relpath(root, args.raw_bundle)
+    for root, _, files in os.walk(RAW_BUNDLE):
+        rel_path = os.path.relpath(root, RAW_BUNDLE)
         target_path = os.path.join(output_dir, rel_path)
         os.makedirs(target_path, exist_ok=True)
         for file in files:
             shutil.copy(os.path.join(root, file), os.path.join(target_path, file))
 
-    # Rename folders in Points, Shapes, and Tables to raw_*
     for category in ["points", "shapes", "tables"]:
         category_path = os.path.join(output_dir, category)
         if os.path.exists(category_path):
@@ -49,9 +40,8 @@ def main():
             new_path = os.path.join(category_path, f"raw_{folder}")
             os.rename(old_path, new_path)
 
-    # Copy folders from redefined_bundle and rename them as redefined_*
     for category in ["points", "shapes", "tables"]:
-        add_category_path = os.path.join(args.redefined_bundle, category)
+        add_category_path = os.path.join(REDEFINED_BUNDLE, category)
         output_category_path = os.path.join(output_dir, category)
         os.makedirs(output_category_path, exist_ok=True)
 
@@ -62,17 +52,16 @@ def main():
             shutil.copytree(src_folder, dest_folder)
 
     # Invalidate consolidated metadata in zarr.json -- the directory renames above
-    # made the element paths in the metadata stale (e.g., 'points/transcripts' ->
-    # 'points/raw_transcripts'). Without consolidated metadata, sd.read_zarr()
-    # discovers elements by scanning the filesystem directly.
+    # made the element paths in the metadata stale. Without consolidated metadata,
+    # sd.read_zarr() discovers elements by scanning the filesystem directly.
     zarr_json = os.path.join(output_dir, "zarr.json")
     if os.path.exists(zarr_json):
         with open(zarr_json) as f:
-            meta = json.load(f)
-            if "consolidated_metadata" in meta:
-                del meta["consolidated_metadata"]
+            meta_obj = json.load(f)
+            if "consolidated_metadata" in meta_obj:
+                del meta_obj["consolidated_metadata"]
         with open(zarr_json, "w") as f:
-            json.dump(meta, f)
+            json.dump(meta_obj, f)
         print("[NOTE] Removed stale consolidated metadata from zarr.json")
 
     print("[FINISH]")
diff --git a/modules/local/spatialdata/meta/main.nf b/modules/local/spatialdata/meta/main.nf
index 40e2efae..c3665b1f 100644
--- a/modules/local/spatialdata/meta/main.nf
+++ b/modules/local/spatialdata/meta/main.nf
@@ -25,14 +25,7 @@ process SPATIALDATA_META {
 
     prefix = task.ext.prefix ?: "${meta.id}"
 
-    """
-    spatialdata_meta.py \\
-        --spatialdata-bundle ${spatialdata_bundle} \\
-        --xenium-bundle ${xenium_bundle} \\
-        --prefix ${prefix} \\
-        --metadata '${meta}' \\
-        --output-folder ${outputfolder}
-    """
+    template 'spatialdata_meta.py'
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
diff --git a/modules/local/spatialdata/meta/resources/usr/bin/spatialdata_meta.py b/modules/local/spatialdata/meta/templates/spatialdata_meta.py
old mode 100755
new mode 100644
similarity index 78%
rename from modules/local/spatialdata/meta/resources/usr/bin/spatialdata_meta.py
rename to modules/local/spatialdata/meta/templates/spatialdata_meta.py
index 935f39b2..e7f4d3d2
--- a/modules/local/spatialdata/meta/resources/usr/bin/spatialdata_meta.py
+++ b/modules/local/spatialdata/meta/templates/spatialdata_meta.py
@@ -1,13 +1,12 @@
 #!/usr/bin/env python3
 """Add metadata to SpatialData bundle."""
 
-import argparse
 import json
 import sys
 
 import pandas as pd
 import spatialdata as sd
-import zarr
+import zarr  # noqa: F401 (preserved from original; ensures zarr is loaded before zarr.core.group)
 
 # Fix zarr v3 + anndata + numcodecs incompatibility:
 # anndata's string writer passes numcodecs.VLenUTF8 to zarr.Group.create_array,
@@ -17,6 +16,13 @@
 import numcodecs
 import zarr.core.group as _zarr_group
 
+# Nextflow-injected variables
+SPATIALDATA_BUNDLE = "${spatialdata_bundle}"
+XENIUM_BUNDLE = "${xenium_bundle}"
+PREFIX = "${prefix}"
+METADATA = "${meta}"
+OUTPUT_FOLDER = "${outputfolder}"
+
 _orig_create_array = _zarr_group.Group.create_array
 
 
@@ -70,27 +76,15 @@ def convert_arrow_to_numpy(sdata):
         _convert_df_arrow_to_numpy(adata.var)
 
 
-def parse_args():
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(description="Add metadata to SpatialData bundle")
-    parser.add_argument("--spatialdata-bundle", required=True, help="Path to spatialdata bundle")
-    parser.add_argument("--xenium-bundle", required=True, help="Path to xenium bundle")
-    parser.add_argument("--prefix", required=True, help="Output prefix (sample ID)")
-    parser.add_argument("--metadata", required=True, help="Metadata string from Nextflow meta map")
-    parser.add_argument("--output-folder", required=True, help="Output folder name")
-    return parser.parse_args()
-
-
 def main():
     """Run spatialdata metadata addition."""
-    args = parse_args()
     print("[START]")
 
-    sdata = sd.read_zarr(args.spatialdata_bundle)
+    sdata = sd.read_zarr(SPATIALDATA_BUNDLE)
 
     # Convert metadata into dict
     print("[NOTE] Read in provenance ...")
-    metadata = args.metadata.strip("[]")  # Remove square brackets
+    metadata = METADATA.strip("[]")  # Remove square brackets
     pairs = metadata.split(", ")  # Split by comma and space
     metadata = {k: v for k, v in (pair.split(":") for pair in pairs)}  # Create dictionary
 
@@ -103,7 +97,7 @@ def main():
     # Add experimental metadata
     print("[NOTE] Read in experiment metadata ...")
     sdata['raw_table'].uns['experiment_xenium'] = ''
-    metadata_experiment = f'{args.xenium_bundle}/experiment.xenium'
+    metadata_experiment = f'{XENIUM_BUNDLE}/experiment.xenium'
     with open(metadata_experiment, "r") as f:
         metadata_experiment = json.load(f)
     sdata['raw_table'].uns['experiment_xenium'] = json.dumps(metadata_experiment)
@@ -111,13 +105,13 @@ def main():
 
     # Add gene panel metadata
     print("[NOTE] Read in gene panel metadata ...")
     sdata['raw_table'].uns['gene_panel'] = ''
-    metadata_gene_panel = f'{args.xenium_bundle}/gene_panel.json'
+    metadata_gene_panel = f'{XENIUM_BUNDLE}/gene_panel.json'
     with open(metadata_gene_panel, "r") as f:
         metadata_gene_panel = json.load(f)
     sdata['raw_table'].uns['gene_panel'] = json.dumps(metadata_gene_panel)
 
     convert_arrow_to_numpy(sdata)
-    sdata.write(f"spatialdata/{args.prefix}/{args.output_folder}", overwrite=True, consolidate_metadata=True, sdata_formats=None)
+    sdata.write(f"spatialdata/{PREFIX}/{OUTPUT_FOLDER}", overwrite=True, consolidate_metadata=True, sdata_formats=None)
 
     print("[FINISH]")
diff --git a/modules/local/spatialdata/write/main.nf b/modules/local/spatialdata/write/main.nf
index 43acc073..1144b7c9 100644
--- a/modules/local/spatialdata/write/main.nf
+++ b/modules/local/spatialdata/write/main.nf
@@ -27,15 +27,7 @@ process SPATIALDATA_WRITE {
 
     prefix = task.ext.prefix ?: "${meta.id}"
 
-    """
-    spatialdata_write.py \\
-        --bundle ${bundle} \\
-        --prefix ${prefix} \\
-        --output-folder ${outputfolder} \\
-        --segmented-object ${segmented_object} \\
-        --coordinate-space ${coordinate_space} \\
-        --format ${params.format}
-    """
+    template 'spatialdata_write.py'
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
diff --git a/modules/local/spatialdata/write/resources/usr/bin/spatialdata_write.py b/modules/local/spatialdata/write/templates/spatialdata_write.py
old mode 100755
new mode 100644
similarity index 78%
rename from modules/local/spatialdata/write/resources/usr/bin/spatialdata_write.py
rename to modules/local/spatialdata/write/templates/spatialdata_write.py
index 421e830f..9428dccd
--- a/modules/local/spatialdata/write/resources/usr/bin/spatialdata_write.py
+++ b/modules/local/spatialdata/write/templates/spatialdata_write.py
@@ -1,11 +1,10 @@
 #!/usr/bin/env python3
 """Write spatialdata object from segmentation format."""
 
-import argparse
 import sys
 
 import pandas as pd
-import spatialdata
+import spatialdata  # noqa: F401 (preserved from original; ensures spatialdata loads before spatialdata_io)
 from spatialdata_io import xenium
 
 # Fix zarr v3 + anndata + numcodecs incompatibility:
@@ -16,6 +15,14 @@
 import numcodecs
 import zarr.core.group as _zarr_group
 
+# Nextflow-injected variables
+BUNDLE = "${bundle}"
+PREFIX = "${prefix}"
+OUTPUT_FOLDER = "${outputfolder}"
+SEGMENTED_OBJECT = "${segmented_object}"
+COORDINATE_SPACE = "${coordinate_space}"
+FORMAT = "${params.format}"
+
 _orig_create_array = _zarr_group.Group.create_array
 
 
@@ -77,21 +84,8 @@ def convert_arrow_to_numpy(sdata):
         _convert_df_arrow_to_numpy(adata.var)
 
 
-def parse_args():
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(description="Write spatialdata object from segmentation format")
-    parser.add_argument("--bundle", required=True, help="Path to input bundle")
-    parser.add_argument("--prefix", required=True, help="Output prefix (sample ID)")
-    parser.add_argument("--output-folder", required=True, help="Output folder name")
-    parser.add_argument("--segmented-object", required=True, help="Segmented object type (cells, nuclei, cells_and_nuclei)")
-    parser.add_argument("--coordinate-space", required=True, help="Coordinate space (pixels, microns)")
-    parser.add_argument("--format", required=True, help="Input format (xenium)")
-    return parser.parse_args()
-
-
 def main():
     """Run spatialdata write."""
-    args = parse_args()
     print("[START]")
 
     cells_as_circles = False
@@ -100,13 +94,13 @@ def main():
     cells_labels = False
     nucleus_labels = False
 
-    if args.segmented_object == "cells":
+    if SEGMENTED_OBJECT == "cells":
         cells_boundaries = True
         cells_labels = True
-    elif args.segmented_object == "nuclei":
+    elif SEGMENTED_OBJECT == "nuclei":
         nucleus_boundaries = True
         nucleus_labels = True
-    elif args.segmented_object == "cells_and_nuclei":
+    elif SEGMENTED_OBJECT == "cells_and_nuclei":
         cells_boundaries = True
         nucleus_boundaries = True
         cells_labels = True
@@ -115,7 +109,7 @@ def main():
         cells_as_circles = False
 
     # set sd variables based on the coordinate space
-    if args.coordinate_space == "pixels":
+    if COORDINATE_SPACE == "pixels":
         cells_labels = True
         nucleus_labels = True
         # Labels are sufficient in pixel space; boundaries can contain
@@ -124,16 +118,16 @@ def main():
         cells_boundaries = False
         nucleus_boundaries = False
 
-    if args.coordinate_space == "microns":
+    if COORDINATE_SPACE == "microns":
         cells_labels = False
         cells_boundaries = True
         nucleus_boundaries = False
         nucleus_labels = False
         cells_as_circles = False
 
-    if args.format == "xenium":
+    if FORMAT == "xenium":
         sd_xenium_obj = xenium(
-            args.bundle,
+            BUNDLE,
             cells_as_circles=cells_as_circles,
             cells_boundaries=cells_boundaries,
             nucleus_boundaries=nucleus_boundaries,
@@ -145,7 +139,7 @@ def main():
         )
         print(sd_xenium_obj)
         convert_arrow_to_numpy(sd_xenium_obj)
-        sd_xenium_obj.write(f"spatialdata/{args.prefix}/{args.output_folder}")
+        sd_xenium_obj.write(f"spatialdata/{PREFIX}/{OUTPUT_FOLDER}")
     else:
         sys.exit("[ERROR] Format not found")
 
diff --git a/modules/local/utility/convert_mask_uint32/main.nf b/modules/local/utility/convert_mask_uint32/main.nf
index 3f0333a7..b80acfad 100644
--- a/modules/local/utility/convert_mask_uint32/main.nf
+++ b/modules/local/utility/convert_mask_uint32/main.nf
@@ -34,11 +34,8 @@ process CONVERT_MASK_UINT32 {
 
     script:
     prefix = task.ext.prefix ?: "${meta.id}"
-    """
-    convert_mask_uint32.py \\
-        --input ${mask} \\
-        --output ${prefix}_uint32_mask.tif
-    """
+
+    template 'convert_mask_uint32.py'
 
     stub:
     prefix = task.ext.prefix ?: "${meta.id}"
diff --git a/modules/local/utility/convert_mask_uint32/resources/usr/bin/convert_mask_uint32.py b/modules/local/utility/convert_mask_uint32/templates/convert_mask_uint32.py
old mode 100755
new mode 100644
similarity index 60%
rename from modules/local/utility/convert_mask_uint32/resources/usr/bin/convert_mask_uint32.py
rename to modules/local/utility/convert_mask_uint32/templates/convert_mask_uint32.py
index 955ad4b7..f0a6835d
--- a/modules/local/utility/convert_mask_uint32/resources/usr/bin/convert_mask_uint32.py
+++ b/modules/local/utility/convert_mask_uint32/templates/convert_mask_uint32.py
@@ -7,11 +7,13 @@
 the input mask, casts it to uint32, and writes the result.
""" -import argparse - import numpy as np import tifffile +# Nextflow-injected variables +INPUT_PATH = "${mask}" +OUTPUT_PATH = "${prefix}_uint32_mask.tif" + def convert_mask_to_uint32(input_path: str, output_path: str) -> None: """ @@ -27,20 +29,5 @@ def convert_mask_to_uint32(input_path: str, output_path: str) -> None: print("Output dtype: uint32") -def parse_args() -> argparse.Namespace: - """Parse command-line arguments.""" - parser = argparse.ArgumentParser( - description="Convert a segmentation mask TIFF to uint32 dtype." - ) - parser.add_argument( - "--input", required=True, help="Path to input mask TIFF" - ) - parser.add_argument( - "--output", required=True, help="Path where uint32 mask will be written" - ) - return parser.parse_args() - - if __name__ == "__main__": - args = parse_args() - convert_mask_to_uint32(input_path=args.input, output_path=args.output) + convert_mask_to_uint32(input_path=INPUT_PATH, output_path=OUTPUT_PATH) diff --git a/modules/local/utility/downscale_morphology/main.nf b/modules/local/utility/downscale_morphology/main.nf index edaf3d67..39620bd6 100644 --- a/modules/local/utility/downscale_morphology/main.nf +++ b/modules/local/utility/downscale_morphology/main.nf @@ -40,13 +40,8 @@ process DOWNSCALE_MORPHOLOGY { def diameter = task.ext.diameter ?: 9 def diam_mean = 30 prefix = task.ext.prefix ?: "${meta.id}" - """ - downscale_morphology.py \\ - --image ${image} \\ - --diameter ${diameter} \\ - --diam-mean ${diam_mean} \\ - --prefix ${prefix} - """ + + template 'downscale_morphology.py' stub: prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/utility/downscale_morphology/resources/usr/bin/downscale_morphology.py b/modules/local/utility/downscale_morphology/templates/downscale_morphology.py old mode 100755 new mode 100644 similarity index 79% rename from modules/local/utility/downscale_morphology/resources/usr/bin/downscale_morphology.py rename to modules/local/utility/downscale_morphology/templates/downscale_morphology.py index 8544ecf3..2f0f1e7d --- a/modules/local/utility/downscale_morphology/resources/usr/bin/downscale_morphology.py +++ b/modules/local/utility/downscale_morphology/templates/downscale_morphology.py @@ -13,13 +13,18 @@ {prefix}/scale_info.json - Scale factor and original/new dimensions. """ -import argparse import json from pathlib import Path import tifffile from skimage.transform import resize +# Nextflow-injected variables +IMAGE_PATH = "${image}" +DIAMETER = float("${diameter}") +DIAM_MEAN = float("${diam_mean}") +PREFIX = "${prefix}" + # Cellpose network requires a minimum spatial size of 256 px. MIN_DIM = 256 @@ -81,23 +86,10 @@ def downscale_image( print(f"Done: downscaled.tif written, shape={img_ds.shape}") -def parse_args() -> argparse.Namespace: - """Parse command-line arguments.""" - parser = argparse.ArgumentParser( - description="Pre-downscale a morphology image for Cellpose." 
-    )
-    parser.add_argument("--image", required=True, help="Morphology TIFF input")
-    parser.add_argument("--diameter", type=float, required=True, help="Target object diameter")
-    parser.add_argument("--diam-mean", type=float, required=True, help="Cellpose model diam_mean")
-    parser.add_argument("--prefix", required=True, help="Output directory")
-    return parser.parse_args()
-
-
 if __name__ == "__main__":
-    args = parse_args()
     downscale_image(
-        image_path=args.image,
-        diameter=args.diameter,
-        diam_mean=args.diam_mean,
-        prefix=args.prefix,
+        image_path=IMAGE_PATH,
+        diameter=DIAMETER,
+        diam_mean=DIAM_MEAN,
+        prefix=PREFIX,
     )
diff --git a/modules/local/utility/extract_dapi/main.nf b/modules/local/utility/extract_dapi/main.nf
index 79cce91f..206e6519 100644
--- a/modules/local/utility/extract_dapi/main.nf
+++ b/modules/local/utility/extract_dapi/main.nf
@@ -35,12 +35,8 @@ process EXTRACT_DAPI {
     script:
     prefix = task.ext.prefix ?: "${meta.id}"
    def channel_index = task.ext.channel_index ?: 0
-    """
-    extract_dapi.py \\
-        --input ${image} \\
-        --output ${prefix}_dapi.tif \\
-        --channel-index ${channel_index}
-    """
+
+    template 'extract_dapi.py'
 
     stub:
     prefix = task.ext.prefix ?: "${meta.id}"
diff --git a/modules/local/utility/extract_dapi/resources/usr/bin/extract_dapi.py b/modules/local/utility/extract_dapi/templates/extract_dapi.py
old mode 100755
new mode 100644
similarity index 60%
rename from modules/local/utility/extract_dapi/resources/usr/bin/extract_dapi.py
rename to modules/local/utility/extract_dapi/templates/extract_dapi.py
index 3d60f563..f0d9b9f2
--- a/modules/local/utility/extract_dapi/resources/usr/bin/extract_dapi.py
+++ b/modules/local/utility/extract_dapi/templates/extract_dapi.py
@@ -8,10 +8,13 @@
 the requested channel, and writes the result.
 """
 
-import argparse
-
 import tifffile
 
+# Nextflow-injected variables
+INPUT_PATH = "${image}"
+OUTPUT_PATH = "${prefix}_dapi.tif"
+CHANNEL_INDEX = int("${channel_index}")
+
 
 def extract_channel(input_path: str, output_path: str, channel_index: int) -> None:
     """
@@ -34,27 +37,9 @@ def extract_channel(input_path: str, output_path: str, channel_index: int) -> None:
     print(f"Input shape: {orig_shape} -> extracted channel {channel_index}: {img.shape}")
 
 
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Extract a single channel from a multi-channel OME-TIFF."
-    )
-    parser.add_argument(
-        "--input", required=True, help="Path to multi-channel OME-TIFF morphology image"
-    )
-    parser.add_argument(
-        "--output", required=True, help="Path where the single-channel TIFF will be written"
-    )
-    parser.add_argument(
-        "--channel-index", type=int, default=0, help="Channel index to extract (default: 0)"
-    )
-    return parser.parse_args()
-
-
 if __name__ == "__main__":
-    args = parse_args()
     extract_channel(
-        input_path=args.input,
-        output_path=args.output,
-        channel_index=args.channel_index,
+        input_path=INPUT_PATH,
+        output_path=OUTPUT_PATH,
+        channel_index=CHANNEL_INDEX,
     )
diff --git a/modules/local/utility/extract_preview_data/main.nf b/modules/local/utility/extract_preview_data/main.nf
index fb07df29..34d997fb 100644
--- a/modules/local/utility/extract_preview_data/main.nf
+++ b/modules/local/utility/extract_preview_data/main.nf
@@ -25,11 +25,7 @@ process EXTRACT_PREVIEW_DATA {
 
     prefix = task.ext.prefix ?: "${meta.id}"
 
-    """
-    extract_data.py \\
-        --preview-html ${preview_html} \\
-        --prefix ${prefix}
-    """
+    template 'extract_data.py'
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
diff --git a/modules/local/utility/extract_preview_data/resources/usr/bin/extract_data.py b/modules/local/utility/extract_preview_data/templates/extract_data.py
old mode 100755
new mode 100644
similarity index 90%
rename from modules/local/utility/extract_preview_data/resources/usr/bin/extract_data.py
rename to modules/local/utility/extract_preview_data/templates/extract_data.py
index 0ea737c2..d34fedaf
--- a/modules/local/utility/extract_preview_data/resources/usr/bin/extract_data.py
+++ b/modules/local/utility/extract_preview_data/templates/extract_data.py
@@ -6,7 +6,6 @@
 Baysor preview.html file, writing MultiQC-compatible TSV and PNG files.
 """
 
-import argparse
 import base64
 import html
 import json
@@ -18,6 +17,10 @@
 import pandas as pd
 from bs4 import BeautifulSoup
 
+# Nextflow-injected variables
+PREVIEW_HTML = "${preview_html}"
+PREFIX = "${prefix}"
+
 
 def get_png_files(soup: BeautifulSoup, outdir: Path) -> None:
     """Get png base64 images following specific h1 tags in preview.html"""
@@ -160,29 +163,9 @@ def write_tsvs(specs: Dict[str, str], outdir: Path) -> List[Path]:
     return written
 
 
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Extract preview data from Baysor preview HTML reports."
-    )
-    parser.add_argument(
-        "--preview-html",
-        required=True,
-        help="Path to Baysor preview HTML file",
-    )
-    parser.add_argument(
-        "--prefix",
-        required=True,
-        help="Output directory prefix (sample ID)",
-    )
-    return parser.parse_args()
-
-
 if __name__ == "__main__":
-    args = parse_args()
-
-    input_path: Path = Path(args.preview_html)
-    outdir: Path = Path(args.prefix)
+    input_path: Path = Path(PREVIEW_HTML)
+    outdir: Path = Path(PREFIX)
 
     text = input_path.read_text(encoding="utf-8", errors="ignore")
     soup = BeautifulSoup(text, "html.parser")
diff --git a/modules/local/utility/get_coordinates/main.nf b/modules/local/utility/get_coordinates/main.nf
index 3fdd7862..f6b70571 100644
--- a/modules/local/utility/get_coordinates/main.nf
+++ b/modules/local/utility/get_coordinates/main.nf
@@ -24,10 +24,7 @@ process GET_TRANSCRIPTS_COORDINATES {
 
     prefix = task.ext.prefix ?: "${meta.id}"
 
-    """
-    get_coordinates.py \\
-        --transcripts ${transcripts}
-    """
+    template 'get_coordinates.py'
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
diff --git a/modules/local/utility/get_coordinates/resources/usr/bin/get_coordinates.py b/modules/local/utility/get_coordinates/templates/get_coordinates.py
old mode 100755
new mode 100644
similarity index 71%
rename from modules/local/utility/get_coordinates/resources/usr/bin/get_coordinates.py
rename to modules/local/utility/get_coordinates/templates/get_coordinates.py
index 8770f9a2..05c01567
--- a/modules/local/utility/get_coordinates/resources/usr/bin/get_coordinates.py
+++ b/modules/local/utility/get_coordinates/templates/get_coordinates.py
@@ -6,10 +6,11 @@
 bounding box (x_min, x_max, y_min, y_max) to stdout.
 """
 
-import argparse
-
 import pandas as pd
 
+# Nextflow-injected variables
+TRANSCRIPTS = "${transcripts}"
+
 
 def get_coordinates(parquet_path: str):
     """
@@ -41,20 +42,6 @@ def get_coordinates(parquet_path: str):
     )
 
 
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Get transcript coordinate bounds from a Parquet file."
-    )
-    parser.add_argument(
-        "--transcripts",
-        required=True,
-        help="Path to transcripts parquet file",
-    )
-    return parser.parse_args()
-
-
 if __name__ == "__main__":
-    args = parse_args()
-    result = get_coordinates(args.transcripts)
+    result = get_coordinates(TRANSCRIPTS)
     print(",".join(str(v) for v in result))
diff --git a/modules/local/utility/parquet_to_csv/main.nf b/modules/local/utility/parquet_to_csv/main.nf
index 033ed00a..6df3da37 100644
--- a/modules/local/utility/parquet_to_csv/main.nf
+++ b/modules/local/utility/parquet_to_csv/main.nf
@@ -24,12 +24,7 @@ process PARQUET_TO_CSV {
     }
     prefix = task.ext.prefix ?: "${meta.id}"
 
-    """
-    parquet_to_csv.py \\
-        --transcripts ${transcripts} \\
-        --extension ${extension} \\
-        --prefix ${prefix}
-    """
+    template 'parquet_to_csv.py'
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
diff --git a/modules/local/utility/parquet_to_csv/resources/usr/bin/parquet_to_csv.py b/modules/local/utility/parquet_to_csv/templates/parquet_to_csv.py
old mode 100755
new mode 100644
similarity index 57%
rename from modules/local/utility/parquet_to_csv/resources/usr/bin/parquet_to_csv.py
rename to modules/local/utility/parquet_to_csv/templates/parquet_to_csv.py
index bfa19c40..cb1d8107
--- a/modules/local/utility/parquet_to_csv/resources/usr/bin/parquet_to_csv.py
+++ b/modules/local/utility/parquet_to_csv/templates/parquet_to_csv.py
@@ -5,11 +5,15 @@
 Reads a Parquet file and writes it as CSV, optionally gzip-compressed.
 """
 
-import argparse
 from pathlib import Path
 
 import pandas as pd
 
+# Nextflow-injected variables
+TRANSCRIPTS = "${transcripts}"
+EXTENSION = "${extension}"
+PREFIX = "${prefix}"
+
 
 def convert_parquet(
     transcripts: str,
@@ -38,33 +42,9 @@ def convert_parquet(
     return None
 
 
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Convert a Parquet file to CSV format."
-    )
-    parser.add_argument(
-        "--transcripts",
-        required=True,
-        help="Input parquet filename",
-    )
-    parser.add_argument(
-        "--extension",
-        default=".csv",
-        help="Output extension: '.csv' or '.gz' (default: .csv)",
-    )
-    parser.add_argument(
-        "--prefix",
-        required=True,
-        help="Output directory prefix (sample ID)",
-    )
-    return parser.parse_args()
-
-
 if __name__ == "__main__":
-    args = parse_args()
     convert_parquet(
-        transcripts=args.transcripts,
-        extension=args.extension,
-        prefix=args.prefix,
+        transcripts=TRANSCRIPTS,
+        extension=EXTENSION,
+        prefix=PREFIX,
     )
diff --git a/modules/local/utility/resize_tif/main.nf b/modules/local/utility/resize_tif/main.nf
index 6877af27..965d652e 100644
--- a/modules/local/utility/resize_tif/main.nf
+++ b/modules/local/utility/resize_tif/main.nf
@@ -25,14 +25,7 @@ process RESIZE_TIF {
 
     prefix = task.ext.prefix ?: "${meta.id}"
 
-    """
-    resize_tif.py \\
-        --mask ${mask} \\
-        --transcripts ${transcripts} \\
-        --metadata ${metadata} \\
-        --prefix ${prefix} \\
-        --mask-filename ${mask}
-    """
+    template 'resize_tif.py'
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
diff --git a/modules/local/utility/resize_tif/resources/usr/bin/resize_tif.py b/modules/local/utility/resize_tif/templates/resize_tif.py
old mode 100755
new mode 100644
similarity index 80%
rename from modules/local/utility/resize_tif/resources/usr/bin/resize_tif.py
rename to modules/local/utility/resize_tif/templates/resize_tif.py
index 6cca640d..5674a094
--- a/modules/local/utility/resize_tif/resources/usr/bin/resize_tif.py
+++ b/modules/local/utility/resize_tif/templates/resize_tif.py
@@ -6,7 +6,6 @@
 space of Xenium transcript data using microns-per-pixel metadata.
 """
 
-import argparse
 import json
 import os
 from typing import Tuple
@@ -16,6 +15,13 @@
 import tifffile
 from skimage.transform import resize
 
+# Nextflow-injected variables
+MASK = "${mask}"
+TRANSCRIPTS = "${transcripts}"
+METADATA = "${metadata}"
+PREFIX = "${prefix}"
+MASK_FILENAME = "${mask}"
+
 
 def read_mask(mask_path: str) -> np.ndarray:
     """Read the segmentation mask from a TIFF file."""
@@ -107,28 +113,13 @@ def main(mask_path: str, transcripts_path: str, metadata_path: str, output_path:
     print(f"Saved resized mask -> {output_path}")
 
 
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Resize a segmentation TIFF mask to match transcript coordinates."
-    )
-    parser.add_argument("--mask", required=True, help="Path to segmentation mask TIFF")
-    parser.add_argument("--transcripts", required=True, help="Path to transcripts file")
-    parser.add_argument("--metadata", required=True, help="Path to metadata JSON")
-    parser.add_argument("--prefix", required=True, help="Output directory prefix")
-    parser.add_argument("--mask-filename", required=True, help="Original mask filename for output naming")
-    return parser.parse_args()
-
-
 if __name__ == "__main__":
-    args = parse_args()
-
-    os.makedirs(args.prefix, exist_ok=True)
-    output_mask: str = os.path.join(args.prefix, f"resized_{args.mask_filename}.tif")
+    os.makedirs(PREFIX, exist_ok=True)
+    output_mask: str = os.path.join(PREFIX, f"resized_{MASK_FILENAME}.tif")
 
     main(
-        mask_path=args.mask,
-        transcripts_path=args.transcripts,
-        metadata_path=args.metadata,
+        mask_path=MASK,
+        transcripts_path=TRANSCRIPTS,
+        metadata_path=METADATA,
         output_path=output_mask,
     )
diff --git a/modules/local/utility/segger2xr/main.nf b/modules/local/utility/segger2xr/main.nf
index 1964469a..b562eed4 100644
--- a/modules/local/utility/segger2xr/main.nf
+++ b/modules/local/utility/segger2xr/main.nf
@@ -24,14 +24,9 @@ process SEGGER2XR {
         error "SEGGER2XR module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
 
-    def min_transcripts = task.ext.min_transcripts_per_cell ?: 3
+    min_transcripts = task.ext.min_transcripts_per_cell ?: 3
 
-    """
-    segger2xr.py \\
-        --transcripts ${transcripts} \\
-        --prefix ${meta.id} \\
-        --min-transcripts ${min_transcripts}
-    """
+    template 'segger2xr.py'
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
diff --git a/modules/local/utility/segger2xr/resources/usr/bin/segger2xr.py b/modules/local/utility/segger2xr/templates/segger2xr.py
old mode 100755
new mode 100644
similarity index 90%
rename from modules/local/utility/segger2xr/resources/usr/bin/segger2xr.py
rename to modules/local/utility/segger2xr/templates/segger2xr.py
index 22889e82..96f2d8df
--- a/modules/local/utility/segger2xr/resources/usr/bin/segger2xr.py
+++ b/modules/local/utility/segger2xr/templates/segger2xr.py
@@ -7,7 +7,6 @@
 and GeoJSON cell boundary polygons for xeniumranger import-segmentation.
 """
 
-import argparse
 import json
 from pathlib import Path
 from typing import List
@@ -15,6 +14,11 @@
 import pandas as pd
 from scipy.spatial import ConvexHull
 
+# Nextflow-injected variables
+TRANSCRIPTS = "${transcripts}"
+PREFIX = "${meta.id}"
+MIN_TRANSCRIPTS = int("${min_transcripts}")
+
 # Expected columns in transcripts.parquet
 REQUIRED_COLUMNS: List[str] = [
     "transcript_id",
@@ -214,34 +218,9 @@ def main(input_file: str, prefix: str, min_transcripts: int = 3) -> None:
     generate_viz_polygons(transcripts, f"{prefix}/segmentation_polygons.json", cell_map)
 
 
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Convert Segger prediction output to XeniumRanger-compatible format."
-    )
-    parser.add_argument(
-        "--transcripts",
-        required=True,
-        help="Path to Segger output transcripts parquet file",
-    )
-    parser.add_argument(
-        "--prefix",
-        required=True,
-        help="Output directory prefix (sample ID)",
-    )
-    parser.add_argument(
-        "--min-transcripts",
-        type=int,
-        default=3,
-        help="Minimum transcripts per cell (default: 3)",
-    )
-    return parser.parse_args()
-
-
 if __name__ == "__main__":
-    args = parse_args()
     main(
-        input_file=args.transcripts,
-        prefix=args.prefix,
-        min_transcripts=args.min_transcripts,
+        input_file=TRANSCRIPTS,
+        prefix=PREFIX,
+        min_transcripts=MIN_TRANSCRIPTS,
     )
diff --git a/modules/local/utility/split_transcripts/main.nf b/modules/local/utility/split_transcripts/main.nf
index 5cfa0b65..3cc54042 100644
--- a/modules/local/utility/split_transcripts/main.nf
+++ b/modules/local/utility/split_transcripts/main.nf
@@ -23,15 +23,9 @@ process SPLIT_TRANSCRIPTS {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         error "SPLIT_TRANSCRIPTS module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
-    def prefix = task.ext.prefix ?: "${meta.id}"
+    prefix = task.ext.prefix ?: "${meta.id}"
 
-    """
-    split_transcripts.py \\
-        --transcripts ${transcripts} \\
-        --x-bins ${x_bins} \\
-        --y-bins ${y_bins} \\
-        --prefix ${prefix}
-    """
+    template 'split_transcripts.py'
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
diff --git a/modules/local/utility/split_transcripts/resources/usr/bin/split_transcripts.py b/modules/local/utility/split_transcripts/templates/split_transcripts.py
old mode 100755
new mode 100644
similarity index 68%
rename from modules/local/utility/split_transcripts/resources/usr/bin/split_transcripts.py
rename to modules/local/utility/split_transcripts/templates/split_transcripts.py
index 275fbab1..1fa6db15
--- a/modules/local/utility/split_transcripts/resources/usr/bin/split_transcripts.py
+++ b/modules/local/utility/split_transcripts/templates/split_transcripts.py
@@ -6,12 +6,17 @@
 tiles, writing a splits.csv with tile boundaries.
 """
 
-import argparse
 import os
 from typing import List
 
 import pandas as pd
 
+# Nextflow-injected variables
+TRANSCRIPTS = "${transcripts}"
+X_BINS = "${x_bins}"
+Y_BINS = "${y_bins}"
+PREFIX = "${prefix}"
+
 
 def compute_quantile_ranges(df: pd.DataFrame, col: str, n_bins: int) -> List:
     """
@@ -69,41 +74,10 @@ def main(
     return None
 
 
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Split transcript coordinates into spatial tiles."
-    )
-    parser.add_argument(
-        "--transcripts",
-        required=True,
-        help="Path to transcripts parquet file",
-    )
-    parser.add_argument(
-        "--x-bins",
-        type=int,
-        required=True,
-        help="Number of bins along X axis",
-    )
-    parser.add_argument(
-        "--y-bins",
-        type=int,
-        required=True,
-        help="Number of bins along Y axis",
-    )
-    parser.add_argument(
-        "--prefix",
-        required=True,
-        help="Output directory prefix",
-    )
-    return parser.parse_args()
-
-
 if __name__ == "__main__":
-    args = parse_args()
     main(
-        transcripts=args.transcripts,
-        x_bins=args.x_bins,
-        y_bins=args.y_bins,
-        prefix=args.prefix,
+        transcripts=TRANSCRIPTS,
+        x_bins=int(X_BINS),
+        y_bins=int(Y_BINS),
+        prefix=PREFIX,
     )
diff --git a/modules/local/utility/upscale_mask/main.nf b/modules/local/utility/upscale_mask/main.nf
index a201abf1..41b3ff91 100644
--- a/modules/local/utility/upscale_mask/main.nf
+++ b/modules/local/utility/upscale_mask/main.nf
@@ -34,12 +34,8 @@ process UPSCALE_MASK {
 
     script:
     prefix = task.ext.prefix ?: "${meta.id}"
-    """
-    upscale_mask.py \\
-        --mask ${mask} \\
-        --scale-info ${scale_info} \\
-        --prefix ${prefix}
-    """
+
+    template 'upscale_mask.py'
 
     stub:
     prefix = task.ext.prefix ?: "${meta.id}"
diff --git a/modules/local/utility/upscale_mask/resources/usr/bin/upscale_mask.py b/modules/local/utility/upscale_mask/templates/upscale_mask.py
old mode 100755
new mode 100644
similarity index 71%
rename from modules/local/utility/upscale_mask/resources/usr/bin/upscale_mask.py
rename to modules/local/utility/upscale_mask/templates/upscale_mask.py
index 6cc1694e..9c0ef0bb
--- a/modules/local/utility/upscale_mask/resources/usr/bin/upscale_mask.py
+++ b/modules/local/utility/upscale_mask/templates/upscale_mask.py
@@ -9,7 +9,6 @@
 Output: {prefix}/upscaled_{mask_basename}.tif
 """
 
-import argparse
 import json
 from pathlib import Path
 
@@ -17,6 +16,11 @@
 import tifffile
 from PIL import Image
 
+# Nextflow-injected variables
+MASK = "${mask}"
+SCALE_INFO = "${scale_info}"
+PREFIX = "${prefix}"
+
 
 def upscale_mask(mask_path: str, scale_info_path: str, prefix: str) -> None:
     """
@@ -52,21 +56,9 @@ def upscale_mask(mask_path: str, scale_info_path: str, prefix: str) -> None:
     )
 
 
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Upscale a Cellpose mask back to original resolution."
-    )
-    parser.add_argument("--mask", required=True, help="Downscaled mask TIFF")
-    parser.add_argument("--scale-info", required=True, help="scale_info.json from downscale step")
-    parser.add_argument("--prefix", required=True, help="Output directory")
-    return parser.parse_args()
-
-
 if __name__ == "__main__":
-    args = parse_args()
     upscale_mask(
-        mask_path=args.mask,
-        scale_info_path=args.scale_info,
-        prefix=args.prefix,
+        mask_path=MASK,
+        scale_info_path=SCALE_INFO,
+        prefix=PREFIX,
     )
diff --git a/modules/local/xenium_patch/stitch/main.nf b/modules/local/xenium_patch/stitch/main.nf
index d805a0f5..118e77a7 100644
--- a/modules/local/xenium_patch/stitch/main.nf
+++ b/modules/local/xenium_patch/stitch/main.nf
@@ -34,18 +34,8 @@ process XENIUM_PATCH_STITCH {
 
     script:
     def args = task.ext.args ?: ''
 
-    """
-    stitch_transcripts.py \\
-        --patches ${patches} \\
-        --output output \\
-        ${args}
-    # Post-process: ensure all GeoJSON geometries are Polygon and
-    # reconcile dropped cells in the transcript CSV.
- stitch_postprocess.py \\ - --geojson output/xr-cell-polygons.geojson \\ - --csv output/xr-transcript-metadata.csv - """ + template 'stitch.py' stub: """ diff --git a/modules/local/xenium_patch/stitch/resources/usr/bin/stitch_postprocess.py b/modules/local/xenium_patch/stitch/resources/usr/bin/stitch_postprocess.py deleted file mode 100755 index 7144b1ac..00000000 --- a/modules/local/xenium_patch/stitch/resources/usr/bin/stitch_postprocess.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python3 -""" -Post-process stitched per-patch segmentation outputs. - -Ensures every GeoJSON feature is a single Polygon: make_valid() and -sopa.solve_conflicts() can produce MultiPolygon, MultiLineString, or -GeometryCollection geometries that XeniumRanger rejects. Cells dropped -during cleanup are also reassigned to UNASSIGNED in the transcript CSV -so the two outputs stay consistent. -""" - -import argparse -import csv -import json - -import shapely -from shapely.geometry import mapping, shape - - -def clean_geojson(geojson_path: str) -> set: - """ - Force every feature to a single valid Polygon. - - Returns the set of cell ids whose features were dropped. - """ - with open(geojson_path) as f: - data = json.load(f) - - clean = [] - dropped_cells = set() - for feat in data["features"]: - geom = shape(feat["geometry"]) - if not geom.is_valid: - geom = shapely.make_valid(geom) - poly = None - if geom.geom_type == "Polygon": - poly = geom - elif geom.geom_type == "MultiPolygon": - poly = max(geom.geoms, key=lambda g: g.area) - elif geom.geom_type == "GeometryCollection": - polys = [g for g in geom.geoms if g.geom_type == "Polygon"] - if polys: - poly = max(polys, key=lambda g: g.area) - if poly is not None and not poly.is_empty: - feat["geometry"] = mapping(poly) - clean.append(feat) - else: - cell_id = feat.get("id") or feat.get("properties", {}).get("cell_id", "") - dropped_cells.add(str(cell_id)) - - print(f"GeoJSON: {len(clean)} kept, {len(dropped_cells)} dropped: {dropped_cells}") - data["features"] = clean - with open(geojson_path, "w") as f: - json.dump(data, f) - - return dropped_cells - - -def reassign_dropped(csv_path: str, dropped_cells: set) -> None: - """ - Reassign transcripts of dropped cells to UNASSIGNED in the CSV. - """ - if not dropped_cells: - return - - with open(csv_path) as f: - reader = csv.DictReader(f) - fieldnames = reader.fieldnames - rows = list(reader) - - reassigned = 0 - for row in rows: - if row["cell"] in dropped_cells: - row["cell"] = "" - row["is_noise"] = "1" - reassigned += 1 - - with open(csv_path, "w", newline="") as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(rows) - print(f"CSV: {reassigned} transcripts reassigned to UNASSIGNED") - - -def parse_args() -> argparse.Namespace: - """Parse command-line arguments.""" - parser = argparse.ArgumentParser( - description="Clean stitched GeoJSON polygons and reconcile transcript CSV." 
- ) - parser.add_argument("--geojson", required=True, help="Path to xr-cell-polygons.geojson") - parser.add_argument("--csv", required=True, help="Path to xr-transcript-metadata.csv") - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - dropped = clean_geojson(args.geojson) - reassign_dropped(args.csv, dropped) diff --git a/modules/local/xenium_patch/stitch/resources/usr/bin/stitch_transcripts.py b/modules/local/xenium_patch/stitch/templates/stitch.py old mode 100755 new mode 100644 similarity index 85% rename from modules/local/xenium_patch/stitch/resources/usr/bin/stitch_transcripts.py rename to modules/local/xenium_patch/stitch/templates/stitch.py index d9fb8d41..2306a7dc --- a/modules/local/xenium_patch/stitch/resources/usr/bin/stitch_transcripts.py +++ b/modules/local/xenium_patch/stitch/templates/stitch.py @@ -1,15 +1,25 @@ #!/usr/bin/env python3 -"""Stitch per-patch Baysor segmentation results into unified output. - -Standalone script that replaces the xenium_patch CLI package's stitch -functionality. Uses sopa's solve_conflicts() for overlap resolution. +"""Stitch per-patch segmentation results, then post-process the GeoJSON + CSV. + +Phase 1 (stitch_transcripts): + Stitch per-patch Baysor segmentation results into unified output. Uses + sopa's solve_conflicts() for overlap resolution at patch boundaries. + +Phase 2 (postprocess): + Ensures every GeoJSON feature is a single Polygon: make_valid() and + sopa.solve_conflicts() can produce MultiPolygon, MultiLineString, or + GeometryCollection geometries that XeniumRanger rejects. Cells dropped + during cleanup are also reassigned to UNASSIGNED in the transcript CSV + so the two outputs stay consistent. """ from __future__ import annotations import argparse +import csv import json import os +import shlex from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path @@ -24,6 +34,11 @@ from shapely.geometry import mapping, shape from sopa.segmentation.resolve import solve_conflicts +# Nextflow-injected variables +PATCHES = "${patches}" +ARGS = "${args}" # task.ext.args, may be empty + + # --------------------------------------------------------------------------- # Geometry helpers # --------------------------------------------------------------------------- @@ -44,7 +59,7 @@ def _ensure_polygon(geom) -> "shapely.Polygon | None": if geom.geom_type == "GeometryCollection": polys = [g for g in geom.geoms if g.geom_type == "Polygon"] return max(polys, key=lambda g: g.area) if polys else None - # LineString, MultiLineString, Point, etc. — not a polygon + # LineString, MultiLineString, Point, etc. -- not a polygon return None @@ -630,7 +645,7 @@ def _stitch_sopa_resolve( # --------------------------------------------------------------------------- -# Main orchestrator +# Main orchestrator (stitch phase) # --------------------------------------------------------------------------- @@ -764,11 +779,17 @@ def stitch_transcript_assignments( # --------------------------------------------------------------------------- -# CLI +# Phase 1 entry point: stitch_transcripts (replaces stitch_transcripts.py main) # --------------------------------------------------------------------------- -def main() -> None: +def stitch_transcripts(patches_dir: str, output_dir: str, args_str: str) -> None: + """Phase 1: parse extra CLI args and run the stitch orchestrator. 
+ + Mirrors the original ``stitch_transcripts.py`` argparse interface so that + any flags passed through ``task.ext.args`` are honored by the same + parser as before. + """ parser = argparse.ArgumentParser( description="Stitch per-patch Baysor segmentation results into unified output." ) @@ -794,13 +815,106 @@ def main() -> None: default="segmentation_polygons.json", help="GeoJSON filename within each patch (default: segmentation_polygons.json)", ) - args = parser.parse_args() + + argv = ["--patches", patches_dir, "--output", output_dir] + if args_str: + argv.extend(shlex.split(args_str)) + parsed = parser.parse_args(argv) stitch_transcript_assignments( - patches_dir=args.patches, - output_dir=args.output, - csv_filename=args.csv_filename, - geojson_filename=args.geojson_filename, + patches_dir=parsed.patches, + output_dir=parsed.output, + csv_filename=parsed.csv_filename, + geojson_filename=parsed.geojson_filename, + ) + + +# --------------------------------------------------------------------------- +# Phase 2: post-processing (from stitch_postprocess.py) +# --------------------------------------------------------------------------- + + +def clean_geojson(geojson_path: str) -> set: + """ + Force every feature to a single valid Polygon. + + Returns the set of cell ids whose features were dropped. + """ + with open(geojson_path) as f: + data = json.load(f) + + clean = [] + dropped_cells = set() + for feat in data["features"]: + geom = shape(feat["geometry"]) + if not geom.is_valid: + geom = shapely.make_valid(geom) + poly = None + if geom.geom_type == "Polygon": + poly = geom + elif geom.geom_type == "MultiPolygon": + poly = max(geom.geoms, key=lambda g: g.area) + elif geom.geom_type == "GeometryCollection": + polys = [g for g in geom.geoms if g.geom_type == "Polygon"] + if polys: + poly = max(polys, key=lambda g: g.area) + if poly is not None and not poly.is_empty: + feat["geometry"] = mapping(poly) + clean.append(feat) + else: + cell_id = feat.get("id") or feat.get("properties", {}).get("cell_id", "") + dropped_cells.add(str(cell_id)) + + print(f"GeoJSON: {len(clean)} kept, {len(dropped_cells)} dropped: {dropped_cells}") + data["features"] = clean + with open(geojson_path, "w") as f: + json.dump(data, f) + + return dropped_cells + + +def reassign_dropped(csv_path: str, dropped_cells: set) -> None: + """ + Reassign transcripts of dropped cells to UNASSIGNED in the CSV. 
+ """ + if not dropped_cells: + return + + with open(csv_path) as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames + rows = list(reader) + + reassigned = 0 + for row in rows: + if row["cell"] in dropped_cells: + row["cell"] = "" + row["is_noise"] = "1" + reassigned += 1 + + with open(csv_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + print(f"CSV: {reassigned} transcripts reassigned to UNASSIGNED") + + +def postprocess(geojson_path: str, csv_path: str) -> None: + """Phase 2 entry point: cleanup polygons and reconcile the CSV.""" + dropped = clean_geojson(geojson_path) + reassign_dropped(csv_path, dropped) + + +# --------------------------------------------------------------------------- +# Main: run both phases sequentially +# --------------------------------------------------------------------------- + + +def main() -> None: + stitch_transcripts(PATCHES, "output", ARGS) + postprocess( + "output/xr-cell-polygons.geojson", + "output/xr-transcript-metadata.csv", ) From dbd7a35df4b842a3e727a1cb0dca82c7582a61a6 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Tue, 5 May 2026 16:43:39 +0000 Subject: [PATCH 2/3] fix(modules): switch from `template` directive to shell-call invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI on PR #154 surfaced this hard Nextflow constraint: Process output of type 'eval' is only allowed with Bash process scripts -- Current interpreter: /usr/bin/env python3 The `template '.py'` directive sets the process interpreter to Python from the shebang, but every module emits a version string via an `eval('python3 -c ...')` topic channel — and `eval(...)` outputs only work when the process script body is Bash. All 19 modules failed identically. Fix: keep the Python files in `modules/local//templates/` (satisfying the no-module-level-bin requirement from PR #139's review) but invoke them from a shell `script:` block via `python3 \${moduleDir}/templates/.py --flag value`. The process stays Bash, `eval()` works, and the original argparse-based scripts are restored verbatim — no constants conversion, no script merging, no moved env preludes. For xenium_patch/stitch this means restoring the two original scripts (stitch_transcripts.py + stitch_postprocess.py) instead of the merged stitch.py. For segger/create_dataset, the NUMBA_CACHE_DIR shell prelude is restored to its original location in the .nf script: block. Net result vs. the previous attempt: same nf-core compliance (no module-level bin/), but invasive Python rewrites are reverted. The original behavior is preserved exactly. 
--- modules/local/baysor/create_dataset/main.nf | 7 +- .../templates/create_dataset.py | 29 +++- modules/local/baysor/preprocess/main.nf | 11 +- .../templates/preprocess_transcripts.py | 63 ++++++-- modules/local/ficture/preprocess/main.nf | 9 +- .../templates/ficture_preprocess.py | 36 ++--- modules/local/segger/create_dataset/main.nf | 14 +- .../templates/run_create_dataset.py | 61 ++++---- modules/local/segger/predict/main.nf | 14 +- .../segger/predict/templates/run_predict.py | 32 ---- modules/local/spatialdata/merge/main.nf | 8 +- .../merge/templates/spatialdata_merge.py | 43 ++++-- modules/local/spatialdata/meta/main.nf | 9 +- .../meta/templates/spatialdata_meta.py | 32 ++-- modules/local/spatialdata/write/main.nf | 10 +- .../write/templates/spatialdata_write.py | 40 ++--- .../local/utility/convert_mask_uint32/main.nf | 7 +- .../templates/convert_mask_uint32.py | 23 ++- .../utility/downscale_morphology/main.nf | 9 +- .../templates/downscale_morphology.py | 28 ++-- modules/local/utility/extract_dapi/main.nf | 8 +- .../extract_dapi/templates/extract_dapi.py | 31 +++- .../utility/extract_preview_data/main.nf | 6 +- .../templates/extract_data.py | 29 +++- modules/local/utility/get_coordinates/main.nf | 5 +- .../templates/get_coordinates.py | 21 ++- modules/local/utility/parquet_to_csv/main.nf | 7 +- .../templates/parquet_to_csv.py | 36 ++++- modules/local/utility/resize_tif/main.nf | 9 +- .../resize_tif/templates/resize_tif.py | 33 +++-- modules/local/utility/segger2xr/main.nf | 9 +- .../utility/segger2xr/templates/segger2xr.py | 37 ++++- .../local/utility/split_transcripts/main.nf | 10 +- .../templates/split_transcripts.py | 46 ++++-- modules/local/utility/upscale_mask/main.nf | 8 +- .../upscale_mask/templates/upscale_mask.py | 24 ++- modules/local/xenium_patch/stitch/main.nf | 12 +- .../stitch/templates/stitch_postprocess.py | 98 ++++++++++++ .../{stitch.py => stitch_transcripts.py} | 140 ++---------------- 39 files changed, 661 insertions(+), 393 deletions(-) create mode 100644 modules/local/xenium_patch/stitch/templates/stitch_postprocess.py rename modules/local/xenium_patch/stitch/templates/{stitch.py => stitch_transcripts.py} (85%) diff --git a/modules/local/baysor/create_dataset/main.nf b/modules/local/baysor/create_dataset/main.nf index 91aa67a0..b05350fa 100644 --- a/modules/local/baysor/create_dataset/main.nf +++ b/modules/local/baysor/create_dataset/main.nf @@ -23,7 +23,12 @@ process BAYSOR_CREATE_DATASET { prefix = task.ext.prefix ?: "${meta.id}" - template 'create_dataset.py' + """ + python3 ${moduleDir}/templates/create_dataset.py \\ + --transcripts ${transcripts} \\ + --sample-fraction ${sample_fraction} \\ + --prefix ${prefix} + """ stub: // Exit if running this module with -profile conda / -profile mamba diff --git a/modules/local/baysor/create_dataset/templates/create_dataset.py b/modules/local/baysor/create_dataset/templates/create_dataset.py index bbf05320..4e5a263a 100644 --- a/modules/local/baysor/create_dataset/templates/create_dataset.py +++ b/modules/local/baysor/create_dataset/templates/create_dataset.py @@ -6,16 +6,12 @@ writing the result to a new CSV file. 
""" +import argparse import csv import os import random from pathlib import Path -# Nextflow-injected variables -TRANSCRIPTS = "${transcripts}" -SAMPLE_FRACTION = "${sample_fraction}" -PREFIX = "${prefix}" - class BaysorPreview(): """ @@ -66,14 +62,31 @@ def main() -> None: """ Run create dataset as nf module """ + parser = argparse.ArgumentParser( + description="Create sampled dataset for Baysor preview" + ) + parser.add_argument( + "--transcripts", required=True, + help="Path to transcripts CSV file" + ) + parser.add_argument( + "--sample-fraction", required=True, type=float, + help="Fraction of rows to sample" + ) + parser.add_argument( + "--prefix", required=True, + help="Output directory prefix" + ) + args = parser.parse_args() + sampled_transcripts = "sampled_transcripts.csv" # generate dataset BaysorPreview.generate_dataset( - transcripts=TRANSCRIPTS, + transcripts=args.transcripts, sampled_transcripts=sampled_transcripts, - sample_fraction=float(SAMPLE_FRACTION), - prefix=PREFIX, + sample_fraction=args.sample_fraction, + prefix=args.prefix ) return None diff --git a/modules/local/baysor/preprocess/main.nf b/modules/local/baysor/preprocess/main.nf index b11479d1..1a5a42fe 100644 --- a/modules/local/baysor/preprocess/main.nf +++ b/modules/local/baysor/preprocess/main.nf @@ -29,7 +29,16 @@ process BAYSOR_PREPROCESS_TRANSCRIPTS { prefix = task.ext.prefix ?: "${meta.id}" - template 'preprocess_transcripts.py' + """ + python3 ${moduleDir}/templates/preprocess_transcripts.py \\ + --transcripts ${transcripts} \\ + --prefix ${prefix} \\ + --min-qv ${min_qv} \\ + --min-x ${min_x} \\ + --max-x ${max_x} \\ + --min-y ${min_y} \\ + --max-y ${max_y} + """ stub: // Exit if running this module with -profile conda / -profile mamba diff --git a/modules/local/baysor/preprocess/templates/preprocess_transcripts.py b/modules/local/baysor/preprocess/templates/preprocess_transcripts.py index 25bc41dc..2662f83c 100644 --- a/modules/local/baysor/preprocess/templates/preprocess_transcripts.py +++ b/modules/local/baysor/preprocess/templates/preprocess_transcripts.py @@ -6,19 +6,11 @@ removes negative control probes, and outputs filtered CSV for Baysor compatibility. """ +import argparse import os import pandas as pd -# Nextflow-injected variables -TRANSCRIPTS = "${transcripts}" -PREFIX = "${prefix}" -MIN_QV = "${min_qv}" -MIN_X = "${min_x}" -MAX_X = "${max_x}" -MIN_Y = "${min_y}" -MAX_Y = "${max_y}" - def filter_transcripts( transcripts: str, @@ -78,14 +70,53 @@ def main() -> None: """ Run preprocess transcripts as nf module. 
""" + parser = argparse.ArgumentParser( + description="Preprocess Xenium transcripts for Baysor" + ) + parser.add_argument( + "--transcripts", required=True, help="Path to transcripts parquet file" + ) + parser.add_argument("--prefix", required=True, help="Output directory prefix") + parser.add_argument( + "--min-qv", + type=float, + default=20.0, + help="Minimum Q-Score threshold (default: 20.0)", + ) + parser.add_argument( + "--min-x", + type=float, + default=0.0, + help="Minimum x-coordinate threshold (default: 0.0)", + ) + parser.add_argument( + "--max-x", + type=float, + default=24000.0, + help="Maximum x-coordinate threshold (default: 24000.0)", + ) + parser.add_argument( + "--min-y", + type=float, + default=0.0, + help="Minimum y-coordinate threshold (default: 0.0)", + ) + parser.add_argument( + "--max-y", + type=float, + default=24000.0, + help="Maximum y-coordinate threshold (default: 24000.0)", + ) + args = parser.parse_args() + filter_transcripts( - transcripts=TRANSCRIPTS, - min_qv=float(MIN_QV), - min_x=float(MIN_X), - max_x=float(MAX_X), - min_y=float(MIN_Y), - max_y=float(MAX_Y), - prefix=PREFIX, + transcripts=args.transcripts, + min_qv=args.min_qv, + min_x=args.min_x, + max_x=args.max_x, + min_y=args.min_y, + max_y=args.max_y, + prefix=args.prefix, ) return None diff --git a/modules/local/ficture/preprocess/main.nf b/modules/local/ficture/preprocess/main.nf index 76005c10..09b6fb4c 100644 --- a/modules/local/ficture/preprocess/main.nf +++ b/modules/local/ficture/preprocess/main.nf @@ -21,8 +21,15 @@ process FICTURE_PREPROCESS { script: def args = task.ext.args ?: '' + def features_arg = features ? "--features ${features}" : "" - template 'ficture_preprocess.py' + """ + python3 ${moduleDir}/templates/ficture_preprocess.py \\ + --transcripts ${transcripts} \\ + ${features_arg} \\ + --negative-control-regex '${params.negative_control_regex}' \\ + ${args} + """ stub: """ diff --git a/modules/local/ficture/preprocess/templates/ficture_preprocess.py b/modules/local/ficture/preprocess/templates/ficture_preprocess.py index 92ae9721..2e0c687c 100644 --- a/modules/local/ficture/preprocess/templates/ficture_preprocess.py +++ b/modules/local/ficture/preprocess/templates/ficture_preprocess.py @@ -1,40 +1,18 @@ #!/usr/bin/env python3 """Preprocess Xenium transcripts for FICTURE analysis.""" +import argparse import gzip import logging import os import re -import shlex import sys import pandas as pd -# Nextflow-injected variables -TRANSCRIPTS = "${transcripts}" -FEATURES = "${features}" -NEGATIVE_CONTROL_REGEX = "${params.negative_control_regex}" -ARGS = "${args}" - - -def main(): - """Run FICTURE preprocessing.""" - print("[START]") - - # Re-build argv so parse_known_args() can absorb any extra flags from task.ext.args - sys.argv = [ - sys.argv[0], - "--transcripts", - TRANSCRIPTS, - "--negative-control-regex", - NEGATIVE_CONTROL_REGEX, - ] - if FEATURES: - sys.argv += ["--features", FEATURES] - sys.argv += shlex.split(ARGS) - - import argparse +def parse_args(): + """Parse command-line arguments.""" parser = argparse.ArgumentParser( description="Preprocess Xenium transcripts for FICTURE" ) @@ -47,7 +25,13 @@ def main(): parser.add_argument( "--negative-control-regex", default="", help="Regex for negative control probes" ) - args, _ = parser.parse_known_args() + return parser.parse_args() + + +def main(): + """Run FICTURE preprocessing.""" + args = parse_args() + print("[START]") negctrl_regex = "BLANK|NegCon" if args.negative_control_regex: diff --git 
a/modules/local/segger/create_dataset/main.nf b/modules/local/segger/create_dataset/main.nf index 2b03d2c9..bd81f7b8 100644 --- a/modules/local/segger/create_dataset/main.nf +++ b/modules/local/segger/create_dataset/main.nf @@ -29,7 +29,19 @@ process SEGGER_CREATE_DATASET { error("${params.format} is an invalid platform type.") } - template 'run_create_dataset.py' + """ + export NUMBA_CACHE_DIR=\$PWD/.numba_cache + mkdir -p \$NUMBA_CACHE_DIR + + python3 ${moduleDir}/templates/run_create_dataset.py \\ + --bundle-dir ${base_dir} \\ + --output-dir ${prefix} \\ + --sample-type ${params.format} \\ + --tile-width ${params.tile_width} \\ + --tile-height ${params.tile_height} \\ + --n-workers ${task.cpus} \\ + ${args} + """ stub: // Exit if running this module with -profile conda / -profile mamba diff --git a/modules/local/segger/create_dataset/templates/run_create_dataset.py b/modules/local/segger/create_dataset/templates/run_create_dataset.py index 28df3cff..c73ab006 100644 --- a/modules/local/segger/create_dataset/templates/run_create_dataset.py +++ b/modules/local/segger/create_dataset/templates/run_create_dataset.py @@ -11,14 +11,8 @@ Each WORKAROUND should be removable when the upstream segger bug is fixed. """ -# ruff: noqa: E402 -- NUMBA_CACHE_DIR must be set before subsequent imports -# (segger / torch transitively pull in numba). Order matters; do not reorder. +import argparse import os - -os.environ["NUMBA_CACHE_DIR"] = os.path.join(os.getcwd(), ".numba_cache") -os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True) - -import shlex import shutil import subprocess import sys @@ -32,14 +26,18 @@ SEGGER_CLI = "/workspace/segger_dev/src/segger/cli/create_dataset_fast.py" -# Nextflow-injected variables -BUNDLE_DIR = "${base_dir}" -OUTPUT_DIR = "${prefix}" -SAMPLE_TYPE = "${params.format}" -TILE_WIDTH = "${params.tile_width}" -TILE_HEIGHT = "${params.tile_height}" -N_WORKERS = "${task.cpus}" -ARGS = "${args}" + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--bundle-dir", required=True) + p.add_argument("--output-dir", required=True) + p.add_argument("--sample-type", required=True, choices=["xenium"]) + p.add_argument("--tile-width", type=int, required=True) + p.add_argument("--tile-height", type=int, required=True) + p.add_argument("--n-workers", type=int, required=True) + # remaining args forwarded to segger CLI + args, extra = p.parse_known_args() + return args, extra def prepare_bundle(bundle_dir): @@ -112,22 +110,22 @@ def add_parquet_stats(): print("=== End Debug ===\n") -def run_segger_cli(output_dir, sample_type, tile_width, tile_height, n_workers, extra): +def run_segger_cli(args, extra): cmd = [ "python3", SEGGER_CLI, "--base_dir", "bundle_stats", "--data_dir", - output_dir, + args.output_dir, "--sample_type", - sample_type, + args.sample_type, "--tile_width", - str(tile_width), + str(args.tile_width), "--tile_height", - str(tile_height), + str(args.tile_height), "--n_workers", - str(n_workers), + str(args.n_workers), *extra, ] print(f"Running: {' '.join(cmd)}") @@ -230,9 +228,13 @@ def fix_bd_x_nan(prefix): def main(): - extra = shlex.split(ARGS) + args, extra = parse_args() - prepare_bundle(BUNDLE_DIR) + # Ensure numba cache dir is writable (env var should be set by caller, but belt-and-suspenders) + os.environ.setdefault("NUMBA_CACHE_DIR", os.path.join(os.getcwd(), ".numba_cache")) + os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True) + + prepare_bundle(args.bundle_dir) print("Adding statistics to parquet files...") add_parquet_stats() @@ -241,17 
+243,10 @@ def main(): for item in sorted(Path("bundle_stats").iterdir()): print(f" {item.name}") - run_segger_cli( - OUTPUT_DIR, - SAMPLE_TYPE, - int(TILE_WIDTH), - int(TILE_HEIGHT), - int(N_WORKERS), - extra, - ) + run_segger_cli(args, extra) - filter_trainable_tiles_if_needed(OUTPUT_DIR) - fix_bd_x_nan(OUTPUT_DIR) + filter_trainable_tiles_if_needed(args.output_dir) + fix_bd_x_nan(args.output_dir) if __name__ == "__main__": diff --git a/modules/local/segger/predict/main.nf b/modules/local/segger/predict/main.nf index d8384c74..48195d53 100644 --- a/modules/local/segger/predict/main.nf +++ b/modules/local/segger/predict/main.nf @@ -26,8 +26,18 @@ process SEGGER_PREDICT { def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" - - template 'run_predict.py' + """ + python3 ${moduleDir}/templates/run_predict.py \\ + --models-dir ${models_dir} \\ + --segger-data-dir ${segger_dataset} \\ + --transcripts-file ${transcripts} \\ + --benchmarks-dir benchmarks_dir \\ + --batch-size ${params.batch_size_predict} \\ + --use-cc ${params.cc_analysis} \\ + --knn-method ${params.segger_knn_method} \\ + --num-workers ${task.cpus} \\ + ${args} + """ stub: // Exit if running this module with -profile conda / -profile mamba diff --git a/modules/local/segger/predict/templates/run_predict.py b/modules/local/segger/predict/templates/run_predict.py index ca2051a7..56a77ffc 100644 --- a/modules/local/segger/predict/templates/run_predict.py +++ b/modules/local/segger/predict/templates/run_predict.py @@ -12,23 +12,12 @@ import argparse import os -import shlex import subprocess import sys SEGGER_CLI = "/workspace/segger_dev/src/segger/cli/predict_fast.py" -# Nextflow-injected variables -MODELS_DIR = "${models_dir}" -SEGGER_DATASET = "${segger_dataset}" -TRANSCRIPTS = "${transcripts}" -BATCH_SIZE = "${params.batch_size_predict}" -USE_CC = "${params.cc_analysis}" -KNN_METHOD = "${params.segger_knn_method}" -NUM_WORKERS = "${task.cpus}" -ARGS = "${args}" - def parse_args(): p = argparse.ArgumentParser() @@ -124,27 +113,6 @@ def run_segger_cli(args, extra, gpu_ids): def main(): - # Re-build argv from Nextflow-injected constants + any extra task.ext.args flags - sys.argv = [ - sys.argv[0], - "--models-dir", - MODELS_DIR, - "--segger-data-dir", - SEGGER_DATASET, - "--transcripts-file", - TRANSCRIPTS, - "--benchmarks-dir", - "benchmarks_dir", - "--batch-size", - BATCH_SIZE, - "--use-cc", - USE_CC, - "--knn-method", - KNN_METHOD, - "--num-workers", - NUM_WORKERS, - ] + shlex.split(ARGS) - args, extra = parse_args() # Limit cupy GPU memory to 80% so PyTorch has headroom for graph attention ops diff --git a/modules/local/spatialdata/merge/main.nf b/modules/local/spatialdata/merge/main.nf index 9ff4b112..9e063844 100644 --- a/modules/local/spatialdata/merge/main.nf +++ b/modules/local/spatialdata/merge/main.nf @@ -25,7 +25,13 @@ process SPATIALDATA_MERGE { prefix = task.ext.prefix ?: "${meta.id}" - template 'spatialdata_merge.py' + """ + python3 ${moduleDir}/templates/spatialdata_merge.py \\ + --raw-bundle ${raw_bundle} \\ + --redefined-bundle ${redefined_bundle} \\ + --prefix ${prefix} \\ + --output-folder ${outputfolder} + """ stub: // Exit if running this module with -profile conda / -profile mamba diff --git a/modules/local/spatialdata/merge/templates/spatialdata_merge.py b/modules/local/spatialdata/merge/templates/spatialdata_merge.py index cd55e00c..409d8c00 100644 --- a/modules/local/spatialdata/merge/templates/spatialdata_merge.py +++ b/modules/local/spatialdata/merge/templates/spatialdata_merge.py @@ 
-1,36 +1,45 @@ #!/usr/bin/env python3 """Merge two spatialdata bundles to create a layered spatialdata object.""" +import argparse import json import os import shutil -import spatialdata # noqa: F401 (kept so versions topic via `import spatialdata` is valid) +import spatialdata -# Nextflow-injected variables -RAW_BUNDLE = "${raw_bundle}" -REDEFINED_BUNDLE = "${redefined_bundle}" -PREFIX = "${prefix}" -OUTPUT_FOLDER = "${outputfolder}" + +def parse_args(): + """Parse command-line arguments.""" + parser = argparse.ArgumentParser(description="Merge two spatialdata bundles") + parser.add_argument("--raw-bundle", required=True, help="Path to raw spatialdata bundle") + parser.add_argument("--redefined-bundle", required=True, help="Path to redefined spatialdata bundle") + parser.add_argument("--prefix", required=True, help="Output prefix (sample ID)") + parser.add_argument("--output-folder", required=True, help="Output folder name") + return parser.parse_args() def main(): """Run spatialdata merge.""" + args = parse_args() print("[START]") - output_dir = f"spatialdata/{PREFIX}/{OUTPUT_FOLDER}" + output_dir = f"spatialdata/{args.prefix}/{args.output_folder}" + # Ensure the output folder exists if os.path.exists(output_dir): shutil.rmtree(output_dir) os.makedirs(output_dir) - for root, _, files in os.walk(RAW_BUNDLE): - rel_path = os.path.relpath(root, RAW_BUNDLE) + # Copy the entire reference bundle as is + for root, _, files in os.walk(args.raw_bundle): + rel_path = os.path.relpath(root, args.raw_bundle) target_path = os.path.join(output_dir, rel_path) os.makedirs(target_path, exist_ok=True) for file in files: shutil.copy(os.path.join(root, file), os.path.join(target_path, file)) + # Rename folders in Points, Shapes, and Tables to raw_* for category in ["points", "shapes", "tables"]: category_path = os.path.join(output_dir, category) if os.path.exists(category_path): @@ -40,8 +49,9 @@ def main(): new_path = os.path.join(category_path, f"raw_{folder}") os.rename(old_path, new_path) + # Copy folders from redefined_bundle and rename them as redefined_* for category in ["points", "shapes", "tables"]: - add_category_path = os.path.join(REDEFINED_BUNDLE, category) + add_category_path = os.path.join(args.redefined_bundle, category) output_category_path = os.path.join(output_dir, category) os.makedirs(output_category_path, exist_ok=True) @@ -52,16 +62,17 @@ def main(): shutil.copytree(src_folder, dest_folder) # Invalidate consolidated metadata in zarr.json -- the directory renames above - # made the element paths in the metadata stale. Without consolidated metadata, - # sd.read_zarr() discovers elements by scanning the filesystem directly. + # made the element paths in the metadata stale (e.g., 'points/transcripts' -> + # 'points/raw_transcripts'). Without consolidated metadata, sd.read_zarr() + # discovers elements by scanning the filesystem directly. 
zarr_json = os.path.join(output_dir, "zarr.json") if os.path.exists(zarr_json): with open(zarr_json) as f: - meta_obj = json.load(f) - if "consolidated_metadata" in meta_obj: - del meta_obj["consolidated_metadata"] + meta = json.load(f) + if "consolidated_metadata" in meta: + del meta["consolidated_metadata"] with open(zarr_json, "w") as f: - json.dump(meta_obj, f) + json.dump(meta, f) print("[NOTE] Removed stale consolidated metadata from zarr.json") print("[FINISH]") diff --git a/modules/local/spatialdata/meta/main.nf b/modules/local/spatialdata/meta/main.nf index c3665b1f..54d9ae25 100644 --- a/modules/local/spatialdata/meta/main.nf +++ b/modules/local/spatialdata/meta/main.nf @@ -25,7 +25,14 @@ process SPATIALDATA_META { prefix = task.ext.prefix ?: "${meta.id}" - template 'spatialdata_meta.py' + """ + python3 ${moduleDir}/templates/spatialdata_meta.py \\ + --spatialdata-bundle ${spatialdata_bundle} \\ + --xenium-bundle ${xenium_bundle} \\ + --prefix ${prefix} \\ + --metadata '${meta}' \\ + --output-folder ${outputfolder} + """ stub: // Exit if running this module with -profile conda / -profile mamba diff --git a/modules/local/spatialdata/meta/templates/spatialdata_meta.py b/modules/local/spatialdata/meta/templates/spatialdata_meta.py index e7f4d3d2..935f39b2 100644 --- a/modules/local/spatialdata/meta/templates/spatialdata_meta.py +++ b/modules/local/spatialdata/meta/templates/spatialdata_meta.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 """Add metadata to SpatialData bundle.""" +import argparse import json import sys import pandas as pd import spatialdata as sd -import zarr # noqa: F401 (preserved from original; ensures zarr is loaded before zarr.core.group) +import zarr # Fix zarr v3 + anndata + numcodecs incompatibility: # anndata's string writer passes numcodecs.VLenUTF8 to zarr.Group.create_array, @@ -16,13 +17,6 @@ import numcodecs import zarr.core.group as _zarr_group -# Nextflow-injected variables -SPATIALDATA_BUNDLE = "${spatialdata_bundle}" -XENIUM_BUNDLE = "${xenium_bundle}" -PREFIX = "${prefix}" -METADATA = "${meta}" -OUTPUT_FOLDER = "${outputfolder}" - _orig_create_array = _zarr_group.Group.create_array @@ -76,15 +70,27 @@ def convert_arrow_to_numpy(sdata): _convert_df_arrow_to_numpy(adata.var) +def parse_args(): + """Parse command-line arguments.""" + parser = argparse.ArgumentParser(description="Add metadata to SpatialData bundle") + parser.add_argument("--spatialdata-bundle", required=True, help="Path to spatialdata bundle") + parser.add_argument("--xenium-bundle", required=True, help="Path to xenium bundle") + parser.add_argument("--prefix", required=True, help="Output prefix (sample ID)") + parser.add_argument("--metadata", required=True, help="Metadata string from Nextflow meta map") + parser.add_argument("--output-folder", required=True, help="Output folder name") + return parser.parse_args() + + def main(): """Run spatialdata metadata addition.""" + args = parse_args() print("[START]") - sdata = sd.read_zarr(SPATIALDATA_BUNDLE) + sdata = sd.read_zarr(args.spatialdata_bundle) # Convert metadata into dict print("[NOTE] Read in provenance ...") - metadata = METADATA.strip("[]") # Remove square brackets + metadata = args.metadata.strip("[]") # Remove square brackets pairs = metadata.split(", ") # Split by comma and space metadata = {k: v for k, v in (pair.split(":") for pair in pairs)} # Create dictionary @@ -97,7 +103,7 @@ def main(): # Add experimental metadata print("[NOTE] Read in experiment metadata ...") sdata['raw_table'].uns['experiment_xenium'] = '' - 
metadata_experiment = f'{XENIUM_BUNDLE}/experiment.xenium' + metadata_experiment = f'{args.xenium_bundle}/experiment.xenium' with open(metadata_experiment, "r") as f: metadata_experiment = json.load(f) sdata['raw_table'].uns['experiment_xenium'] = json.dumps(metadata_experiment) @@ -105,13 +111,13 @@ def main(): # Add gene panel metadata print("[NOTE] Read in gene panel metadata ...") sdata['raw_table'].uns['gene_panel'] = '' - metadata_gene_panel = f'{XENIUM_BUNDLE}/gene_panel.json' + metadata_gene_panel = f'{args.xenium_bundle}/gene_panel.json' with open(metadata_gene_panel, "r") as f: metadata_gene_panel = json.load(f) sdata['raw_table'].uns['gene_panel'] = json.dumps(metadata_gene_panel) convert_arrow_to_numpy(sdata) - sdata.write(f"spatialdata/{PREFIX}/{OUTPUT_FOLDER}", overwrite=True, consolidate_metadata=True, sdata_formats=None) + sdata.write(f"spatialdata/{args.prefix}/{args.output_folder}", overwrite=True, consolidate_metadata=True, sdata_formats=None) print("[FINISH]") diff --git a/modules/local/spatialdata/write/main.nf b/modules/local/spatialdata/write/main.nf index 1144b7c9..050dcdd7 100644 --- a/modules/local/spatialdata/write/main.nf +++ b/modules/local/spatialdata/write/main.nf @@ -27,7 +27,15 @@ process SPATIALDATA_WRITE { prefix = task.ext.prefix ?: "${meta.id}" - template 'spatialdata_write.py' + """ + python3 ${moduleDir}/templates/spatialdata_write.py \\ + --bundle ${bundle} \\ + --prefix ${prefix} \\ + --output-folder ${outputfolder} \\ + --segmented-object ${segmented_object} \\ + --coordinate-space ${coordinate_space} \\ + --format ${params.format} + """ stub: // Exit if running this module with -profile conda / -profile mamba diff --git a/modules/local/spatialdata/write/templates/spatialdata_write.py b/modules/local/spatialdata/write/templates/spatialdata_write.py index 9428dccd..421e830f 100644 --- a/modules/local/spatialdata/write/templates/spatialdata_write.py +++ b/modules/local/spatialdata/write/templates/spatialdata_write.py @@ -1,10 +1,11 @@ #!/usr/bin/env python3 """Write spatialdata object from segmentation format.""" +import argparse import sys import pandas as pd -import spatialdata # noqa: F401 (preserved from original; ensures spatialdata loads before spatialdata_io) +import spatialdata from spatialdata_io import xenium # Fix zarr v3 + anndata + numcodecs incompatibility: @@ -15,14 +16,6 @@ import numcodecs import zarr.core.group as _zarr_group -# Nextflow-injected variables -BUNDLE = "${bundle}" -PREFIX = "${prefix}" -OUTPUT_FOLDER = "${outputfolder}" -SEGMENTED_OBJECT = "${segmented_object}" -COORDINATE_SPACE = "${coordinate_space}" -FORMAT = "${params.format}" - _orig_create_array = _zarr_group.Group.create_array @@ -84,8 +77,21 @@ def convert_arrow_to_numpy(sdata): _convert_df_arrow_to_numpy(adata.var) +def parse_args(): + """Parse command-line arguments.""" + parser = argparse.ArgumentParser(description="Write spatialdata object from segmentation format") + parser.add_argument("--bundle", required=True, help="Path to input bundle") + parser.add_argument("--prefix", required=True, help="Output prefix (sample ID)") + parser.add_argument("--output-folder", required=True, help="Output folder name") + parser.add_argument("--segmented-object", required=True, help="Segmented object type (cells, nuclei, cells_and_nuclei)") + parser.add_argument("--coordinate-space", required=True, help="Coordinate space (pixels, microns)") + parser.add_argument("--format", required=True, help="Input format (xenium)") + return parser.parse_args() + + def main(): """Run 
spatialdata write.""" + args = parse_args() print("[START]") cells_as_circles = False @@ -94,13 +100,13 @@ def main(): cells_labels = False nucleus_labels = False - if SEGMENTED_OBJECT == "cells": + if args.segmented_object == "cells": cells_boundaries = True cells_labels = True - elif SEGMENTED_OBJECT == "nuclei": + elif args.segmented_object == "nuclei": nucleus_boundaries = True nucleus_labels = True - elif SEGMENTED_OBJECT == "cells_and_nuclei": + elif args.segmented_object == "cells_and_nuclei": cells_boundaries = True nucleus_boundaries = True cells_labels = True @@ -109,7 +115,7 @@ def main(): cells_as_circles = False # set sd variables based on the coordinate space - if COORDINATE_SPACE == "pixels": + if args.coordinate_space == "pixels": cells_labels = True nucleus_labels = True # Labels are sufficient in pixel space; boundaries can contain @@ -118,16 +124,16 @@ def main(): cells_boundaries = False nucleus_boundaries = False - if COORDINATE_SPACE == "microns": + if args.coordinate_space == "microns": cells_labels = False cells_boundaries = True nucleus_boundaries = False nucleus_labels = False cells_as_circles = False - if FORMAT == "xenium": + if args.format == "xenium": sd_xenium_obj = xenium( - BUNDLE, + args.bundle, cells_as_circles=cells_as_circles, cells_boundaries=cells_boundaries, nucleus_boundaries=nucleus_boundaries, @@ -139,7 +145,7 @@ def main(): ) print(sd_xenium_obj) convert_arrow_to_numpy(sd_xenium_obj) - sd_xenium_obj.write(f"spatialdata/{PREFIX}/{OUTPUT_FOLDER}") + sd_xenium_obj.write(f"spatialdata/{args.prefix}/{args.output_folder}") else: sys.exit("[ERROR] Format not found") diff --git a/modules/local/utility/convert_mask_uint32/main.nf b/modules/local/utility/convert_mask_uint32/main.nf index b80acfad..78190dfe 100644 --- a/modules/local/utility/convert_mask_uint32/main.nf +++ b/modules/local/utility/convert_mask_uint32/main.nf @@ -34,8 +34,11 @@ process CONVERT_MASK_UINT32 { script: prefix = task.ext.prefix ?: "${meta.id}" - - template 'convert_mask_uint32.py' + """ + python3 ${moduleDir}/templates/convert_mask_uint32.py \\ + --input ${mask} \\ + --output ${prefix}_uint32_mask.tif + """ stub: prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/utility/convert_mask_uint32/templates/convert_mask_uint32.py b/modules/local/utility/convert_mask_uint32/templates/convert_mask_uint32.py index f0a6835d..955ad4b7 100644 --- a/modules/local/utility/convert_mask_uint32/templates/convert_mask_uint32.py +++ b/modules/local/utility/convert_mask_uint32/templates/convert_mask_uint32.py @@ -7,13 +7,11 @@ the input mask, casts it to uint32, and writes the result. """ +import argparse + import numpy as np import tifffile -# Nextflow-injected variables -INPUT_PATH = "${mask}" -OUTPUT_PATH = "${prefix}_uint32_mask.tif" - def convert_mask_to_uint32(input_path: str, output_path: str) -> None: """ @@ -29,5 +27,20 @@ def convert_mask_to_uint32(input_path: str, output_path: str) -> None: print("Output dtype: uint32") +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Convert a segmentation mask TIFF to uint32 dtype." 
+ ) + parser.add_argument( + "--input", required=True, help="Path to input mask TIFF" + ) + parser.add_argument( + "--output", required=True, help="Path where uint32 mask will be written" + ) + return parser.parse_args() + + if __name__ == "__main__": - convert_mask_to_uint32(input_path=INPUT_PATH, output_path=OUTPUT_PATH) + args = parse_args() + convert_mask_to_uint32(input_path=args.input, output_path=args.output) diff --git a/modules/local/utility/downscale_morphology/main.nf b/modules/local/utility/downscale_morphology/main.nf index 39620bd6..1423106f 100644 --- a/modules/local/utility/downscale_morphology/main.nf +++ b/modules/local/utility/downscale_morphology/main.nf @@ -40,8 +40,13 @@ process DOWNSCALE_MORPHOLOGY { def diameter = task.ext.diameter ?: 9 def diam_mean = 30 prefix = task.ext.prefix ?: "${meta.id}" - - template 'downscale_morphology.py' + """ + python3 ${moduleDir}/templates/downscale_morphology.py \\ + --image ${image} \\ + --diameter ${diameter} \\ + --diam-mean ${diam_mean} \\ + --prefix ${prefix} + """ stub: prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/utility/downscale_morphology/templates/downscale_morphology.py b/modules/local/utility/downscale_morphology/templates/downscale_morphology.py index 2f0f1e7d..8544ecf3 100644 --- a/modules/local/utility/downscale_morphology/templates/downscale_morphology.py +++ b/modules/local/utility/downscale_morphology/templates/downscale_morphology.py @@ -13,18 +13,13 @@ {prefix}/scale_info.json - Scale factor and original/new dimensions. """ +import argparse import json from pathlib import Path import tifffile from skimage.transform import resize -# Nextflow-injected variables -IMAGE_PATH = "${image}" -DIAMETER = float("${diameter}") -DIAM_MEAN = float("${diam_mean}") -PREFIX = "${prefix}" - # Cellpose network requires a minimum spatial size of 256 px. MIN_DIM = 256 @@ -86,10 +81,23 @@ def downscale_image( print(f"Done: downscaled.tif written, shape={img_ds.shape}") +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Pre-downscale a morphology image for Cellpose." 
+ ) + parser.add_argument("--image", required=True, help="Morphology TIFF input") + parser.add_argument("--diameter", type=float, required=True, help="Target object diameter") + parser.add_argument("--diam-mean", type=float, required=True, help="Cellpose model diam_mean") + parser.add_argument("--prefix", required=True, help="Output directory") + return parser.parse_args() + + if __name__ == "__main__": + args = parse_args() downscale_image( - image_path=IMAGE_PATH, - diameter=DIAMETER, - diam_mean=DIAM_MEAN, - prefix=PREFIX, + image_path=args.image, + diameter=args.diameter, + diam_mean=args.diam_mean, + prefix=args.prefix, ) diff --git a/modules/local/utility/extract_dapi/main.nf b/modules/local/utility/extract_dapi/main.nf index 206e6519..3ff3ae78 100644 --- a/modules/local/utility/extract_dapi/main.nf +++ b/modules/local/utility/extract_dapi/main.nf @@ -35,8 +35,12 @@ process EXTRACT_DAPI { script: prefix = task.ext.prefix ?: "${meta.id}" def channel_index = task.ext.channel_index ?: 0 - - template 'extract_dapi.py' + """ + python3 ${moduleDir}/templates/extract_dapi.py \\ + --input ${image} \\ + --output ${prefix}_dapi.tif \\ + --channel-index ${channel_index} + """ stub: prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/utility/extract_dapi/templates/extract_dapi.py b/modules/local/utility/extract_dapi/templates/extract_dapi.py index f0d9b9f2..3d60f563 100644 --- a/modules/local/utility/extract_dapi/templates/extract_dapi.py +++ b/modules/local/utility/extract_dapi/templates/extract_dapi.py @@ -8,12 +8,9 @@ the requested channel, and writes the result. """ -import tifffile +import argparse -# Nextflow-injected variables -INPUT_PATH = "${image}" -OUTPUT_PATH = "${prefix}_dapi.tif" -CHANNEL_INDEX = int("${channel_index}") +import tifffile def extract_channel(input_path: str, output_path: str, channel_index: int) -> None: @@ -37,9 +34,27 @@ def extract_channel(input_path: str, output_path: str, channel_index: int) -> No print(f"Input shape: {orig_shape} -> extracted channel {channel_index}: {img.shape}") +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Extract a single channel from a multi-channel OME-TIFF." 
+ ) + parser.add_argument( + "--input", required=True, help="Path to multi-channel OME-TIFF morphology image" + ) + parser.add_argument( + "--output", required=True, help="Path where the single-channel TIFF will be written" + ) + parser.add_argument( + "--channel-index", type=int, default=0, help="Channel index to extract (default: 0)" + ) + return parser.parse_args() + + if __name__ == "__main__": + args = parse_args() extract_channel( - input_path=INPUT_PATH, - output_path=OUTPUT_PATH, - channel_index=CHANNEL_INDEX, + input_path=args.input, + output_path=args.output, + channel_index=args.channel_index, ) diff --git a/modules/local/utility/extract_preview_data/main.nf b/modules/local/utility/extract_preview_data/main.nf index 34d997fb..c0ae5117 100644 --- a/modules/local/utility/extract_preview_data/main.nf +++ b/modules/local/utility/extract_preview_data/main.nf @@ -25,7 +25,11 @@ process EXTRACT_PREVIEW_DATA { prefix = task.ext.prefix ?: "${meta.id}" - template 'extract_data.py' + """ + python3 ${moduleDir}/templates/extract_data.py \\ + --preview-html ${preview_html} \\ + --prefix ${prefix} + """ stub: // Exit if running this module with -profile conda / -profile mamba diff --git a/modules/local/utility/extract_preview_data/templates/extract_data.py b/modules/local/utility/extract_preview_data/templates/extract_data.py index d34fedaf..0ea737c2 100644 --- a/modules/local/utility/extract_preview_data/templates/extract_data.py +++ b/modules/local/utility/extract_preview_data/templates/extract_data.py @@ -6,6 +6,7 @@ Baysor preview.html file, writing MultiQC-compatible TSV and PNG files. """ +import argparse import base64 import html import json @@ -17,10 +18,6 @@ import pandas as pd from bs4 import BeautifulSoup -# Nextflow-injected variables -PREVIEW_HTML = "${preview_html}" -PREFIX = "${prefix}" - def get_png_files(soup: BeautifulSoup, outdir: Path) -> None: """Get png base64 images following specific h1 tags in preview.html""" @@ -163,9 +160,29 @@ def write_tsvs(specs: Dict[str, str], outdir: Path) -> List[Path]: return written +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Extract preview data from Baysor preview HTML reports." 
+ ) + parser.add_argument( + "--preview-html", + required=True, + help="Path to Baysor preview HTML file", + ) + parser.add_argument( + "--prefix", + required=True, + help="Output directory prefix (sample ID)", + ) + return parser.parse_args() + + if __name__ == "__main__": - input_path: Path = Path(PREVIEW_HTML) - outdir: Path = Path(PREFIX) + args = parse_args() + + input_path: Path = Path(args.preview_html) + outdir: Path = Path(args.prefix) text = input_path.read_text(encoding="utf-8", errors="ignore") soup = BeautifulSoup(text, "html.parser") diff --git a/modules/local/utility/get_coordinates/main.nf b/modules/local/utility/get_coordinates/main.nf index f6b70571..e45d4798 100644 --- a/modules/local/utility/get_coordinates/main.nf +++ b/modules/local/utility/get_coordinates/main.nf @@ -24,7 +24,10 @@ process GET_TRANSCRIPTS_COORDINATES { prefix = task.ext.prefix ?: "${meta.id}" - template 'get_coordinates.py' + """ + python3 ${moduleDir}/templates/get_coordinates.py \\ + --transcripts ${transcripts} + """ stub: // Exit if running this module with -profile conda / -profile mamba diff --git a/modules/local/utility/get_coordinates/templates/get_coordinates.py b/modules/local/utility/get_coordinates/templates/get_coordinates.py index 05c01567..8770f9a2 100644 --- a/modules/local/utility/get_coordinates/templates/get_coordinates.py +++ b/modules/local/utility/get_coordinates/templates/get_coordinates.py @@ -6,10 +6,9 @@ bounding box (x_min, x_max, y_min, y_max) to stdout. """ -import pandas as pd +import argparse -# Nextflow-injected variables -TRANSCRIPTS = "${transcripts}" +import pandas as pd def get_coordinates(parquet_path: str): @@ -42,6 +41,20 @@ def get_coordinates(parquet_path: str): ) +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Get transcript coordinate bounds from a Parquet file." + ) + parser.add_argument( + "--transcripts", + required=True, + help="Path to transcripts parquet file", + ) + return parser.parse_args() + + if __name__ == "__main__": - result = get_coordinates(TRANSCRIPTS) + args = parse_args() + result = get_coordinates(args.transcripts) print(",".join(str(v) for v in result)) diff --git a/modules/local/utility/parquet_to_csv/main.nf b/modules/local/utility/parquet_to_csv/main.nf index 6df3da37..65d8e580 100644 --- a/modules/local/utility/parquet_to_csv/main.nf +++ b/modules/local/utility/parquet_to_csv/main.nf @@ -24,7 +24,12 @@ process PARQUET_TO_CSV { } prefix = task.ext.prefix ?: "${meta.id}" - template 'parquet_to_csv.py' + """ + python3 ${moduleDir}/templates/parquet_to_csv.py \\ + --transcripts ${transcripts} \\ + --extension ${extension} \\ + --prefix ${prefix} + """ stub: // Exit if running this module with -profile conda / -profile mamba diff --git a/modules/local/utility/parquet_to_csv/templates/parquet_to_csv.py b/modules/local/utility/parquet_to_csv/templates/parquet_to_csv.py index cb1d8107..bfa19c40 100644 --- a/modules/local/utility/parquet_to_csv/templates/parquet_to_csv.py +++ b/modules/local/utility/parquet_to_csv/templates/parquet_to_csv.py @@ -5,15 +5,11 @@ Reads a Parquet file and writes it as CSV, optionally gzip-compressed. 
""" +import argparse from pathlib import Path import pandas as pd -# Nextflow-injected variables -TRANSCRIPTS = "${transcripts}" -EXTENSION = "${extension}" -PREFIX = "${prefix}" - def convert_parquet( transcripts: str, @@ -42,9 +38,33 @@ def convert_parquet( return None +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Convert a Parquet file to CSV format." + ) + parser.add_argument( + "--transcripts", + required=True, + help="Input parquet filename", + ) + parser.add_argument( + "--extension", + default=".csv", + help="Output extension: '.csv' or '.gz' (default: .csv)", + ) + parser.add_argument( + "--prefix", + required=True, + help="Output directory prefix (sample ID)", + ) + return parser.parse_args() + + if __name__ == "__main__": + args = parse_args() convert_parquet( - transcripts=TRANSCRIPTS, - extension=EXTENSION, - prefix=PREFIX, + transcripts=args.transcripts, + extension=args.extension, + prefix=args.prefix, ) diff --git a/modules/local/utility/resize_tif/main.nf b/modules/local/utility/resize_tif/main.nf index 965d652e..a4989d68 100644 --- a/modules/local/utility/resize_tif/main.nf +++ b/modules/local/utility/resize_tif/main.nf @@ -25,7 +25,14 @@ process RESIZE_TIF { prefix = task.ext.prefix ?: "${meta.id}" - template 'resize_tif.py' + """ + python3 ${moduleDir}/templates/resize_tif.py \\ + --mask ${mask} \\ + --transcripts ${transcripts} \\ + --metadata ${metadata} \\ + --prefix ${prefix} \\ + --mask-filename ${mask} + """ stub: // Exit if running this module with -profile conda / -profile mamba diff --git a/modules/local/utility/resize_tif/templates/resize_tif.py b/modules/local/utility/resize_tif/templates/resize_tif.py index 5674a094..6cca640d 100644 --- a/modules/local/utility/resize_tif/templates/resize_tif.py +++ b/modules/local/utility/resize_tif/templates/resize_tif.py @@ -6,6 +6,7 @@ space of Xenium transcript data using microns-per-pixel metadata. """ +import argparse import json import os from typing import Tuple @@ -15,13 +16,6 @@ import tifffile from skimage.transform import resize -# Nextflow-injected variables -MASK = "${mask}" -TRANSCRIPTS = "${transcripts}" -METADATA = "${metadata}" -PREFIX = "${prefix}" -MASK_FILENAME = "${mask}" - def read_mask(mask_path: str) -> np.ndarray: """Read the segmentation mask from a TIFF file.""" @@ -113,13 +107,28 @@ def main(mask_path: str, transcripts_path: str, metadata_path: str, output_path: print(f"Saved resized mask -> {output_path}") +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Resize a segmentation TIFF mask to match transcript coordinates." 
+ ) + parser.add_argument("--mask", required=True, help="Path to segmentation mask TIFF") + parser.add_argument("--transcripts", required=True, help="Path to transcripts file") + parser.add_argument("--metadata", required=True, help="Path to metadata JSON") + parser.add_argument("--prefix", required=True, help="Output directory prefix") + parser.add_argument("--mask-filename", required=True, help="Original mask filename for output naming") + return parser.parse_args() + + if __name__ == "__main__": - os.makedirs(PREFIX, exist_ok=True) - output_mask: str = os.path.join(PREFIX, f"resized_{MASK_FILENAME}.tif") + args = parse_args() + + os.makedirs(args.prefix, exist_ok=True) + output_mask: str = os.path.join(args.prefix, f"resized_{args.mask_filename}.tif") main( - mask_path=MASK, - transcripts_path=TRANSCRIPTS, - metadata_path=METADATA, + mask_path=args.mask, + transcripts_path=args.transcripts, + metadata_path=args.metadata, output_path=output_mask, ) diff --git a/modules/local/utility/segger2xr/main.nf b/modules/local/utility/segger2xr/main.nf index b562eed4..daec2fba 100644 --- a/modules/local/utility/segger2xr/main.nf +++ b/modules/local/utility/segger2xr/main.nf @@ -24,9 +24,14 @@ process SEGGER2XR { error "SEGGER2XR module does not support Conda. Please use Docker / Singularity / Podman instead." } - min_transcripts = task.ext.min_transcripts_per_cell ?: 3 + def min_transcripts = task.ext.min_transcripts_per_cell ?: 3 - template 'segger2xr.py' + """ + python3 ${moduleDir}/templates/segger2xr.py \\ + --transcripts ${transcripts} \\ + --prefix ${meta.id} \\ + --min-transcripts ${min_transcripts} + """ stub: // Exit if running this module with -profile conda / -profile mamba diff --git a/modules/local/utility/segger2xr/templates/segger2xr.py b/modules/local/utility/segger2xr/templates/segger2xr.py index 96f2d8df..22889e82 100644 --- a/modules/local/utility/segger2xr/templates/segger2xr.py +++ b/modules/local/utility/segger2xr/templates/segger2xr.py @@ -7,6 +7,7 @@ and GeoJSON cell boundary polygons for xeniumranger import-segmentation. """ +import argparse import json from pathlib import Path from typing import List @@ -14,11 +15,6 @@ import pandas as pd from scipy.spatial import ConvexHull -# Nextflow-injected variables -TRANSCRIPTS = "${transcripts}" -PREFIX = "${meta.id}" -MIN_TRANSCRIPTS = int("${min_transcripts}") - # Expected columns in transcripts.parquet REQUIRED_COLUMNS: List[str] = [ "transcript_id", @@ -218,9 +214,34 @@ def main(input_file: str, prefix: str, min_transcripts: int = 3) -> None: generate_viz_polygons(transcripts, f"{prefix}/segmentation_polygons.json", cell_map) +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Convert Segger prediction output to XeniumRanger-compatible format." 
+ ) + parser.add_argument( + "--transcripts", + required=True, + help="Path to Segger output transcripts parquet file", + ) + parser.add_argument( + "--prefix", + required=True, + help="Output directory prefix (sample ID)", + ) + parser.add_argument( + "--min-transcripts", + type=int, + default=3, + help="Minimum transcripts per cell (default: 3)", + ) + return parser.parse_args() + + if __name__ == "__main__": + args = parse_args() main( - input_file=TRANSCRIPTS, - prefix=PREFIX, - min_transcripts=MIN_TRANSCRIPTS, + input_file=args.transcripts, + prefix=args.prefix, + min_transcripts=args.min_transcripts, ) diff --git a/modules/local/utility/split_transcripts/main.nf b/modules/local/utility/split_transcripts/main.nf index 3cc54042..6862643a 100644 --- a/modules/local/utility/split_transcripts/main.nf +++ b/modules/local/utility/split_transcripts/main.nf @@ -23,9 +23,15 @@ process SPLIT_TRANSCRIPTS { if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { error "SPLIT_TRANSCRIPTS module does not support Conda. Please use Docker / Singularity / Podman instead." } - prefix = task.ext.prefix ?: "${meta.id}" + def prefix = task.ext.prefix ?: "${meta.id}" - template 'split_transcripts.py' + """ + python3 ${moduleDir}/templates/split_transcripts.py \\ + --transcripts ${transcripts} \\ + --x-bins ${x_bins} \\ + --y-bins ${y_bins} \\ + --prefix ${prefix} + """ stub: // Exit if running this module with -profile conda / -profile mamba diff --git a/modules/local/utility/split_transcripts/templates/split_transcripts.py b/modules/local/utility/split_transcripts/templates/split_transcripts.py index 1fa6db15..275fbab1 100644 --- a/modules/local/utility/split_transcripts/templates/split_transcripts.py +++ b/modules/local/utility/split_transcripts/templates/split_transcripts.py @@ -6,17 +6,12 @@ tiles, writing a splits.csv with tile boundaries. """ +import argparse import os from typing import List import pandas as pd -# Nextflow-injected variables -TRANSCRIPTS = "${transcripts}" -X_BINS = "${x_bins}" -Y_BINS = "${y_bins}" -PREFIX = "${prefix}" - def compute_quantile_ranges(df: pd.DataFrame, col: str, n_bins: int) -> List: """ @@ -74,10 +69,41 @@ def main( return None +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Split transcript coordinates into spatial tiles." 
+ ) + parser.add_argument( + "--transcripts", + required=True, + help="Path to transcripts parquet file", + ) + parser.add_argument( + "--x-bins", + type=int, + required=True, + help="Number of bins along X axis", + ) + parser.add_argument( + "--y-bins", + type=int, + required=True, + help="Number of bins along Y axis", + ) + parser.add_argument( + "--prefix", + required=True, + help="Output directory prefix", + ) + return parser.parse_args() + + if __name__ == "__main__": + args = parse_args() main( - transcripts=TRANSCRIPTS, - x_bins=int(X_BINS), - y_bins=int(Y_BINS), - prefix=PREFIX, + transcripts=args.transcripts, + x_bins=args.x_bins, + y_bins=args.y_bins, + prefix=args.prefix, ) diff --git a/modules/local/utility/upscale_mask/main.nf b/modules/local/utility/upscale_mask/main.nf index 41b3ff91..f03464a8 100644 --- a/modules/local/utility/upscale_mask/main.nf +++ b/modules/local/utility/upscale_mask/main.nf @@ -34,8 +34,12 @@ process UPSCALE_MASK { script: prefix = task.ext.prefix ?: "${meta.id}" - - template 'upscale_mask.py' + """ + python3 ${moduleDir}/templates/upscale_mask.py \\ + --mask ${mask} \\ + --scale-info ${scale_info} \\ + --prefix ${prefix} + """ stub: prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/utility/upscale_mask/templates/upscale_mask.py b/modules/local/utility/upscale_mask/templates/upscale_mask.py index 9c0ef0bb..6cc1694e 100644 --- a/modules/local/utility/upscale_mask/templates/upscale_mask.py +++ b/modules/local/utility/upscale_mask/templates/upscale_mask.py @@ -9,6 +9,7 @@ Output: {prefix}/upscaled_{mask_basename}.tif """ +import argparse import json from pathlib import Path @@ -16,11 +17,6 @@ import tifffile from PIL import Image -# Nextflow-injected variables -MASK = "${mask}" -SCALE_INFO = "${scale_info}" -PREFIX = "${prefix}" - def upscale_mask(mask_path: str, scale_info_path: str, prefix: str) -> None: """ @@ -56,9 +52,21 @@ def upscale_mask(mask_path: str, scale_info_path: str, prefix: str) -> None: ) +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Upscale a Cellpose mask back to original resolution." + ) + parser.add_argument("--mask", required=True, help="Downscaled mask TIFF") + parser.add_argument("--scale-info", required=True, help="scale_info.json from downscale step") + parser.add_argument("--prefix", required=True, help="Output directory") + return parser.parse_args() + + if __name__ == "__main__": + args = parse_args() upscale_mask( - mask_path=MASK, - scale_info_path=SCALE_INFO, - prefix=PREFIX, + mask_path=args.mask, + scale_info_path=args.scale_info, + prefix=args.prefix, ) diff --git a/modules/local/xenium_patch/stitch/main.nf b/modules/local/xenium_patch/stitch/main.nf index 118e77a7..db687a9a 100644 --- a/modules/local/xenium_patch/stitch/main.nf +++ b/modules/local/xenium_patch/stitch/main.nf @@ -34,8 +34,18 @@ process XENIUM_PATCH_STITCH { script: def args = task.ext.args ?: '' + """ + python3 ${moduleDir}/templates/stitch_transcripts.py \\ + --patches ${patches} \\ + --output output \\ + ${args} - template 'stitch.py' + # Post-process: ensure all GeoJSON geometries are Polygon and + # reconcile dropped cells in the transcript CSV. 
+    python3 ${moduleDir}/templates/stitch_postprocess.py \\
+        --geojson output/xr-cell-polygons.geojson \\
+        --csv output/xr-transcript-metadata.csv
+    """

     stub:
     """
diff --git a/modules/local/xenium_patch/stitch/templates/stitch_postprocess.py b/modules/local/xenium_patch/stitch/templates/stitch_postprocess.py
new file mode 100644
index 00000000..7144b1ac
--- /dev/null
+++ b/modules/local/xenium_patch/stitch/templates/stitch_postprocess.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+"""
+Post-process stitched per-patch segmentation outputs.
+
+Ensures every GeoJSON feature is a single Polygon: make_valid() and
+sopa.solve_conflicts() can produce MultiPolygon, MultiLineString, or
+GeometryCollection geometries that XeniumRanger rejects. Transcripts
+of cells dropped during cleanup are also reassigned to UNASSIGNED in
+the transcript CSV so the two outputs stay consistent.
+"""
+
+import argparse
+import csv
+import json
+
+import shapely
+from shapely.geometry import mapping, shape
+
+
+def clean_geojson(geojson_path: str) -> set:
+    """
+    Force every feature to a single valid Polygon.
+
+    Returns the set of cell ids whose features were dropped.
+    """
+    with open(geojson_path) as f:
+        data = json.load(f)
+
+    clean = []
+    dropped_cells = set()
+    for feat in data["features"]:
+        geom = shape(feat["geometry"])
+        if not geom.is_valid:
+            geom = shapely.make_valid(geom)
+        poly = None
+        if geom.geom_type == "Polygon":
+            poly = geom
+        elif geom.geom_type == "MultiPolygon":
+            poly = max(geom.geoms, key=lambda g: g.area)
+        elif geom.geom_type == "GeometryCollection":
+            polys = [g for g in geom.geoms if g.geom_type == "Polygon"]
+            if polys:
+                poly = max(polys, key=lambda g: g.area)
+        if poly is not None and not poly.is_empty:
+            feat["geometry"] = mapping(poly)
+            clean.append(feat)
+        else:
+            cell_id = feat.get("id") or feat.get("properties", {}).get("cell_id", "")
+            dropped_cells.add(str(cell_id))
+
+    print(f"GeoJSON: {len(clean)} kept, {len(dropped_cells)} dropped: {dropped_cells}")
+    data["features"] = clean
+    with open(geojson_path, "w") as f:
+        json.dump(data, f)
+
+    return dropped_cells
+
+
+def reassign_dropped(csv_path: str, dropped_cells: set) -> None:
+    """
+    Reassign transcripts of dropped cells to UNASSIGNED in the CSV.
+    """
+    if not dropped_cells:
+        return
+
+    with open(csv_path) as f:
+        reader = csv.DictReader(f)
+        fieldnames = reader.fieldnames
+        rows = list(reader)
+
+    reassigned = 0
+    for row in rows:
+        if row["cell"] in dropped_cells:
+            row["cell"] = ""
+            row["is_noise"] = "1"
+            reassigned += 1
+
+    with open(csv_path, "w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(rows)
+    print(f"CSV: {reassigned} transcripts reassigned to UNASSIGNED")
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Clean stitched GeoJSON polygons and reconcile transcript CSV."
+ ) + parser.add_argument("--geojson", required=True, help="Path to xr-cell-polygons.geojson") + parser.add_argument("--csv", required=True, help="Path to xr-transcript-metadata.csv") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + dropped = clean_geojson(args.geojson) + reassign_dropped(args.csv, dropped) diff --git a/modules/local/xenium_patch/stitch/templates/stitch.py b/modules/local/xenium_patch/stitch/templates/stitch_transcripts.py similarity index 85% rename from modules/local/xenium_patch/stitch/templates/stitch.py rename to modules/local/xenium_patch/stitch/templates/stitch_transcripts.py index 2306a7dc..d9fb8d41 100644 --- a/modules/local/xenium_patch/stitch/templates/stitch.py +++ b/modules/local/xenium_patch/stitch/templates/stitch_transcripts.py @@ -1,25 +1,15 @@ #!/usr/bin/env python3 -"""Stitch per-patch segmentation results, then post-process the GeoJSON + CSV. - -Phase 1 (stitch_transcripts): - Stitch per-patch Baysor segmentation results into unified output. Uses - sopa's solve_conflicts() for overlap resolution at patch boundaries. - -Phase 2 (postprocess): - Ensures every GeoJSON feature is a single Polygon: make_valid() and - sopa.solve_conflicts() can produce MultiPolygon, MultiLineString, or - GeometryCollection geometries that XeniumRanger rejects. Cells dropped - during cleanup are also reassigned to UNASSIGNED in the transcript CSV - so the two outputs stay consistent. +"""Stitch per-patch Baysor segmentation results into unified output. + +Standalone script that replaces the xenium_patch CLI package's stitch +functionality. Uses sopa's solve_conflicts() for overlap resolution. """ from __future__ import annotations import argparse -import csv import json import os -import shlex from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path @@ -34,11 +24,6 @@ from shapely.geometry import mapping, shape from sopa.segmentation.resolve import solve_conflicts -# Nextflow-injected variables -PATCHES = "${patches}" -ARGS = "${args}" # task.ext.args, may be empty - - # --------------------------------------------------------------------------- # Geometry helpers # --------------------------------------------------------------------------- @@ -59,7 +44,7 @@ def _ensure_polygon(geom) -> "shapely.Polygon | None": if geom.geom_type == "GeometryCollection": polys = [g for g in geom.geoms if g.geom_type == "Polygon"] return max(polys, key=lambda g: g.area) if polys else None - # LineString, MultiLineString, Point, etc. -- not a polygon + # LineString, MultiLineString, Point, etc. — not a polygon return None @@ -645,7 +630,7 @@ def _stitch_sopa_resolve( # --------------------------------------------------------------------------- -# Main orchestrator (stitch phase) +# Main orchestrator # --------------------------------------------------------------------------- @@ -779,17 +764,11 @@ def stitch_transcript_assignments( # --------------------------------------------------------------------------- -# Phase 1 entry point: stitch_transcripts (replaces stitch_transcripts.py main) +# CLI # --------------------------------------------------------------------------- -def stitch_transcripts(patches_dir: str, output_dir: str, args_str: str) -> None: - """Phase 1: parse extra CLI args and run the stitch orchestrator. - - Mirrors the original ``stitch_transcripts.py`` argparse interface so that - any flags passed through ``task.ext.args`` are honored by the same - parser as before. 
- """ +def main() -> None: parser = argparse.ArgumentParser( description="Stitch per-patch Baysor segmentation results into unified output." ) @@ -815,106 +794,13 @@ def stitch_transcripts(patches_dir: str, output_dir: str, args_str: str) -> None default="segmentation_polygons.json", help="GeoJSON filename within each patch (default: segmentation_polygons.json)", ) - - argv = ["--patches", patches_dir, "--output", output_dir] - if args_str: - argv.extend(shlex.split(args_str)) - parsed = parser.parse_args(argv) + args = parser.parse_args() stitch_transcript_assignments( - patches_dir=parsed.patches, - output_dir=parsed.output, - csv_filename=parsed.csv_filename, - geojson_filename=parsed.geojson_filename, - ) - - -# --------------------------------------------------------------------------- -# Phase 2: post-processing (from stitch_postprocess.py) -# --------------------------------------------------------------------------- - - -def clean_geojson(geojson_path: str) -> set: - """ - Force every feature to a single valid Polygon. - - Returns the set of cell ids whose features were dropped. - """ - with open(geojson_path) as f: - data = json.load(f) - - clean = [] - dropped_cells = set() - for feat in data["features"]: - geom = shape(feat["geometry"]) - if not geom.is_valid: - geom = shapely.make_valid(geom) - poly = None - if geom.geom_type == "Polygon": - poly = geom - elif geom.geom_type == "MultiPolygon": - poly = max(geom.geoms, key=lambda g: g.area) - elif geom.geom_type == "GeometryCollection": - polys = [g for g in geom.geoms if g.geom_type == "Polygon"] - if polys: - poly = max(polys, key=lambda g: g.area) - if poly is not None and not poly.is_empty: - feat["geometry"] = mapping(poly) - clean.append(feat) - else: - cell_id = feat.get("id") or feat.get("properties", {}).get("cell_id", "") - dropped_cells.add(str(cell_id)) - - print(f"GeoJSON: {len(clean)} kept, {len(dropped_cells)} dropped: {dropped_cells}") - data["features"] = clean - with open(geojson_path, "w") as f: - json.dump(data, f) - - return dropped_cells - - -def reassign_dropped(csv_path: str, dropped_cells: set) -> None: - """ - Reassign transcripts of dropped cells to UNASSIGNED in the CSV. 
-    """
-    if not dropped_cells:
-        return
-
-    with open(csv_path) as f:
-        reader = csv.DictReader(f)
-        fieldnames = reader.fieldnames
-        rows = list(reader)
-
-    reassigned = 0
-    for row in rows:
-        if row["cell"] in dropped_cells:
-            row["cell"] = ""
-            row["is_noise"] = "1"
-            reassigned += 1
-
-    with open(csv_path, "w", newline="") as f:
-        writer = csv.DictWriter(f, fieldnames=fieldnames)
-        writer.writeheader()
-        writer.writerows(rows)
-    print(f"CSV: {reassigned} transcripts reassigned to UNASSIGNED")
-
-
-def postprocess(geojson_path: str, csv_path: str) -> None:
-    """Phase 2 entry point: cleanup polygons and reconcile the CSV."""
-    dropped = clean_geojson(geojson_path)
-    reassign_dropped(csv_path, dropped)
-
-
-# ---------------------------------------------------------------------------
-# Main: run both phases sequentially
-# ---------------------------------------------------------------------------
-
-
-def main() -> None:
-    stitch_transcripts(PATCHES, "output", ARGS)
-    postprocess(
-        "output/xr-cell-polygons.geojson",
-        "output/xr-transcript-metadata.csv",
+        patches_dir=args.patches,
+        output_dir=args.output,
+        csv_filename=args.csv_filename,
+        geojson_filename=args.geojson_filename,
     )

From 4edccc3650eb3839a8f85f62702c8d903d734a1f Mon Sep 17 00:00:00 2001
From: an-altosian
Date: Thu, 7 May 2026 15:17:07 +0000
Subject: [PATCH 3/3] refactor(modules): move per-module Python scripts to
 pipeline-level bin/
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the templates/-directive approach (which broke under the
Nextflow constraint that `output: eval(...)` channels are bash-only,
and therefore incompatible with `template 'foo.py'` selecting a Python
interpreter via its shebang). Pipeline-level bin/ is the cleanest path:
Nextflow auto-prepends the pipeline-level bin/ to PATH for every
process, scripts resolve by name, and process scripts stay bash so
eval() topic channels keep working.

Per-module changes (19 modules, 20 scripts):
- bin/_
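
For illustration, a minimal sketch of the bin/-on-PATH pattern the message
above describes, modeled on the SPLIT_TRANSCRIPTS module from the diffs
earlier in this series. The channel names and the eval() command are
invented for the example, and the eval()/topic output syntax assumes
Nextflow 24.04 or later; only split_transcripts.py and its flags come from
this patch set:

    process SPLIT_TRANSCRIPTS {
        input:
        tuple val(meta), path(transcripts)
        val x_bins
        val y_bins

        output:
        tuple val(meta), path("${meta.id}"), emit: tiles
        eval("python3 --version"), topic: versions

        script:
        // Nextflow prepends the pipeline-level bin/ to PATH for every task,
        // so the executable script resolves by name and the script block
        // stays plain bash -- the precondition for the eval() output above.
        """
        split_transcripts.py \\
            --transcripts ${transcripts} \\
            --x-bins ${x_bins} \\
            --y-bins ${y_bins} \\
            --prefix ${meta.id}
        """
    }

Because nothing in the script block names an interpreter, the bash-only
eval() command and the Python call coexist in the same task, which is the
combination the template-with-shebang approach could not express.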