Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions bin/baysor_create_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env python3
"""
Create a sampled dataset for Baysor preview mode.

Reads a CSV transcript file and randomly samples a fraction of rows,
writing the result to a new CSV file.
"""

import argparse
import csv
import os
import random
from pathlib import Path


class BaysorPreview():
    """
    Utility class to generate a Baysor preview dataset.
    """
    @staticmethod
    def generate_dataset(
        transcripts: Path,
        sampled_transcripts: Path,
        sample_fraction: float = 0.3,
        random_state: int = 42,
        prefix: str = ""
    ) -> None:
        """
        Read a CSV file, randomly sample a fraction of its data rows,
        and write the header plus the sampled rows to a new CSV file.

        Args:
            transcripts: unzipped transcripts.csv from a Xenium bundle
            sampled_transcripts: output file name for the sampled CSV
            sample_fraction: fraction of data rows to keep (0.0-1.0)
            random_state: seed for reproducibility
            prefix: output directory prefix (may be empty for the CWD)
        """
        random.seed(random_state)

        # os.path.join handles an empty prefix correctly, unlike the
        # previous f"{prefix}/{...}" form, which produced the absolute
        # path "/sampled_transcripts.csv" when prefix was "".
        output_path = os.path.join(prefix, sampled_transcripts)
        out_dir = os.path.dirname(output_path)
        if out_dir:
            # os.makedirs("") raises FileNotFoundError, so only create
            # the directory when there actually is one.
            os.makedirs(out_dir, exist_ok=True)

        with open(transcripts, mode='rt', newline='') as infile, \
                open(output_path, mode='wt', newline='') as outfile:

            reader = csv.reader(infile)
            writer = csv.writer(outfile)

            # Always keep the header line.
            writer.writerow(next(reader))

            # Keep each data row independently with probability
            # sample_fraction (converted once, not per row).
            frac = float(sample_fraction)
            for row in reader:
                if random.random() < frac:
                    writer.writerow(row)


def main() -> None:
    """
    Command-line entry point for the Nextflow module: parse CLI options
    and delegate to BaysorPreview.generate_dataset.
    """
    # Flag specifications kept in one table so the parser setup stays flat.
    arg_specs = (
        ("--transcripts",
         {"required": True, "help": "Path to transcripts CSV file"}),
        ("--sample-fraction",
         {"required": True, "type": float, "help": "Fraction of rows to sample"}),
        ("--prefix",
         {"required": True, "help": "Output directory prefix"}),
    )
    parser = argparse.ArgumentParser(
        description="Create sampled dataset for Baysor preview"
    )
    for flag, options in arg_specs:
        parser.add_argument(flag, **options)
    cli = parser.parse_args()

    # Fixed output file name; the directory comes from --prefix.
    BaysorPreview.generate_dataset(
        transcripts=cli.transcripts,
        sampled_transcripts="sampled_transcripts.csv",
        sample_fraction=cli.sample_fraction,
        prefix=cli.prefix,
    )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
126 changes: 126 additions & 0 deletions bin/baysor_preprocess_transcripts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""
Preprocess Xenium transcripts for Baysor segmentation.

Filters transcripts based on quality score and spatial coordinate thresholds,
removes negative control probes, and outputs filtered CSV for Baysor compatibility.
"""

import argparse
import os

import pandas as pd


def filter_transcripts(
    transcripts: str,
    min_qv: float = 20.0,
    min_x: float = 0.0,
    max_x: float = 24000.0,
    min_y: float = 0.0,
    max_y: float = 24000.0,
    prefix: str = "",
) -> None:
    """
    Filter transcripts based on the specified thresholds.

    Args:
        transcripts: Path to transcripts parquet file
        min_qv: Minimum Q-Score to pass filtering
        min_x: Minimum x-coordinate threshold
        max_x: Maximum x-coordinate threshold
        min_y: Minimum y-coordinate threshold
        max_y: Maximum y-coordinate threshold
        prefix: Output directory prefix (may be empty for the CWD)
    """
    df = pd.read_parquet(transcripts, engine="pyarrow")

    # feature_name prefixes that identify negative controls / blanks;
    # Series.str.startswith accepts a tuple, so one call covers all of them.
    control_prefixes = (
        "NegControlProbe_",
        "antisense_",
        "NegControlCodeword_",
        "BLANK_",
    )

    # Filter on quality and spatial thresholds (between() is inclusive on
    # both ends, matching the original >= / <= pair) and drop controls.
    # .copy() detaches the result from df so the cell_id writes below
    # modify an independent frame instead of a view of df (avoids
    # SettingWithCopyWarning / chained-assignment pitfalls).
    filtered_df = df[
        (df["qv"] >= min_qv)
        & df["x_location"].between(min_x, max_x)
        & df["y_location"].between(min_y, max_y)
        & ~df["feature_name"].str.startswith(control_prefixes)
    ].copy()

    # change cell_id of cell-free transcripts to "0" (Baysor's no-cell sentinel).
    # Modern Xenium stores cell_id as a string ("UNASSIGNED" for cell-free transcripts);
    # legacy Xenium used integer -1. Normalize to string and handle both cases — pandas 3
    # rejects mixing int values into a string-dtype column.
    filtered_df["cell_id"] = filtered_df["cell_id"].astype(str)
    neg_cell_row = filtered_df["cell_id"].isin(["-1", "UNASSIGNED"])
    filtered_df.loc[neg_cell_row, "cell_id"] = "0"

    # Output filtered transcripts as CSV for Baysor 0.7.1 compatibility.
    # Baysor's Julia Parquet.jl cannot read modern pyarrow Parquet files
    # (pyarrow 15+ writes size_statistics Thrift field 16 unconditionally,
    # which Baysor's old Thrift deserializer doesn't recognize).
    # os.makedirs("") raises, and f"{prefix}/..." would yield an absolute
    # path for an empty prefix — guard and join instead.
    if prefix:
        os.makedirs(prefix, exist_ok=True)
    filtered_df.to_csv(
        os.path.join(prefix, "filtered_transcripts.csv"), index=False
    )


def main() -> None:
    """
    Run preprocess transcripts as nf module: parse the CLI options and
    delegate to filter_transcripts.
    """
    parser = argparse.ArgumentParser(
        description="Preprocess Xenium transcripts for Baysor"
    )
    parser.add_argument(
        "--transcripts", required=True, help="Path to transcripts parquet file"
    )
    parser.add_argument("--prefix", required=True, help="Output directory prefix")

    # The five numeric thresholds share the same shape, so register them
    # from a table instead of five near-identical add_argument calls.
    threshold_flags = (
        ("--min-qv", 20.0, "Minimum Q-Score threshold"),
        ("--min-x", 0.0, "Minimum x-coordinate threshold"),
        ("--max-x", 24000.0, "Maximum x-coordinate threshold"),
        ("--min-y", 0.0, "Minimum y-coordinate threshold"),
        ("--max-y", 24000.0, "Maximum y-coordinate threshold"),
    )
    for flag, default, text in threshold_flags:
        parser.add_argument(
            flag,
            type=float,
            default=default,
            help=f"{text} (default: {default})",
        )
    opts = parser.parse_args()

    filter_transcripts(
        transcripts=opts.transcripts,
        min_qv=opts.min_qv,
        min_x=opts.min_x,
        max_x=opts.max_x,
        min_y=opts.min_y,
        max_y=opts.max_y,
        prefix=opts.prefix,
    )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
101 changes: 101 additions & 0 deletions bin/ficture_preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python3
"""Preprocess Xenium transcripts for FICTURE analysis."""

import argparse
import gzip
import logging
import os
import re
import sys

import pandas as pd


def parse_args():
    """Parse command-line arguments for the FICTURE preprocessing step."""
    # One (flag, kwargs) entry per option keeps the parser setup flat.
    option_specs = (
        ("--transcripts",
         {"required": True, "help": "Path to transcripts file (CSV)"}),
        ("--features",
         {"default": "", "help": "Path to features file (optional)"}),
        ("--negative-control-regex",
         {"default": "", "help": "Regex for negative control probes"}),
    )
    cli = argparse.ArgumentParser(
        description="Preprocess Xenium transcripts for FICTURE"
    )
    for flag, kwargs in option_specs:
        cli.add_argument(flag, **kwargs)
    return cli.parse_args()


def main():
    """Run FICTURE preprocessing.

    Streams the transcripts CSV in 500k-row chunks, drops low-quality and
    negative-control rows, renames columns to the header FICTURE expects
    (X, Y, gene, cell_id, overlaps_nucleus, Count), and appends the result
    to a gzipped TSV. Also accumulates per-gene counts and the spatial
    bounding box, writing coordinate_minmax.tsv and — only when --features
    is supplied — feature.clean.tsv.gz.
    """
    args = parse_args()
    print("[START]")

    # Default negative-control pattern; overridden by --negative-control-regex.
    negctrl_regex = "BLANK|NegCon"
    if args.negative_control_regex:
        negctrl_regex = args.negative_control_regex

    # Output column order expected downstream; "Count" is a constant 1 per row.
    unit_info = ["X", "Y", "gene", "cell_id", "overlaps_nucleus"]
    oheader = unit_info + ["Count"]

    # Running per-gene count table and integer bounding-box accumulators.
    feature = pd.DataFrame()
    xmin = sys.maxsize
    xmax = 0
    ymin = sys.maxsize
    ymax = 0

    output = "processed_transcripts.tsv.gz"
    feature_file = "feature.clean.tsv.gz"
    # Minimum qv (Phred-scaled quality) a transcript must exceed to be kept.
    min_phred_score = 15

    # Write the header once; chunk bodies are appended below with mode="a".
    with gzip.open(output, "wt") as wf:
        wf.write("\t".join(oheader) + "\n")

    for chunk in pd.read_csv(args.transcripts, header=0, chunksize=500000):
        # Keep only confident calls (strictly greater than the threshold).
        chunk = chunk.loc[(chunk.qv > min_phred_score)]
        # NOTE(review): rename(inplace=True) on a filtered slice may emit
        # SettingWithCopyWarning on some pandas versions — verify.
        chunk.rename(columns={"feature_name": "gene"}, inplace=True)
        if negctrl_regex != "":
            # Drop negative-control probes via case-insensitive regex match.
            chunk = chunk[
                ~chunk.gene.str.contains(negctrl_regex, flags=re.IGNORECASE, regex=True)
            ]
        chunk.rename(columns={"x_location": "X", "y_location": "Y"}, inplace=True)
        chunk["Count"] = 1
        # Append this chunk (no header) to the gzipped TSV started above;
        # pandas infers gzip compression from the .gz suffix.
        chunk[oheader].to_csv(
            output, sep="\t", mode="a", index=False, header=False, float_format="%.2f"
        )
        logging.info(f"{chunk.shape[0]}")
        # Accumulate this chunk's per-gene counts; the cross-chunk
        # re-aggregation only happens in the --features branch below.
        feature = pd.concat(
            [feature, chunk.groupby(by="gene").agg({"Count": "sum"}).reset_index()]
        )
        # Expand the integer bounding box with this chunk's extremes.
        # NOTE(review): if a chunk is empty after filtering, min()/max()
        # return NaN and int() would raise — confirm inputs always survive
        # the filters.
        x0 = chunk.X.min()
        x1 = chunk.X.max()
        y0 = chunk.Y.min()
        y1 = chunk.Y.max()
        xmin = min(int(xmin), int(x0))
        xmax = max(int(xmax), int(x1))
        ymin = min(int(ymin), int(y0))
        ymax = max(int(ymax), int(y1))

    # Optional gene whitelist: aggregate the accumulated counts and keep only
    # listed genes. With the default --features "" (os.path.exists("") is
    # False) this branch is skipped and feature.clean.tsv.gz is not written.
    if os.path.exists(args.features):
        feature_list = []
        with open(args.features, "r") as ff:
            for line in ff:
                feature_list.append(line.strip("\n"))
        feature = feature.groupby(by="gene").agg({"Count": "sum"}).reset_index()
        feature = feature[[x in feature_list for x in feature["gene"]]]
        feature.to_csv(feature_file, sep="\t", index=False)

    # Bounding box for downstream FICTURE steps, written next to the output
    # (dirname(output) is "" here, i.e. the current working directory).
    f = os.path.join(os.path.dirname(output), "coordinate_minmax.tsv")
    with open(f, "w") as wf:
        wf.write(f"xmin\t{xmin}\n")
        wf.write(f"xmax\t{xmax}\n")
        wf.write(f"ymin\t{ymin}\n")
        wf.write(f"ymax\t{ymax}\n")

    print("[FINISH]")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Loading
Loading