Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions bin/baysor_create_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env python3
"""
Create a sampled dataset for Baysor preview mode.

Reads a CSV transcript file and randomly samples a fraction of rows,
writing the result to a new CSV file.
"""

import argparse
import csv
import os
import random
from pathlib import Path


class BaysorPreview():
    """
    Utility class to generate a Baysor preview dataset.
    """
    @staticmethod
    def generate_dataset(
        transcripts: Path,
        sampled_transcripts: Path,
        sample_fraction: float = 0.3,
        random_state: int = 42,
        prefix: str = ""
    ) -> None:
        """
        Read a CSV file, randomly sample a fraction of its data rows,
        and write the header plus the sampled rows to a new CSV file.

        Args:
            transcripts: unzipped transcripts.csv from a Xenium bundle
            sampled_transcripts: output file name for the sampled CSV
            sample_fraction: fraction of data rows to keep (0.0-1.0)
            random_state: seed for reproducibility
            prefix: output directory prefix (may be empty for the CWD)
        """
        random.seed(random_state)

        # os.path.join handles an empty prefix correctly, unlike the
        # previous f"{prefix}/{...}" form, which produced the absolute
        # path "/sampled_transcripts.csv" when prefix was "".
        output_path = os.path.join(prefix, sampled_transcripts)
        out_dir = os.path.dirname(output_path)
        if out_dir:
            # os.makedirs("") raises FileNotFoundError, so only create
            # the directory when there actually is one.
            os.makedirs(out_dir, exist_ok=True)

        with open(transcripts, mode='rt', newline='') as infile, \
                open(output_path, mode='wt', newline='') as outfile:

            reader = csv.reader(infile)
            writer = csv.writer(outfile)

            # Always keep the header line.
            writer.writerow(next(reader))

            # Keep each data row independently with probability
            # sample_fraction (converted once, not per row).
            frac = float(sample_fraction)
            for row in reader:
                if random.random() < frac:
                    writer.writerow(row)


def main() -> None:
    """
    Command-line entry point for the Nextflow module: parse CLI options
    and delegate to BaysorPreview.generate_dataset.
    """
    # Flag specifications kept in one table so the parser setup stays flat.
    arg_specs = (
        ("--transcripts",
         {"required": True, "help": "Path to transcripts CSV file"}),
        ("--sample-fraction",
         {"required": True, "type": float, "help": "Fraction of rows to sample"}),
        ("--prefix",
         {"required": True, "help": "Output directory prefix"}),
    )
    parser = argparse.ArgumentParser(
        description="Create sampled dataset for Baysor preview"
    )
    for flag, options in arg_specs:
        parser.add_argument(flag, **options)
    cli = parser.parse_args()

    # Fixed output file name; the directory comes from --prefix.
    BaysorPreview.generate_dataset(
        transcripts=cli.transcripts,
        sampled_transcripts="sampled_transcripts.csv",
        sample_fraction=cli.sample_fraction,
        prefix=cli.prefix,
    )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
126 changes: 126 additions & 0 deletions bin/baysor_preprocess_transcripts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""
Preprocess Xenium transcripts for Baysor segmentation.

Filters transcripts based on quality score and spatial coordinate thresholds,
removes negative control probes, and outputs filtered CSV for Baysor compatibility.
"""

import argparse
import os

import pandas as pd


def filter_transcripts(
    transcripts: str,
    min_qv: float = 20.0,
    min_x: float = 0.0,
    max_x: float = 24000.0,
    min_y: float = 0.0,
    max_y: float = 24000.0,
    prefix: str = "",
) -> None:
    """
    Filter transcripts based on the specified thresholds.

    Args:
        transcripts: Path to transcripts parquet file
        min_qv: Minimum Q-Score to pass filtering
        min_x: Minimum x-coordinate threshold
        max_x: Maximum x-coordinate threshold
        min_y: Minimum y-coordinate threshold
        max_y: Maximum y-coordinate threshold
        prefix: Output directory prefix (may be empty for the CWD)
    """
    df = pd.read_parquet(transcripts, engine="pyarrow")

    # feature_name prefixes that identify negative controls / blanks;
    # Series.str.startswith accepts a tuple, so one call covers all of them.
    control_prefixes = (
        "NegControlProbe_",
        "antisense_",
        "NegControlCodeword_",
        "BLANK_",
    )

    # Filter on quality and spatial thresholds (between() is inclusive on
    # both ends, matching the original >= / <= pair) and drop controls.
    # .copy() detaches the result from df so the cell_id writes below
    # modify an independent frame instead of a view of df (avoids
    # SettingWithCopyWarning / chained-assignment pitfalls).
    filtered_df = df[
        (df["qv"] >= min_qv)
        & df["x_location"].between(min_x, max_x)
        & df["y_location"].between(min_y, max_y)
        & ~df["feature_name"].str.startswith(control_prefixes)
    ].copy()

    # change cell_id of cell-free transcripts to "0" (Baysor's no-cell sentinel).
    # Modern Xenium stores cell_id as a string ("UNASSIGNED" for cell-free transcripts);
    # legacy Xenium used integer -1. Normalize to string and handle both cases — pandas 3
    # rejects mixing int values into a string-dtype column.
    filtered_df["cell_id"] = filtered_df["cell_id"].astype(str)
    neg_cell_row = filtered_df["cell_id"].isin(["-1", "UNASSIGNED"])
    filtered_df.loc[neg_cell_row, "cell_id"] = "0"

    # Output filtered transcripts as CSV for Baysor 0.7.1 compatibility.
    # Baysor's Julia Parquet.jl cannot read modern pyarrow Parquet files
    # (pyarrow 15+ writes size_statistics Thrift field 16 unconditionally,
    # which Baysor's old Thrift deserializer doesn't recognize).
    # os.makedirs("") raises, and f"{prefix}/..." would yield an absolute
    # path for an empty prefix — guard and join instead.
    if prefix:
        os.makedirs(prefix, exist_ok=True)
    filtered_df.to_csv(
        os.path.join(prefix, "filtered_transcripts.csv"), index=False
    )


def main() -> None:
    """
    Run preprocess transcripts as nf module: parse the CLI options and
    delegate to filter_transcripts.
    """
    parser = argparse.ArgumentParser(
        description="Preprocess Xenium transcripts for Baysor"
    )
    parser.add_argument(
        "--transcripts", required=True, help="Path to transcripts parquet file"
    )
    parser.add_argument("--prefix", required=True, help="Output directory prefix")

    # The five numeric thresholds share the same shape, so register them
    # from a table instead of five near-identical add_argument calls.
    threshold_flags = (
        ("--min-qv", 20.0, "Minimum Q-Score threshold"),
        ("--min-x", 0.0, "Minimum x-coordinate threshold"),
        ("--max-x", 24000.0, "Maximum x-coordinate threshold"),
        ("--min-y", 0.0, "Minimum y-coordinate threshold"),
        ("--max-y", 24000.0, "Maximum y-coordinate threshold"),
    )
    for flag, default, text in threshold_flags:
        parser.add_argument(
            flag,
            type=float,
            default=default,
            help=f"{text} (default: {default})",
        )
    opts = parser.parse_args()

    filter_transcripts(
        transcripts=opts.transcripts,
        min_qv=opts.min_qv,
        min_x=opts.min_x,
        max_x=opts.max_x,
        min_y=opts.min_y,
        max_y=opts.max_y,
        prefix=opts.prefix,
    )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
101 changes: 101 additions & 0 deletions bin/ficture_preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python3
"""Preprocess Xenium transcripts for FICTURE analysis."""

import argparse
import gzip
import logging
import os
import re
import sys

import pandas as pd


def parse_args():
    """Parse command-line arguments for the FICTURE preprocessing step."""
    # One (flag, kwargs) entry per option keeps the parser setup flat.
    option_specs = (
        ("--transcripts",
         {"required": True, "help": "Path to transcripts file (CSV)"}),
        ("--features",
         {"default": "", "help": "Path to features file (optional)"}),
        ("--negative-control-regex",
         {"default": "", "help": "Regex for negative control probes"}),
    )
    cli = argparse.ArgumentParser(
        description="Preprocess Xenium transcripts for FICTURE"
    )
    for flag, kwargs in option_specs:
        cli.add_argument(flag, **kwargs)
    return cli.parse_args()


def main():
    """Run FICTURE preprocessing.

    Streams the transcripts CSV in 500k-row chunks, drops low-quality and
    negative-control rows, renames columns to the header FICTURE expects
    (X, Y, gene, cell_id, overlaps_nucleus, Count), and appends the result
    to a gzipped TSV. Also accumulates per-gene counts and the spatial
    bounding box, writing coordinate_minmax.tsv and — only when --features
    is supplied — feature.clean.tsv.gz.
    """
    args = parse_args()
    print("[START]")

    # Default negative-control pattern; overridden by --negative-control-regex.
    negctrl_regex = "BLANK|NegCon"
    if args.negative_control_regex:
        negctrl_regex = args.negative_control_regex

    # Output column order expected downstream; "Count" is a constant 1 per row.
    unit_info = ["X", "Y", "gene", "cell_id", "overlaps_nucleus"]
    oheader = unit_info + ["Count"]

    # Running per-gene count table and integer bounding-box accumulators.
    feature = pd.DataFrame()
    xmin = sys.maxsize
    xmax = 0
    ymin = sys.maxsize
    ymax = 0

    output = "processed_transcripts.tsv.gz"
    feature_file = "feature.clean.tsv.gz"
    # Minimum qv (Phred-scaled quality) a transcript must exceed to be kept.
    min_phred_score = 15

    # Write the header once; chunk bodies are appended below with mode="a".
    with gzip.open(output, "wt") as wf:
        wf.write("\t".join(oheader) + "\n")

    for chunk in pd.read_csv(args.transcripts, header=0, chunksize=500000):
        # Keep only confident calls (strictly greater than the threshold).
        chunk = chunk.loc[(chunk.qv > min_phred_score)]
        # NOTE(review): rename(inplace=True) on a filtered slice may emit
        # SettingWithCopyWarning on some pandas versions — verify.
        chunk.rename(columns={"feature_name": "gene"}, inplace=True)
        if negctrl_regex != "":
            # Drop negative-control probes via case-insensitive regex match.
            chunk = chunk[
                ~chunk.gene.str.contains(negctrl_regex, flags=re.IGNORECASE, regex=True)
            ]
        chunk.rename(columns={"x_location": "X", "y_location": "Y"}, inplace=True)
        chunk["Count"] = 1
        # Append this chunk (no header) to the gzipped TSV started above;
        # pandas infers gzip compression from the .gz suffix.
        chunk[oheader].to_csv(
            output, sep="\t", mode="a", index=False, header=False, float_format="%.2f"
        )
        logging.info(f"{chunk.shape[0]}")
        # Accumulate this chunk's per-gene counts; the cross-chunk
        # re-aggregation only happens in the --features branch below.
        feature = pd.concat(
            [feature, chunk.groupby(by="gene").agg({"Count": "sum"}).reset_index()]
        )
        # Expand the integer bounding box with this chunk's extremes.
        # NOTE(review): if a chunk is empty after filtering, min()/max()
        # return NaN and int() would raise — confirm inputs always survive
        # the filters.
        x0 = chunk.X.min()
        x1 = chunk.X.max()
        y0 = chunk.Y.min()
        y1 = chunk.Y.max()
        xmin = min(int(xmin), int(x0))
        xmax = max(int(xmax), int(x1))
        ymin = min(int(ymin), int(y0))
        ymax = max(int(ymax), int(y1))

    # Optional gene whitelist: aggregate the accumulated counts and keep only
    # listed genes. With the default --features "" (os.path.exists("") is
    # False) this branch is skipped and feature.clean.tsv.gz is not written.
    if os.path.exists(args.features):
        feature_list = []
        with open(args.features, "r") as ff:
            for line in ff:
                feature_list.append(line.strip("\n"))
        feature = feature.groupby(by="gene").agg({"Count": "sum"}).reset_index()
        feature = feature[[x in feature_list for x in feature["gene"]]]
        feature.to_csv(feature_file, sep="\t", index=False)

    # Bounding box for downstream FICTURE steps, written next to the output
    # (dirname(output) is "" here, i.e. the current working directory).
    f = os.path.join(os.path.dirname(output), "coordinate_minmax.tsv")
    with open(f, "w") as wf:
        wf.write(f"xmin\t{xmin}\n")
        wf.write(f"xmax\t{xmax}\n")
        wf.write(f"ymin\t{ymin}\n")
        wf.write(f"ymax\t{ymax}\n")

    print("[FINISH]")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Loading
Loading