|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +import pandas as pd |
| 4 | + |
| 5 | + |
def filter_transcripts(
    transcripts: str,
    min_qv: float = 20.0,
    min_x: float = 0.0,
    max_x: float = 24000.0,
    min_y: float = 0.0,
    max_y: float = 24000.0,
) -> None:
    """
    Filter transcripts based on the specified thresholds.

    Reads the transcripts parquet, keeps only the rows that pass every
    filter below, rewrites the cell_id of cell-free transcripts from -1 to 0,
    and writes the result (without the index) to a parquet file in the
    current working directory named
    X<min_x>-<max_x>_Y<min_y>-<max_y>_filtered_transcripts.parquet

    Rows whose feature_name starts with NegControlProbe_, antisense_,
    NegControlCodeword_ or BLANK_ are always dropped.

    Args:
        transcripts - path to transcripts parquet
        ----------------------------------- filters --------------------------------------------
        min_qv - minimum Q-Score to pass filtering, inclusive (default: 20.0)
        min_x - only keep transcripts whose x-coordinate is greater than or equal to
                the specified limit (default: 0.0)
        max_x - only keep transcripts whose x-coordinate is less than or equal to
                the specified limit; the default value will retain all transcripts
                since Xenium slide is <24000 microns in x and y (default: 24000.0)
        min_y - only keep transcripts whose y-coordinate is greater than or equal to
                the specified limit (default: 0.0)
        max_y - only keep transcripts whose y-coordinate is less than or equal to
                the specified limit; the default value will retain all transcripts
                since Xenium slide is <24000 microns in x and y (default: 24000.0)

    Returns:
        None - the filtered table is written to disk, not returned
    """
    df = pd.read_parquet(transcripts, engine="pyarrow")

    # Feature-name prefixes of the negative-control / blank entries to ignore.
    control_prefixes = (
        "NegControlProbe_",
        "antisense_",
        "NegControlCodeword_",
        "BLANK_",
    )

    # Build one boolean mask; every bound is inclusive on both ends.
    keep = df["qv"] >= min_qv
    keep &= df["x_location"].between(min_x, max_x)
    keep &= df["y_location"].between(min_y, max_y)
    for prefix in control_prefixes:
        keep &= ~df["feature_name"].str.startswith(prefix)

    # Take an explicit copy so the cell_id assignment below modifies an
    # independent frame instead of a slice of df (avoids pandas'
    # SettingWithCopy warning and any view-vs-copy ambiguity).
    filtered_df = df[keep].copy()

    # change cell_id of cell-free transcripts from -1 to 0
    filtered_df.loc[filtered_df["cell_id"] == -1, "cell_id"] = 0

    # Output filtered transcripts to parquet; the name encodes the x/y window.
    out_name = f"X{min_x}-{max_x}_Y{min_y}-{max_y}_filtered_transcripts.parquet"
    filtered_df.to_parquet(out_name, index=False)
| 56 | + |
| 57 | + |
def generate_version_yml() -> None:
    """
    Write versions.yml in the current working directory.

    The file holds one mapping: the process name (the quoted task.process
    placeholder, presumably substituted by the workflow's template rendering
    before this script runs -- TODO confirm) whose value is a nested mapping
    of tool name to version.
    """
    with open("versions.yml", "w") as yml:
        yml.write('"${task.process}":\\n')
        # The tool entry must be indented to nest under the process key, and
        # the version must not carry a stray trailing quote character.
        yml.write("    Baysor-Preprocess Transcripts: 1.0.0\\n")
| 64 | + |
| 65 | + |
if __name__ == "__main__":
    # Run the filter with its default thresholds on the transcripts path
    # given by the quoted placeholder (presumably filled in by the workflow's
    # template rendering before this script runs -- TODO confirm).
    filter_transcripts(transcripts="${transcripts}")

    # Record the tool version alongside the filtered output.
    generate_version_yml()
0 commit comments