Skip to content

Commit

Permalink
Remove duplicated row names from the reference before marker detection.
Browse files Browse the repository at this point in the history
Duplicated names cause trouble because they confuse our name-based handling of features.
  • Loading branch information
LTLA committed Dec 13, 2024
1 parent c1bd77b commit 6f5fd77
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 14 deletions.
25 changes: 14 additions & 11 deletions src/singler/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,17 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads):


def _restrict_features(data, features, restrict_to):
if restrict_to is not None:
if not isinstance(restrict_to, set):
restrict_to = set(restrict_to)
keep = []
new_features = []
for i, x in enumerate(features):
if x in restrict_to:
keep.append(i)
new_features.append(x)
return delayedarray.DelayedArray(data)[keep, :], new_features
return data, features
if restrict_to is None:
return data, features

if not isinstance(restrict_to, set):
restrict_to = set(restrict_to)
keep = []
for i, x in enumerate(features):
if x in restrict_to:
keep.append(i)

return (
delayedarray.DelayedArray(data)[keep, :],
biocutils.subset_sequence(features, keep)
)
22 changes: 19 additions & 3 deletions src/singler/train_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,9 @@ def train_single(
marker_method: Literal["classic"] = "classic",
marker_args: dict = {},
nn_parameters: Optional[knncolle.Parameters] = knncolle.AnnoyParameters(),
check_duplicated: bool = True,
num_threads: int = 1,
) -> TrainedSingleReference:
) -> TrainedSingleReference:
"""Build a single reference dataset in preparation for classification.
Args:
Expand Down Expand Up @@ -163,6 +164,10 @@ def train_single(
Algorithm for constructing the neighbor search index, used to
compute scores during classification.
check_duplicated:
Whether to remove rows with duplicate feature names. This can be
set to False if ``ref_features`` does not contain any duplicates.
num_threads:
Number of threads to use for reference building.
Expand All @@ -179,8 +184,19 @@ def train_single(
num_threads=num_threads,
)

if check_duplicated:
encountered = set()
keep = []
for i, rg in enumerate(ref_features):
if rg not in encountered:
encountered.add(rg)
keep.append(i)
if len(keep) != len(ref_features):
ref_features = biocutils.subset_sequence(ref_features, keep)
ref_data = delayedarray.DelayedArray(ref_data)[keep,:]

unique_labels, label_idx = _factorize(ref_labels)
markers = identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads)
markers = _identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads)
markers_idx = [None] * len(unique_labels)
for outer_i, outer_k in enumerate(unique_labels):
inner_instance = [None] * len(unique_labels)
Expand Down Expand Up @@ -217,7 +233,7 @@ def train_single(
)


def identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads):
def _identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads):
ref_data, ref_features = _restrict_features(ref_data, ref_features, test_features)
ref_data, ref_features = _restrict_features(ref_data, ref_features, restrict_to)

Expand Down
10 changes: 10 additions & 0 deletions tests/test_train_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@ def test_train_single_markers():
assert built.markers == mbuilt.markers


def test_train_single_dedup():
    """Check that rows with duplicated feature names are dropped in training."""
    ref = numpy.random.rand(10000, 10)
    labels = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"]
    features = [str(i) for i in range(ref.shape[0])]
    features[0] = "1"  # rows 0 and 1 now share the name "1"
    built = singler.train_single(ref, labels, features)
    # With one duplicated name removed, the remaining names read "1", "2", ...
    # which is the same sequence as features[1:].
    assert built.features == features[1:]  # duplicates are ignored
    assert built._full_data.shape[0] == len(built.features)


def test_train_single_restricted():
ref = numpy.random.rand(10000, 10)
labels = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"]
Expand Down

0 comments on commit 6f5fd77

Please sign in to comment.