diff --git a/src/singler/_utils.py b/src/singler/_utils.py index dff601a..b9b061d 100644 --- a/src/singler/_utils.py +++ b/src/singler/_utils.py @@ -101,14 +101,17 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads): def _restrict_features(data, features, restrict_to): - if restrict_to is not None: - if not isinstance(restrict_to, set): - restrict_to = set(restrict_to) - keep = [] - new_features = [] - for i, x in enumerate(features): - if x in restrict_to: - keep.append(i) - new_features.append(x) - return delayedarray.DelayedArray(data)[keep, :], new_features - return data, features + if restrict_to is None: + return data, features + + if not isinstance(restrict_to, set): + restrict_to = set(restrict_to) + keep = [] + for i, x in enumerate(features): + if x in restrict_to: + keep.append(i) + + return ( + delayedarray.DelayedArray(data)[keep, :], + biocutils.subset_sequence(features, keep) + ) diff --git a/src/singler/train_single.py b/src/singler/train_single.py index 20c9de7..aa84f80 100644 --- a/src/singler/train_single.py +++ b/src/singler/train_single.py @@ -102,8 +102,9 @@ def train_single( marker_method: Literal["classic"] = "classic", marker_args: dict = {}, nn_parameters: Optional[knncolle.Parameters] = knncolle.AnnoyParameters(), + check_duplicated: bool = True, num_threads: int = 1, - ) -> TrainedSingleReference: +) -> TrainedSingleReference: """Build a single reference dataset in preparation for classification. Args: @@ -163,6 +164,10 @@ def train_single( Algorithm for constructing the neighbor search index, used to compute scores during classification. + check_duplicated: + Whether to remove rows with duplicate feature names. This can be + set to False if ``ref_features`` does not contain any duplicates. + num_threads: Number of threads to use for reference building. @@ -179,8 +184,19 @@ def train_single( num_threads=num_threads, ) + if check_duplicated: + encountered = set() + keep = [] + for i, rg in enumerate(ref_features): + if rg not in encountered: + encountered.add(rg) + keep.append(i) + if len(keep) != len(ref_features): + ref_features = biocutils.subset_sequence(ref_features, keep) + ref_data = delayedarray.DelayedArray(ref_data)[keep,:] + unique_labels, label_idx = _factorize(ref_labels) - markers = identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads) + markers = _identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads) markers_idx = [None] * len(unique_labels) for outer_i, outer_k in enumerate(unique_labels): inner_instance = [None] * len(unique_labels) @@ -217,7 +233,7 @@ def train_single( ) -def identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads): +def _identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, marker_args, num_threads): ref_data, ref_features = _restrict_features(ref_data, ref_features, test_features) ref_data, ref_features = _restrict_features(ref_data, ref_features, restrict_to) diff --git a/tests/test_train_single.py b/tests/test_train_single.py index bbb12b0..6cd6351 100644 --- a/tests/test_train_single.py +++ b/tests/test_train_single.py @@ -38,6 +38,16 @@ def test_train_single_markers(): assert built.markers == mbuilt.markers +def test_train_single_dedup(): + ref = numpy.random.rand(10000, 10) + labels = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"] + features = [str(i) for i in range(ref.shape[0])] + features[0] = "1" + built = singler.train_single(ref, labels, features) + assert built.features == features[1:] # duplicates are ignored + assert built._full_data.shape[0] == len(built.features) + + def test_train_single_restricted(): ref = numpy.random.rand(10000, 10) labels = ["A", "B", "C", "D", "E", "E", "D", "C", "B", "A"]