Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Handling of Noise Points in Clustering Algorithms (Fixes #152) #200

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 80 additions & 5 deletions src/tdamapper/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"""

import logging
import numpy as np
import networkx as nx
from joblib import Parallel, delayed

Expand Down Expand Up @@ -122,6 +123,11 @@ def mapper_connected_components(X, y, cover, clustering, n_jobs=1):
label to each point in the dataset, based on the connected component of
the Mapper graph that it belongs to.

Points that are classified as noise (label -1) by the clustering algorithm
will retain their noise label unless modified by a noise handling wrapper
(see :class:`tdamapper.core.NoiseHandlingClustering`). This allows for
flexible handling of noise points in different applications.

This function uses a union-find data structure to efficiently keep track of
the connected components as it scans the points of the dataset. This
approach should be faster than computing the Mapper graph by first calling
Expand All @@ -135,15 +141,17 @@ def mapper_connected_components(X, y, cover, clustering, n_jobs=1):
:param cover: A cover algorithm.
:type cover: A class compatible with :class:`tdamapper.core.Cover`
:param clustering: The clustering algorithm to apply to each subset of the
dataset.
dataset. Can be wrapped with :class:`tdamapper.core.NoiseHandlingClustering`
to control how noise points are handled.
:type clustering: An estimator compatible with scikit-learn's clustering
interface, typically from :mod:`sklearn.cluster`.
:param n_jobs: The maximum number of parallel clustering jobs. This
parameter is passed to the constructor of :class:`joblib.Parallel`.
Defaults to 1.
:type n_jobs: int
:return: A list of labels. The label at position i identifies the connected
component of the point at position i in the dataset.
component of the point at position i in the dataset. Points labeled as
-1 are considered noise points.
:rtype: list[int]
"""
itm_lbls = mapper_labels(X, y, cover, clustering, n_jobs=n_jobs)
Expand All @@ -157,9 +165,10 @@ def mapper_connected_components(X, y, cover, clustering, n_jobs=1):
uf.union(first, second)
labels = [-1 for _ in X]
for i, lbls in enumerate(itm_lbls):
# assign -1 to noise points
root = uf.find(lbls[0]) if lbls else -1
labels[i] = root
if lbls: # if the point belongs to any cluster
root = uf.find(lbls[0])
labels[i] = root
# else: keep as -1 (noise point)
return labels


Expand Down Expand Up @@ -433,6 +442,72 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)


class NoiseHandlingClustering(ParamsMixin):
    """
    A clustering algorithm wrapper that provides control over noise point handling.

    This class wraps a clustering algorithm and provides options for handling noise
    points (points labeled as -1). By default, noise points are preserved as
    singleton clusters, but they can also be dropped or grouped into a single noise
    cluster.

    Performance implications of each noise handling mode:
    - 'singleton': Creates individual clusters for noise points, which may increase
      memory usage and processing time when there are many noise points.
    - 'drop': Most memory efficient as noise points are simply ignored, but loses
      information about noise points.
    - 'group': Balances memory usage and information preservation by grouping all
      noise points into a single cluster.

    :param clustering: A clustering algorithm to delegate to. When None, a
        :class:`tdamapper.core.TrivialClustering` is used.
    :type clustering: An estimator compatible with scikit-learn's clustering
        interface, typically from :mod:`sklearn.cluster`.
    :param noise_handling: How to handle noise points. Options are:
        - 'singleton': Each noise point becomes its own cluster (default)
        - 'drop': Noise points are kept as -1 and will be dropped
        - 'group': All noise points are grouped into a single cluster
    :type noise_handling: str, optional
    :raises ValueError: If `noise_handling` is not one of the supported modes.
    """

    def __init__(self, clustering=None, noise_handling='singleton'):
        self.clustering = clustering
        if noise_handling not in ('singleton', 'drop', 'group'):
            raise ValueError(
                "noise_handling must be one of 'singleton', 'drop', or 'group', "
                f"got {noise_handling!r} instead"
            )
        self.noise_handling = noise_handling

    def fit(self, X, y=None):
        """
        Fit the wrapped clustering algorithm and post-process noise labels.

        :param X: The dataset to cluster.
        :param y: Ignored lens values, accepted for interface compatibility.
        :return: self, with ``labels_`` set according to ``noise_handling``.
        """
        # Initialize and fit the base clustering algorithm.
        clustering = TrivialClustering() if self.clustering is None else clone(self.clustering)
        clustering.fit(X, y)
        # Writable numpy copy so we never mutate the estimator's own labels_.
        labels = np.array(clustering.labels_)

        noise_mask = labels == -1
        non_noise = labels[~noise_mask]
        # Highest real cluster label; -1 when every point is noise, so that
        # newly assigned labels start from 0.
        max_label = int(non_noise.max()) if non_noise.size else -1

        if self.noise_handling == 'drop':
            # Keep noise points labeled -1; downstream code drops them.
            self.labels_ = labels
        elif self.noise_handling == 'group':
            # All noise points share one new cluster label.
            labels[noise_mask] = max_label + 1
            self.labels_ = labels
        else:  # 'singleton' (default)
            # Vectorized: each noise point (in index order) gets its own
            # consecutive new label, matching a per-index assignment loop.
            n_noise = int(noise_mask.sum())
            labels[noise_mask] = np.arange(max_label + 1, max_label + 1 + n_noise)
            self.labels_ = labels

        return self

class FailSafeClustering(ParamsMixin):
"""
A delegating clustering algorithm that prevents failure.
Expand Down
97 changes: 97 additions & 0 deletions tests/test_unit_noise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""Tests for noise point handling in clustering."""

import numpy as np
from sklearn.cluster import DBSCAN
from tdamapper.core import (
NoiseHandlingClustering,
TrivialCover,
mapper_connected_components,
)


def test_noise_handling_clustering():
    """Check all three noise handling modes of NoiseHandlingClustering."""
    # Two tight clusters plus two far-away points that DBSCAN labels as noise.
    X = np.array([
        [0, 0],      # Cluster 1
        [0.1, 0.1],  # Cluster 1
        [5, 5],      # Noise point
        [1, 1],      # Cluster 2
        [1.1, 0.9],  # Cluster 2
        [10, 10],    # Noise point
    ])

    # eps=0.3 keeps the two pairs together and isolates [5, 5] and [10, 10];
    # min_samples=2 makes the two-point clusters valid.
    base_clustering = DBSCAN(eps=0.3, min_samples=2)

    # An unsupported mode must be rejected at construction time.
    try:
        NoiseHandlingClustering(clustering=base_clustering, noise_handling='invalid')
        assert False, "Should raise ValueError for invalid noise_handling"
    except ValueError as e:
        assert "noise_handling must be one of" in str(e)

    # 'drop': noise labels are preserved as -1.
    clustering_drop = NoiseHandlingClustering(
        clustering=base_clustering,
        noise_handling='drop'
    )
    clustering_drop.fit(X)
    assert -1 in clustering_drop.labels_, "Noise points should be kept as -1"

    # 'group': all noise points share a single new cluster label.
    clustering_group = NoiseHandlingClustering(
        clustering=base_clustering,
        noise_handling='group'
    )
    clustering_group.fit(X)
    assert -1 not in clustering_group.labels_, "No points should be marked as noise"
    noise_points = np.where(clustering_group.labels_ == max(clustering_group.labels_))[0]
    assert len(noise_points) == 2, "Should have 2 points in noise cluster"
    assert 2 in noise_points and 5 in noise_points, "Points [5,5] and [10,10] should be noise"

    # 'singleton' (default): each noise point becomes its own cluster.
    clustering_singleton = NoiseHandlingClustering(
        clustering=base_clustering
    )
    clustering_singleton.fit(X)
    assert -1 not in clustering_singleton.labels_, "No points should be marked as noise"
    # Each noise point should have its own unique label.
    noise_labels = clustering_singleton.labels_[[2, 5]]  # labels for [5,5] and [10,10]
    assert len(set(noise_labels)) == 2, "Each noise point should have unique label"
    # Verify exact number of clusters (2 original clusters + 2 singleton noise clusters).
    assert len(set(clustering_singleton.labels_)) == 4, "Should have exactly 4 clusters (2 original + 2 noise)"


def test_mapper_with_noise_handling():
    """Check that noise handling composes with mapper_connected_components."""
    # Two tight clusters plus two far-away noise points.
    X = np.array([
        [0, 0], [0.1, 0.1],  # Cluster 1
        [5, 5],              # Noise point
        [1, 1], [1.1, 0.9],  # Cluster 2
        [10, 10],            # Noise point
    ])

    # min_samples=2 so the two-point clusters are valid clusters; with the
    # DBSCAN default (min_samples=5) every point here would be noise and the
    # cluster comments above would be wrong.
    base_clustering = DBSCAN(eps=0.3, min_samples=2)

    # Without a wrapper, noise points keep their -1 label.
    labels = mapper_connected_components(
        X, X,  # Use X as both data and lens
        TrivialCover(),
        base_clustering
    )
    assert -1 in labels, "Noise points should be kept by default"

    # With 'singleton' handling, every noise point becomes its own component.
    noise_handler = NoiseHandlingClustering(
        clustering=base_clustering,
        noise_handling='singleton'
    )
    labels = mapper_connected_components(
        X, X,
        TrivialCover(),
        noise_handler
    )
    assert -1 not in labels, "No points should be marked as noise"
    assert len(set(labels)) >= 4, "Should have at least 4 components (2 clusters + 2 noise)"