diff --git a/CLUEstering/CLUEstering.py b/CLUEstering/CLUEstering.py index a9f7d9af..32a618c8 100644 --- a/CLUEstering/CLUEstering.py +++ b/CLUEstering/CLUEstering.py @@ -15,6 +15,7 @@ import pandas as pd from sklearn.datasets import make_blobs from sklearn.preprocessing import StandardScaler +from sklearn.metrics import davies_bouldin_score from os.path import dirname, exists, join path = dirname(__file__) sys.path.insert(1, join(path, 'lib')) @@ -1183,3 +1184,9 @@ def import_clusterer(self, input_folder: str, file_name: str) -> None: points_per_cluster, df_ ) + +if __name__ == "__main__": + c = clusterer(4, 2.5, 4) + c.read_data('../data/toyDetector_1000.csv') + c.run_clue() + print(davies_bouldin_score(c.coords.T, c.cluster_ids)) diff --git a/include/CLUEstering/core/DistanceMetrics.hpp b/include/CLUEstering/core/DistanceMetrics.hpp index e80328e0..fd30e9cf 100644 --- a/include/CLUEstering/core/DistanceMetrics.hpp +++ b/include/CLUEstering/core/DistanceMetrics.hpp @@ -25,7 +25,10 @@ namespace clue { } // namespace concepts template - using Point = std::array; + using Point = std::array; + + template + using WeightedPoint = std::array; /// @brief Euclidean distance metric //// This class implements the Euclidean distance metric in Ndim dimensions. @@ -50,6 +53,18 @@ namespace clue { [&]() { return (lhs[Dim] - rhs[Dim]) * (lhs[Dim] - rhs[Dim]); }); return math::sqrt(distance2); } + /// @brief Compute the Euclidean distance between two points + /// + /// @param lhs First point + /// @param rhs Second point + /// @return Euclidean distance between the two points + /// NOTE: The weight is not used in the computation of the distance + ALPAKA_FN_HOST_ACC constexpr inline auto operator()(const WeightedPoint& lhs, + const WeightedPoint& rhs) const { + const auto distance2 = meta::accumulate( + [&]() { return (lhs[Dim] - rhs[Dim]) * (lhs[Dim] - rhs[Dim]); }); + return math::sqrt(distance2); + } }; /// @brief Weighted Euclidean distance metric @@ -95,6 +110,19 @@ namespace clue { }); return math::sqrt(distance2); } + /// @brief Compute the Weighted Euclidean distance between two points + /// + /// @param lhs First point + /// @param rhs Second point + /// @return Weighted Euclidean distance between the two points + /// NOTE: The weight is not used in the computation of the distance + ALPAKA_FN_HOST_ACC constexpr inline auto operator()(const WeightedPoint& lhs, + const WeightedPoint& rhs) const { + const auto distance2 = meta::accumulate([&]() { + return m_weights[Dim] * (lhs[Dim] - rhs[Dim]) * (lhs[Dim] - rhs[Dim]); + }); + return math::sqrt(distance2); + } }; /// @brief Periodic Euclidean distance metric @@ -140,6 +168,21 @@ namespace clue { }); return math::sqrt(distance2); } + /// @brief Compute the Periodic Euclidean distance between two points + /// + /// @param lhs First point + /// @param rhs Second point + /// @return Periodic Euclidean distance between the two points + /// NOTE: The weight is not used in the computation of the distance + ALPAKA_FN_HOST_ACC constexpr inline auto operator()(const WeightedPoint& lhs, + const WeightedPoint& rhs) const { + const auto distance2 = meta::accumulate([&]() { + const auto diff = math::fabs(lhs[Dim] - rhs[Dim]); + const auto periodic_diff = math::min(diff, m_periods[Dim] - diff); + return periodic_diff * periodic_diff; + }); + return math::sqrt(distance2); + } }; /// @brief Manhattan distance metric @@ -162,6 +205,17 @@ namespace clue { return meta::accumulate( [&]() { return math::fabs(lhs[Dim] - rhs[Dim]); }); } + /// @brief Compute the Manhattan distance between two points + /// + /// @param lhs First point + /// @param rhs Second point + /// @return Manhattan distance between the two points + /// NOTE: The weight is not used in the computation of the distance + ALPAKA_FN_HOST_ACC constexpr inline auto operator()(const WeightedPoint& lhs, + const WeightedPoint& rhs) const { + return meta::accumulate( + [&]() { return math::fabs(lhs[Dim] - rhs[Dim]); }); + } }; /// @brief Chebyshev distance metric @@ -184,6 +238,17 @@ namespace clue { return meta::maximum( [&]() { return math::fabs(lhs[Dim] - rhs[Dim]); }); } + /// @brief Compute the Chebyshev distance between two points + /// + /// @param lhs First point + /// @param rhs Second point + /// @return Chebyshev distance between the two points + /// NOTE: The weight is not used in the computation of the distance + ALPAKA_FN_HOST_ACC constexpr inline auto operator()(const WeightedPoint& lhs, + const WeightedPoint& rhs) const { + return meta::maximum( + [&]() { return math::fabs(lhs[Dim] - rhs[Dim]); }); + } }; /// @brief Weighted Chebyshev distance metric @@ -227,6 +292,17 @@ namespace clue { return meta::maximum( [&]() { return m_weights[Dim] * math::fabs(lhs[Dim] - rhs[Dim]); }); } + /// @brief Compute the Weighted Chebyshev distance between two points + /// + /// @param lhs First point + /// @param rhs Second point + /// @return Weighted Chebyshev distance between the two points + /// NOTE: The weight is not used in the computation of the distance + ALPAKA_FN_HOST_ACC constexpr inline auto operator()(const WeightedPoint& lhs, + const WeightedPoint& rhs) const { + return meta::maximum( + [&]() { return m_weights[Dim] * math::fabs(lhs[Dim] - rhs[Dim]); }); + } }; namespace metrics { diff --git a/include/CLUEstering/data_structures/PointsHost.hpp b/include/CLUEstering/data_structures/PointsHost.hpp index 8c4f510b..0a2b2dc9 100644 --- a/include/CLUEstering/data_structures/PointsHost.hpp +++ b/include/CLUEstering/data_structures/PointsHost.hpp @@ -64,6 +64,9 @@ namespace clue { float weight() const; float cluster_index() const; + + operator std::array() { return m_coordinates; } + // operator std::array() {} }; /// @brief Constructs a container for the points allocated on the host diff --git a/include/CLUEstering/utils/detail/scores.hpp b/include/CLUEstering/utils/detail/scores.hpp index 6a50842c..bdd36df9 100644 --- a/include/CLUEstering/utils/detail/scores.hpp +++ b/include/CLUEstering/utils/detail/scores.hpp @@ -1,8 +1,10 @@ #pragma once +#include "CLUEstering/core/DistanceMetrics.hpp" #include "CLUEstering/data_structures/PointsHost.hpp" #include "CLUEstering/data_structures/AssociationMap.hpp" +#include "CLUEstering/utils/cluster_centroid.hpp" #include #include #include @@ -98,4 +100,42 @@ namespace clue { return std::reduce(scores.begin(), scores.end(), 0.f) / static_cast(scores.size()); } + template DistanceMetric> + auto davies_bouldin(const clue::PointsHost& points, const DistanceMetric& metric) { + auto cluster_centroids = clue::cluster_centroids(points); + auto clusters = clue::get_clusters(points); + + std::vector clusters_scatter(cluster_centroids.size(), 0.f); + for (auto i = 0; i < points.size(); ++i) { + auto cluster_id = points[i].cluster_index(); + if (cluster_id == -1) + continue; + clusters_scatter[cluster_id] += metric(points[i], cluster_centroids[cluster_id]); + } + for (auto i = 0; i < cluster_centroids.size(); ++i) { + clusters_scatter[i] /= static_cast(clusters.count(i)); + } + std::vector> clusters_separation( + cluster_centroids.size(), std::vector(cluster_centroids.size(), 0.f)); + for (auto i = 0u; i < cluster_centroids.size(); ++i) { + for (auto j = 0u; j < cluster_centroids.size(); ++j) { + if (i == j) + continue; + clusters_separation[i][j] = metric(cluster_centroids[i], cluster_centroids[j]); + } + } + + std::vector R_values(cluster_centroids.size(), 0.f); + for (auto i = 0u; i < cluster_centroids.size(); ++i) { + for (auto j = 0u; j < clusters_separation[i].size(); ++j) { + if (i == j) + continue; + R_values[i] = std::max( + R_values[i], (clusters_scatter[i] + clusters_scatter[j]) / clusters_separation[i][j]); + } + } + + return std::reduce(R_values.begin(), R_values.end(), 0.f) / static_cast(R_values.size()); + } + } // namespace clue diff --git a/include/CLUEstering/utils/scores.hpp b/include/CLUEstering/utils/scores.hpp index 49cf2042..0e09ae20 100644 --- a/include/CLUEstering/utils/scores.hpp +++ b/include/CLUEstering/utils/scores.hpp @@ -4,6 +4,7 @@ #pragma once +#include "CLUEstering/core/DistanceMetrics.hpp" #include "CLUEstering/data_structures/PointsHost.hpp" namespace clue { @@ -27,6 +28,11 @@ namespace clue { template auto silhouette(const clue::PointsHost& points); + template DistanceMetric = clue::EuclideanMetric> + auto davies_bouldin(const clue::PointsHost& points, + const DistanceMetric& metric = clue::EuclideanMetric()); + } // namespace clue #include "CLUEstering/utils/detail/scores.hpp" diff --git a/tests/test_validation_scores.cpp b/tests/test_validation_scores.cpp index 6b7eb7e3..c1881ad5 100644 --- a/tests/test_validation_scores.cpp +++ b/tests/test_validation_scores.cpp @@ -26,4 +26,10 @@ TEST_CASE("Test validation scores on toy detector dataset") { CHECK(silhouette >= -1.f); CHECK(silhouette <= 1.f); } + + SUBCASE("Test computation of davies-bouldin score") { + const auto davies_bouldin = clue::davies_bouldin(points); + std::cout << "Davies-Bouldin score: " << davies_bouldin << std::endl; + CHECK(davies_bouldin >= 0.f); + } }