From e53c258840b0c89b16dcd8493cf92827e4c6ffd6 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Mon, 27 Jan 2025 12:30:45 -0800 Subject: [PATCH 1/4] Rfast::Dist -> philentropy::distance --- R/hier_clust.R | 6 ++++-- R/metric-helpers.R | 10 +++++++--- R/metric-silhouette.R | 8 ++++---- R/metric-sse.R | 8 ++++++-- man/dot-hier_clust_fit_stats.Rd | 2 +- man/prep_data_dist.Rd | 2 +- man/silhouette.Rd | 2 +- man/silhouette_avg.Rd | 2 +- vignettes/articles/k_means.Rmd | 2 +- 9 files changed, 26 insertions(+), 16 deletions(-) diff --git a/R/hier_clust.R b/R/hier_clust.R index ed475342..8dd95a73 100644 --- a/R/hier_clust.R +++ b/R/hier_clust.R @@ -193,9 +193,11 @@ translate_tidyclust.hier_clust <- function(x, engine = x$engine, ...) { num_clusters = NULL, cut_height = NULL, linkage_method = NULL, - dist_fun = Rfast::Dist + dist_fun = philentropy::distance ) { - dmat <- dist_fun(x) + suppressMessages( + dmat <- dist_fun(x) + ) res <- stats::hclust(stats::as.dist(dmat), method = linkage_method) attr(res, "num_clusters") <- num_clusters attr(res, "cut_height") <- cut_height diff --git a/R/metric-helpers.R b/R/metric-helpers.R index f7304e7a..9945ee4c 100644 --- a/R/metric-helpers.R +++ b/R/metric-helpers.R @@ -12,7 +12,7 @@ prep_data_dist <- function( object, new_data = NULL, dists = NULL, - dist_fun = Rfast::Dist + dist_fun = philentropy::distance ) { # Sihouettes requires a distance matrix if (is.null(new_data) && is.null(dists)) { @@ -46,7 +46,9 @@ prep_data_dist <- function( # Calculate distances including optionally supplied params if (is.null(dists)) { - dists <- dist_fun(new_data) + suppressMessages( + dists <- dist_fun(new_data) + ) } return( @@ -69,5 +71,7 @@ get_centroid_dists <- function(new_data, centroids, dist_fun = Rfast::dista) { rlang::abort("Centroids must have same columns as data.") } - dist_fun(centroids, new_data) + suppressMessages( + dist_fun(centroids, new_data) + ) } diff --git a/R/metric-silhouette.R b/R/metric-silhouette.R index 2783d92e..2722b466 100644 --- a/R/metric-silhouette.R +++ b/R/metric-silhouette.R @@ -27,7 +27,7 @@ silhouette <- function( object, new_data = NULL, dists = NULL, - dist_fun = Rfast::Dist + dist_fun = philentropy::distance ) { if (inherits(object, "cluster_spec")) { rlang::abort( @@ -126,7 +126,7 @@ silhouette_avg.cluster_fit <- function( ... ) { if (is.null(dist_fun)) { - dist_fun <- Rfast::Dist + dist_fun <- philentropy::distance } res <- silhouette_avg_impl(object, new_data, dists, dist_fun, ...) @@ -148,7 +148,7 @@ silhouette_avg_vec <- function( object, new_data = NULL, dists = NULL, - dist_fun = Rfast::Dist, + dist_fun = philentropy::distance, ... ) { silhouette_avg_impl(object, new_data, dists, dist_fun, ...) @@ -158,7 +158,7 @@ silhouette_avg_impl <- function( object, new_data = NULL, dists = NULL, - dist_fun = Rfast::Dist, + dist_fun = philentropy::distance, ... ) { mean(silhouette(object, new_data, dists, dist_fun, ...)$sil_width) diff --git a/R/metric-sse.R b/R/metric-sse.R index dd49c176..a13cd577 100644 --- a/R/metric-sse.R +++ b/R/metric-sse.R @@ -43,7 +43,9 @@ sse_within <- function(object, new_data = NULL, dist_fun = Rfast::dista) { n_members = summ$n_members ) } else { - dist_to_centroids <- dist_fun(summ$centroids, new_data) + suppressMessages( + dist_to_centroids <- dist_fun(summ$centroids, new_data) + ) res <- dist_to_centroids %>% tibble::as_tibble(.name_repair = "minimal") %>% @@ -253,7 +255,9 @@ sse_total_impl <- function( } else { overall_mean <- colSums(summ$centroids * summ$n_members) / sum(summ$n_members) - tot <- dist_fun(t(as.matrix(overall_mean)), new_data)^2 %>% sum() + suppressMessages( + tot <- dist_fun(t(as.matrix(overall_mean)), new_data)^2 %>% sum() + ) } return(tot) diff --git a/man/dot-hier_clust_fit_stats.Rd b/man/dot-hier_clust_fit_stats.Rd index 372e7cea..08d33f3f 100644 --- a/man/dot-hier_clust_fit_stats.Rd +++ b/man/dot-hier_clust_fit_stats.Rd @@ -9,7 +9,7 @@ num_clusters = NULL, cut_height = NULL, linkage_method = NULL, - dist_fun = Rfast::Dist + dist_fun = philentropy::distance ) } \arguments{ diff --git a/man/prep_data_dist.Rd b/man/prep_data_dist.Rd index 163c0ea8..1cc11771 100644 --- a/man/prep_data_dist.Rd +++ b/man/prep_data_dist.Rd @@ -4,7 +4,7 @@ \alias{prep_data_dist} \title{Prepares data and distance matrices for metric calculation} \usage{ -prep_data_dist(object, new_data = NULL, dists = NULL, dist_fun = Rfast::Dist) +prep_data_dist(object, new_data = NULL, dists = NULL, dist_fun = philentropy::distance) } \arguments{ \item{object}{A fitted \code{\link{cluster_spec}} object.} diff --git a/man/silhouette.Rd b/man/silhouette.Rd index 6840ad88..d6224bc0 100644 --- a/man/silhouette.Rd +++ b/man/silhouette.Rd @@ -4,7 +4,7 @@ \alias{silhouette} \title{Measures silhouette between clusters} \usage{ -silhouette(object, new_data = NULL, dists = NULL, dist_fun = Rfast::Dist) +silhouette(object, new_data = NULL, dists = NULL, dist_fun = philentropy::distance) } \arguments{ \item{object}{A fitted tidyclust model} diff --git a/man/silhouette_avg.Rd b/man/silhouette_avg.Rd index 8036d677..b8d8ed43 100644 --- a/man/silhouette_avg.Rd +++ b/man/silhouette_avg.Rd @@ -20,7 +20,7 @@ silhouette_avg_vec( object, new_data = NULL, dists = NULL, - dist_fun = Rfast::Dist, + dist_fun = philentropy::distance, ... ) } diff --git a/vignettes/articles/k_means.Rmd b/vignettes/articles/k_means.Rmd index bbd514db..167f15da 100644 --- a/vignettes/articles/k_means.Rmd +++ b/vignettes/articles/k_means.Rmd @@ -291,7 +291,7 @@ matrix (i.e., all pairwise distances between observations). ```{r} my_dist_1 <- function(x) { - Rfast::Dist(x, method = "manhattan") + philentropy::distance(x, method = "manhattan") } my_dist_2 <- function(x, y) { From da441c8f5ce443362a85e78945a36f892b51bf89 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Mon, 27 Jan 2025 13:08:53 -0800 Subject: [PATCH 2/4] use philentropy instead of Rfast --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5600b846..55564b1e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,8 +26,8 @@ Imports: hardhat (>= 1.0.0), modelenv (>= 0.2.0.9000), parsnip (>= 1.0.2), + philentropy (>= 0.9.0), prettyunits (>= 1.1.0), - Rfast (>= 2.0.6), rlang (>= 1.0.6), rsample (>= 1.0.0), stats, From f5a233345d5fde77f88aac4f6ec1ad4dd30ba51d Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Mon, 27 Jan 2025 13:09:10 -0800 Subject: [PATCH 3/4] switch from Rfast to philentropy --- R/extract_fit_summary.R | 16 +++++++++-- R/metric-helpers.R | 13 +++++++-- R/metric-sse.R | 52 ++++++++++++++++++++++++++-------- R/predict_helpers.R | 13 +++++++-- man/get_centroid_dists.Rd | 12 ++++++-- man/prep_data_dist.Rd | 7 ++++- man/silhouette.Rd | 7 ++++- man/sse_ratio.Rd | 10 ++++++- man/sse_total.Rd | 10 ++++++- man/sse_within.Rd | 9 +++++- man/sse_within_total.Rd | 10 ++++++- vignettes/articles/k_means.Rmd | 4 +-- 12 files changed, 134 insertions(+), 29 deletions(-) diff --git a/R/extract_fit_summary.R b/R/extract_fit_summary.R index d77bbb9c..fd787831 100644 --- a/R/extract_fit_summary.R +++ b/R/extract_fit_summary.R @@ -167,7 +167,13 @@ extract_fit_summary.hclust <- function(object, ...) { sse_within_total_total <- map2_dbl( by_clust$data, seq_len(n_clust), - ~sum(Rfast::dista(centroids[.y, ], .x)) + ~sum( + philentropy::dist_many_many( + as.matrix(centroids[.y, ]), + as.matrix(.x), + method = "euclidean" + ) + ) ) list( @@ -175,7 +181,13 @@ extract_fit_summary.hclust <- function(object, ...) { centroids = centroids, n_members = unname(as.integer(table(clusts))), sse_within_total_total = sse_within_total_total, - sse_total = sum(Rfast::dista(t(overall_centroid), training_data)), + sse_total = sum( + philentropy::dist_many_many( + t(overall_centroid), + as.matrix(training_data), + method = "euclidean" + ) + ), orig_labels = NULL, cluster_assignments = clusts ) diff --git a/R/metric-helpers.R b/R/metric-helpers.R index 9945ee4c..35572b85 100644 --- a/R/metric-helpers.R +++ b/R/metric-helpers.R @@ -65,13 +65,20 @@ prep_data_dist <- function( #' @param new_data A data frame #' @param centroids A data frame where each row is a centroid. #' @param dist_fun A function for computing matrix-to-matrix distances. Defaults -#' to `Rfast::dista()` -get_centroid_dists <- function(new_data, centroids, dist_fun = Rfast::dista) { +#' to +#' `function(x, y) philentropy::dist_many_many(x, y, method = "euclidean")`. +get_centroid_dists <- function( + new_data, + centroids, + dist_fun = function(x, y) { + philentropy::dist_many_many(x, y, method = "euclidean") + } +) { if (ncol(new_data) != ncol(centroids)) { rlang::abort("Centroids must have same columns as data.") } suppressMessages( - dist_fun(centroids, new_data) + dist_fun(as.matrix(centroids), as.matrix(new_data)) ) } diff --git a/R/metric-sse.R b/R/metric-sse.R index a13cd577..ac957d78 100644 --- a/R/metric-sse.R +++ b/R/metric-sse.R @@ -19,7 +19,13 @@ #' #' sse_within(kmeans_fit) #' @export -sse_within <- function(object, new_data = NULL, dist_fun = Rfast::dista) { +sse_within <- function( + object, + new_data = NULL, + dist_fun = function(x, y) { + philentropy::dist_many_many(x, y, method = "euclidean") + } +) { if (inherits(object, "cluster_spec")) { rlang::abort( paste( @@ -44,7 +50,10 @@ sse_within <- function(object, new_data = NULL, dist_fun = Rfast::dista) { ) } else { suppressMessages( - dist_to_centroids <- dist_fun(summ$centroids, new_data) + dist_to_centroids <- dist_fun( + as.matrix(summ$centroids), + as.matrix(new_data) + ) ) res <- dist_to_centroids %>% @@ -123,7 +132,9 @@ sse_within_total.cluster_fit <- function( ... ) { if (is.null(dist_fun)) { - dist_fun <- Rfast::dista + dist_fun <- function(x, y) { + philentropy::dist_many_many(x, y, method = "euclidean") + } } res <- sse_within_total_impl(object, new_data, dist_fun, ...) @@ -144,7 +155,9 @@ sse_within_total.workflow <- sse_within_total.cluster_fit sse_within_total_vec <- function( object, new_data = NULL, - dist_fun = Rfast::dista, + dist_fun = function(x, y) { + philentropy::dist_many_many(x, y, method = "euclidean") + }, ... ) { sse_within_total_impl(object, new_data, dist_fun, ...) @@ -153,7 +166,9 @@ sse_within_total_vec <- function( sse_within_total_impl <- function( object, new_data = NULL, - dist_fun = Rfast::dista, + dist_fun = function(x, y) { + philentropy::dist_many_many(x, y, method = "euclidean") + }, ... ) { sum(sse_within(object, new_data, dist_fun, ...)$wss, na.rm = TRUE) @@ -210,7 +225,9 @@ sse_total.cluster_fit <- function( ... ) { if (is.null(dist_fun)) { - dist_fun <- Rfast::dista + dist_fun <- function(x, y) { + philentropy::dist_many_many(x, y, method = "euclidean") + } } res <- sse_total_impl(object, new_data, dist_fun, ...) @@ -231,7 +248,9 @@ sse_total.workflow <- sse_total.cluster_fit sse_total_vec <- function( object, new_data = NULL, - dist_fun = Rfast::dista, + dist_fun = function(x, y) { + philentropy::dist_many_many(x, y, method = "euclidean") + }, ... ) { sse_total_impl(object, new_data, dist_fun, ...) @@ -240,7 +259,9 @@ sse_total_vec <- function( sse_total_impl <- function( object, new_data = NULL, - dist_fun = Rfast::dista, + dist_fun = function(x, y) { + philentropy::dist_many_many(x, y, method = "euclidean") + }, ... ) { # Preprocess data before computing distances if appropriate @@ -256,7 +277,8 @@ sse_total_impl <- function( overall_mean <- colSums(summ$centroids * summ$n_members) / sum(summ$n_members) suppressMessages( - tot <- dist_fun(t(as.matrix(overall_mean)), new_data)^2 %>% sum() + tot <- dist_fun(t(as.matrix(overall_mean)), as.matrix(new_data))^2 %>% + sum() ) } @@ -314,7 +336,9 @@ sse_ratio.cluster_fit <- function( ... ) { if (is.null(dist_fun)) { - dist_fun <- Rfast::dista + dist_fun <- function(x, y) { + philentropy::dist_many_many(x, y, method = "euclidean") + } } res <- sse_ratio_impl(object, new_data, dist_fun, ...) @@ -334,7 +358,9 @@ sse_ratio.workflow <- sse_ratio.cluster_fit sse_ratio_vec <- function( object, new_data = NULL, - dist_fun = Rfast::dista, + dist_fun = function(x, y) { + philentropy::dist_many_many(x, y, method = "euclidean") + }, ... ) { sse_ratio_impl(object, new_data, dist_fun, ...) @@ -343,7 +369,9 @@ sse_ratio_vec <- function( sse_ratio_impl <- function( object, new_data = NULL, - dist_fun = Rfast::dista, + dist_fun = function(x, y) { + philentropy::dist_many_many(x, y, method = "euclidean") + }, ... ) { sse_within_total_vec(object, new_data, dist_fun) / diff --git a/R/predict_helpers.R b/R/predict_helpers.R index cef573c3..e1c13f18 100644 --- a/R/predict_helpers.R +++ b/R/predict_helpers.R @@ -96,7 +96,11 @@ make_predictions <- function(x, prefix, n_clusters) { ) # need this to be obs on rows, dist to new data on cols - dists_new <- Rfast::dista(xnew = training_data, x = new_data, trans = TRUE) + dists_new <- philentropy::dist_many_many( + training_data, + new_data, + method = "euclidean" + ) cluster_dists <- dplyr::bind_cols(data.frame(dists_new), clusters) %>% dplyr::group_by(.cluster) %>% @@ -109,7 +113,12 @@ make_predictions <- function(x, prefix, n_clusters) { ## Centroid linkage_method, dist to center cluster_centers <- extract_centroids(object) %>% dplyr::select(-.cluster) - dists_means <- Rfast::dista(new_data, cluster_centers) + + dists_means <- philentropy::dist_many_many( + new_data, + cluster_centers, + method = "euclidean" + ) pred_clusts_num <- apply(dists_means, 1, which.min) } else if (linkage_method %in% c("ward.D", "ward", "ward.D2")) { diff --git a/man/get_centroid_dists.Rd b/man/get_centroid_dists.Rd index 267a98a9..887923e6 100644 --- a/man/get_centroid_dists.Rd +++ b/man/get_centroid_dists.Rd @@ -4,7 +4,14 @@ \alias{get_centroid_dists} \title{Computes distance from observations to centroids} \usage{ -get_centroid_dists(new_data, centroids, dist_fun = Rfast::dista) +get_centroid_dists( + new_data, + centroids, + dist_fun = function(x, y) { + philentropy::dist_many_many(x, y, method = + "euclidean") + } +) } \arguments{ \item{new_data}{A data frame} @@ -12,7 +19,8 @@ get_centroid_dists(new_data, centroids, dist_fun = Rfast::dista) \item{centroids}{A data frame where each row is a centroid.} \item{dist_fun}{A function for computing matrix-to-matrix distances. Defaults -to \code{Rfast::dista()}} +to +\code{function(x, y) philentropy::dist_many_many(x, y, method = "euclidean")}.} } \description{ Computes distance from observations to centroids diff --git a/man/prep_data_dist.Rd b/man/prep_data_dist.Rd index 1cc11771..229bfd86 100644 --- a/man/prep_data_dist.Rd +++ b/man/prep_data_dist.Rd @@ -4,7 +4,12 @@ \alias{prep_data_dist} \title{Prepares data and distance matrices for metric calculation} \usage{ -prep_data_dist(object, new_data = NULL, dists = NULL, dist_fun = philentropy::distance) +prep_data_dist( + object, + new_data = NULL, + dists = NULL, + dist_fun = philentropy::distance +) } \arguments{ \item{object}{A fitted \code{\link{cluster_spec}} object.} diff --git a/man/silhouette.Rd b/man/silhouette.Rd index d6224bc0..7a76d610 100644 --- a/man/silhouette.Rd +++ b/man/silhouette.Rd @@ -4,7 +4,12 @@ \alias{silhouette} \title{Measures silhouette between clusters} \usage{ -silhouette(object, new_data = NULL, dists = NULL, dist_fun = philentropy::distance) +silhouette( + object, + new_data = NULL, + dists = NULL, + dist_fun = philentropy::distance +) } \arguments{ \item{object}{A fitted tidyclust model} diff --git a/man/sse_ratio.Rd b/man/sse_ratio.Rd index 2adaceea..4e14ba0a 100644 --- a/man/sse_ratio.Rd +++ b/man/sse_ratio.Rd @@ -16,7 +16,15 @@ sse_ratio(object, ...) \method{sse_ratio}{workflow}(object, new_data = NULL, dist_fun = NULL, ...) -sse_ratio_vec(object, new_data = NULL, dist_fun = Rfast::dista, ...) +sse_ratio_vec( + object, + new_data = NULL, + dist_fun = function(x, y) { + philentropy::dist_many_many(x, y, method = + "euclidean") + }, + ... +) } \arguments{ \item{object}{A fitted kmeans tidyclust model} diff --git a/man/sse_total.Rd b/man/sse_total.Rd index bea3c468..805b5e91 100644 --- a/man/sse_total.Rd +++ b/man/sse_total.Rd @@ -16,7 +16,15 @@ sse_total(object, ...) \method{sse_total}{workflow}(object, new_data = NULL, dist_fun = NULL, ...) -sse_total_vec(object, new_data = NULL, dist_fun = Rfast::dista, ...) +sse_total_vec( + object, + new_data = NULL, + dist_fun = function(x, y) { + philentropy::dist_many_many(x, y, method = + "euclidean") + }, + ... +) } \arguments{ \item{object}{A fitted kmeans tidyclust model} diff --git a/man/sse_within.Rd b/man/sse_within.Rd index da4b3ecd..c79fb0df 100644 --- a/man/sse_within.Rd +++ b/man/sse_within.Rd @@ -4,7 +4,14 @@ \alias{sse_within} \title{Calculates Sum of Squared Error in each cluster} \usage{ -sse_within(object, new_data = NULL, dist_fun = Rfast::dista) +sse_within( + object, + new_data = NULL, + dist_fun = function(x, y) { + philentropy::dist_many_many(x, y, method = + "euclidean") + } +) } \arguments{ \item{object}{A fitted kmeans tidyclust model} diff --git a/man/sse_within_total.Rd b/man/sse_within_total.Rd index 9483cb09..6baf5d88 100644 --- a/man/sse_within_total.Rd +++ b/man/sse_within_total.Rd @@ -16,7 +16,15 @@ sse_within_total(object, ...) \method{sse_within_total}{workflow}(object, new_data = NULL, dist_fun = NULL, ...) -sse_within_total_vec(object, new_data = NULL, dist_fun = Rfast::dista, ...) +sse_within_total_vec( + object, + new_data = NULL, + dist_fun = function(x, y) { + philentropy::dist_many_many(x, y, method = + "euclidean") + }, + ... +) } \arguments{ \item{object}{A fitted kmeans tidyclust model} diff --git a/vignettes/articles/k_means.Rmd b/vignettes/articles/k_means.Rmd index 167f15da..9b718e8e 100644 --- a/vignettes/articles/k_means.Rmd +++ b/vignettes/articles/k_means.Rmd @@ -295,7 +295,7 @@ my_dist_1 <- function(x) { } my_dist_2 <- function(x, y) { - Rfast::dista(x, y, method = "manhattan") + philentropy::dist_many_many(x, y, method = "manhattan") } kmeans_fit %>% sse_ratio(dist_fun = my_dist_2) @@ -404,7 +404,7 @@ pens %>% ```{r, echo = FALSE} #| fig-alt: "scatter chart. bill_length_mm along the x-axis, bill_depth_mm along the y-axis. 3 vague cluster appears in the point cloud. Point are colored according to how close they were to the color points." -closest_center <- Rfast::dista(as.matrix(pens), as.matrix(pens[init, ])) %>% +closest_center <- philentropy::dist_many_many(as.matrix(pens), as.matrix(pens[init, ]), method = "euclidean") %>% apply(1, which.min) pens %>% From 85426152ec4cdb21382d70c59407b4e251a7865f Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Mon, 27 Jan 2025 13:10:54 -0800 Subject: [PATCH 4/4] update news --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 190f4f63..0456878e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # tidyclust (development version) +* The philentropy package is now used to calculate distances rather than Rfast. (#199) + # tidyclust 0.2.3 * Update to fix revdep issue for clustMixType. (#190)