From 7188282c88cc7bf9edf3ff4f9b485d00cd40aae1 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Tue, 1 Nov 2022 14:57:57 -0700 Subject: [PATCH] sse_within -> sse_within_total --- NAMESPACE | 8 ++-- R/extract_summary.R | 8 ++-- R/metric-sse.R | 36 +++++++++--------- R/tune_cluster.R | 2 +- _pkgdown.yml | 2 +- dev/cross_val_kmeans.R | 2 +- man/{sse_within.Rd => sse_within_total.Rd} | 22 +++++------ tests/testthat/_snaps/tune_cluster.md | 4 +- tests/testthat/test-cluster_metric_set.R | 8 ++-- tests/testthat/test-k_means_diagnostics.R | 6 +-- tests/testthat/test-tune_cluster.R | 44 +++++++++++----------- vignettes/articles/k_means.Rmd | 4 +- vignettes/articles/tuning_and_metrics.Rmd | 2 +- 13 files changed, 74 insertions(+), 74 deletions(-) rename man/{sse_within.Rd => sse_within_total.Rd} (57%) diff --git a/NAMESPACE b/NAMESPACE index 76f6b52c..5af1e8dd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -36,8 +36,8 @@ S3method(sse_ratio,cluster_fit) S3method(sse_ratio,workflow) S3method(sse_total,cluster_fit) S3method(sse_total,workflow) -S3method(sse_within,cluster_fit) -S3method(sse_within,workflow) +S3method(sse_within_total,cluster_fit) +S3method(sse_within_total,workflow) S3method(tidy,cluster_fit) S3method(translate_tidyclust,default) S3method(translate_tidyclust,hier_clust) @@ -99,8 +99,8 @@ export(sse_ratio) export(sse_ratio_vec) export(sse_total) export(sse_total_vec) -export(sse_within) -export(sse_within_vec) +export(sse_within_total) +export(sse_within_total_vec) export(tidy) export(translate_tidyclust) export(translate_tidyclust.default) diff --git a/R/extract_summary.R b/R/extract_summary.R index d735beda..61c651ef 100644 --- a/R/extract_summary.R +++ b/R/extract_summary.R @@ -50,7 +50,7 @@ extract_fit_summary.kmeans <- function(object, ...) { cluster_names = names, centroids = centroids, n_members = object$size[reorder_clusts], - sse_within_total = object$withinss[reorder_clusts], + sse_within_total_total = object$withinss[reorder_clusts], sse_total = object$totss, orig_labels = unname(object$cluster), cluster_assignments = cluster_asignments @@ -75,7 +75,7 @@ extract_fit_summary.KMeansCluster <- function(object, ...) { cluster_names = names, centroids = centroids, n_members = object$obs_per_cluster[reorder_clusts], - sse_within_total = object$WCSS_per_cluster[reorder_clusts], + sse_within_total_total = object$WCSS_per_cluster[reorder_clusts], sse_total = object$total_SSE, orig_labels = object$clusters, cluster_assignments = cluster_asignments @@ -103,7 +103,7 @@ extract_fit_summary.hclust <- function(object, ...) { map(dplyr::summarize_all, mean) %>% dplyr::bind_rows() - sse_within_total <- map2_dbl( + sse_within_total_total <- map2_dbl( by_clust$data, seq_len(n_clust), ~ sum(Rfast::dista(centroids[.y, ], .x)) @@ -113,7 +113,7 @@ extract_fit_summary.hclust <- function(object, ...) { cluster_names = unique(clusts), centroids = centroids, n_members = unname(table(clusts)), - sse_within_total = sse_within_total, + sse_within_total_total = sse_within_total_total, sse_total = sum(Rfast::dista(t(overall_centroid), training_data)), orig_labels = NULL, cluster_assignments = clusts diff --git a/R/metric-sse.R b/R/metric-sse.R index b1d91f11..b731de3a 100644 --- a/R/metric-sse.R +++ b/R/metric-sse.R @@ -29,7 +29,7 @@ within_cluster_sse <- function(object, new_data = NULL, if (is.null(new_data)) { res <- tibble::tibble( .cluster = factor(summ$cluster_names), - wss = summ$sse_within_total, + wss = summ$sse_within_total_total, n_members = summ$n_members ) } else { @@ -69,48 +69,48 @@ within_cluster_sse <- function(object, new_data = NULL, #' #' kmeans_fit <- fit(kmeans_spec, ~., mtcars) #' -#' sse_within(kmeans_fit) +#' sse_within_total(kmeans_fit) #' -#' sse_within_vec(kmeans_fit) +#' sse_within_total_vec(kmeans_fit) #' @export -sse_within <- function(object, ...) { - UseMethod("sse_within") +sse_within_total <- function(object, ...) { + UseMethod("sse_within_total") } -sse_within <- new_cluster_metric( - sse_within, +sse_within_total <- new_cluster_metric( + sse_within_total, direction = "zero" ) #' @export -#' @rdname sse_within -sse_within.cluster_fit <- function(object, new_data = NULL, +#' @rdname sse_within_total +sse_within_total.cluster_fit <- function(object, new_data = NULL, dist_fun = NULL, ...) { if (is.null(dist_fun)) { dist_fun <- Rfast::dista } - res <- sse_within_impl(object, new_data, dist_fun, ...) + res <- sse_within_total_impl(object, new_data, dist_fun, ...) tibble::tibble( - .metric = "sse_within", + .metric = "sse_within_total", .estimator = "standard", .estimate = res ) } #' @export -#' @rdname sse_within -sse_within.workflow <- sse_within.cluster_fit +#' @rdname sse_within_total +sse_within_total.workflow <- sse_within_total.cluster_fit #' @export -#' @rdname sse_within -sse_within_vec <- function(object, new_data = NULL, +#' @rdname sse_within_total +sse_within_total_vec <- function(object, new_data = NULL, dist_fun = Rfast::dista, ...) { - sse_within_impl(object, new_data, dist_fun, ...) + sse_within_total_impl(object, new_data, dist_fun, ...) } -sse_within_impl <- function(object, new_data = NULL, +sse_within_total_impl <- function(object, new_data = NULL, dist_fun = Rfast::dista, ...) { sum(within_cluster_sse(object, new_data, dist_fun, ...)$wss, na.rm = TRUE) } @@ -253,6 +253,6 @@ sse_ratio_impl <- function(object, new_data = NULL, dist_fun = Rfast::dista, ...) { - sse_within_vec(object, new_data, dist_fun) / + sse_within_total_vec(object, new_data, dist_fun) / sse_total_vec(object, new_data, dist_fun) } diff --git a/R/tune_cluster.R b/R/tune_cluster.R index 56ba6410..5c9e6fdd 100644 --- a/R/tune_cluster.R +++ b/R/tune_cluster.R @@ -817,7 +817,7 @@ check_metrics <- function(x, object) { if (is.null(x)) { switch(mode, partition = { - x <- cluster_metric_set(sse_within, sse_total) + x <- cluster_metric_set(sse_within_total, sse_total) }, unknown = { rlang::abort( diff --git a/_pkgdown.yml b/_pkgdown.yml index e087b293..e3e5ad4b 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -44,7 +44,7 @@ reference: - silhouette_avg - sse_ratio - sse_total - - sse_within + - sse_within_total - title: Tuning desc: > Functions to allow multiple cluster specifications to be fit at once. diff --git a/dev/cross_val_kmeans.R b/dev/cross_val_kmeans.R index f793f06f..c4dbbb4e 100644 --- a/dev/cross_val_kmeans.R +++ b/dev/cross_val_kmeans.R @@ -30,7 +30,7 @@ for (k in 2:10) { km_fit <- km %>% fit(~., data = tmp_train) wss <- km_fit %>% - sse_within_total(tmp_test) + sse_within_total_total(tmp_test) wss_2 <- km_fit$fit$tot.withinss diff --git a/man/sse_within.Rd b/man/sse_within_total.Rd similarity index 57% rename from man/sse_within.Rd rename to man/sse_within_total.Rd index 114ab0f9..77cde3b9 100644 --- a/man/sse_within.Rd +++ b/man/sse_within_total.Rd @@ -1,19 +1,19 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/metric-sse.R -\name{sse_within} -\alias{sse_within} -\alias{sse_within.cluster_fit} -\alias{sse_within.workflow} -\alias{sse_within_vec} +\name{sse_within_total} +\alias{sse_within_total} +\alias{sse_within_total.cluster_fit} +\alias{sse_within_total.workflow} +\alias{sse_within_total_vec} \title{Compute the sum of within-cluster SSE} \usage{ -sse_within(object, ...) +sse_within_total(object, ...) -\method{sse_within}{cluster_fit}(object, new_data = NULL, dist_fun = NULL, ...) +\method{sse_within_total}{cluster_fit}(object, new_data = NULL, dist_fun = NULL, ...) -\method{sse_within}{workflow}(object, new_data = NULL, dist_fun = NULL, ...) +\method{sse_within_total}{workflow}(object, new_data = NULL, dist_fun = NULL, ...) -sse_within_vec(object, new_data = NULL, dist_fun = Rfast::dista, ...) +sse_within_total_vec(object, new_data = NULL, dist_fun = Rfast::dista, ...) } \arguments{ \item{object}{A fitted kmeans tidyclust model} @@ -34,7 +34,7 @@ kmeans_spec <- k_means(num_clusters = 5) \%>\% kmeans_fit <- fit(kmeans_spec, ~., mtcars) -sse_within(kmeans_fit) +sse_within_total(kmeans_fit) -sse_within_vec(kmeans_fit) +sse_within_total_vec(kmeans_fit) } diff --git a/tests/testthat/_snaps/tune_cluster.md b/tests/testthat/_snaps/tune_cluster.md index a065da8a..d4450280 100644 --- a/tests/testthat/_snaps/tune_cluster.md +++ b/tests/testthat/_snaps/tune_cluster.md @@ -399,7 +399,7 @@ tmp <- tune::show_best(res) Condition Warning: - No value of `metric` was given; metric 'sse_within' will be used. + No value of `metric` was given; metric 'sse_within_total' will be used. --- @@ -407,5 +407,5 @@ tmp <- tune::select_best(res) Condition Warning: - No value of `metric` was given; metric 'sse_within' will be used. + No value of `metric` was given; metric 'sse_within_total' will be used. diff --git a/tests/testthat/test-cluster_metric_set.R b/tests/testthat/test-cluster_metric_set.R index d6055984..9d6e650f 100644 --- a/tests/testthat/test-cluster_metric_set.R +++ b/tests/testthat/test-cluster_metric_set.R @@ -4,13 +4,13 @@ test_that("cluster_metric_set works", { kmeans_fit <- fit(kmeans_spec, ~., mtcars) - my_metrics <- cluster_metric_set(sse_ratio, sse_total, sse_within, silhouette_avg) + my_metrics <- cluster_metric_set(sse_ratio, sse_total, sse_within_total, silhouette_avg) exp_res <- tibble::tibble( - .metric = c("sse_ratio", "sse_total", "sse_within", "silhouette_avg"), + .metric = c("sse_ratio", "sse_total", "sse_within_total", "silhouette_avg"), .estimator = "standard", .estimate = vapply( - list(sse_ratio_vec, sse_total_vec, sse_within_vec, silhouette_avg_vec), + list(sse_ratio_vec, sse_total_vec, sse_within_total_vec, silhouette_avg_vec), function(x) x(kmeans_fit, new_data = mtcars), FUN.VALUE = numeric(1) ) @@ -23,7 +23,7 @@ test_that("cluster_metric_set works", { expect_snapshot(error = TRUE, my_metrics(kmeans_fit)) - my_metrics <- cluster_metric_set(sse_ratio, sse_total, sse_within) + my_metrics <- cluster_metric_set(sse_ratio, sse_total, sse_within_total) expect_equal( my_metrics(kmeans_fit), diff --git a/tests/testthat/test-k_means_diagnostics.R b/tests/testthat/test-k_means_diagnostics.R index 18221c61..61f80803 100644 --- a/tests/testthat/test-k_means_diagnostics.R +++ b/tests/testthat/test-k_means_diagnostics.R @@ -20,7 +20,7 @@ test_that("kmeans sse metrics work", { ) expect_equal( - sse_within_vec(kmeans_fit_stats), + sse_within_total_vec(kmeans_fit_stats), km_orig$tot.withinss, tolerance = 0.005 ) @@ -41,7 +41,7 @@ test_that("kmeans sse metrics work", { ) expect_equal( - sse_within_vec(kmeans_fit_ClusterR), + sse_within_total_vec(kmeans_fit_ClusterR), sum(km_orig_2$WCSS_per_cluster), tolerance = 0.005 ) @@ -70,7 +70,7 @@ test_that("kmeans sse metrics work on new data", { ) expect_equal( - sse_within_vec(kmeans_fit_stats, new_data), + sse_within_total_vec(kmeans_fit_stats, new_data), 15654.38, tolerance = 0.005 ) diff --git a/tests/testthat/test-tune_cluster.R b/tests/testthat/test-tune_cluster.R index 66e21344..ef10a330 100644 --- a/tests/testthat/test-tune_cluster.R +++ b/tests/testthat/test-tune_cluster.R @@ -10,7 +10,7 @@ test_that("tune recipe only", { grid <- dials::grid_regular(pset, levels = 3) folds <- rsample::vfold_cv(mtcars) control <- tune::control_grid(extract = identity) - metrics <- cluster_metric_set(sse_within, sse_total) + metrics <- cluster_metric_set(sse_within_total, sse_total) res <- tune_cluster( wflow, @@ -28,7 +28,7 @@ test_that("tune recipe only", { expect_equal(res$id, folds$id) expect_equal(nrow(res_est), nrow(grid) * 2) expect_equal(sum(res_est$.metric == "sse_total"), nrow(grid)) - expect_equal(sum(res_est$.metric == "sse_within"), nrow(grid)) + expect_equal(sum(res_est$.metric == "sse_within_total"), nrow(grid)) expect_equal(res_est$n, rep(10, nrow(grid) * 2)) expect_false(identical(num_comp, expr(tune()))) expect_true(res_workflow$trained) @@ -46,7 +46,7 @@ test_that("tune model only (with recipe)", { grid$num_clusters <- grid$num_clusters + 1 folds <- rsample::vfold_cv(mtcars) control <- tune::control_grid(extract = identity) - metrics <- cluster_metric_set(sse_within, sse_total) + metrics <- cluster_metric_set(sse_within_total, sse_total) res <- tune_cluster( wflow, @@ -65,7 +65,7 @@ test_that("tune model only (with recipe)", { expect_equal(res$id, folds$id) expect_equal(nrow(res_est), nrow(grid) * 2) expect_equal(sum(res_est$.metric == "sse_total"), nrow(grid)) - expect_equal(sum(res_est$.metric == "sse_within"), nrow(grid)) + expect_equal(sum(res_est$.metric == "sse_within_total"), nrow(grid)) expect_equal(res_est$n, rep(10, nrow(grid) * 2)) expect_false(identical(num_clusters, expr(tune()))) expect_true(res_workflow$trained) @@ -85,7 +85,7 @@ test_that("tune model only (with variables)", { folds <- rsample::vfold_cv(mtcars) - metrics <- cluster_metric_set(sse_total, sse_within) + metrics <- cluster_metric_set(sse_total, sse_within_total) res <- tune_cluster(wflow, resamples = folds, grid = grid, metrics = metrics) @@ -95,7 +95,7 @@ test_that("tune model only (with variables)", { expect_equal(nrow(res_est), nrow(grid) * 2) expect_equal(sum(res_est$.metric == "sse_total"), nrow(grid)) - expect_equal(sum(res_est$.metric == "sse_within"), nrow(grid)) + expect_equal(sum(res_est$.metric == "sse_within_total"), nrow(grid)) expect_equal(res_est$n, rep(10, nrow(grid) * 2)) }) @@ -113,7 +113,7 @@ test_that("tune model only (with formula)", { folds <- rsample::vfold_cv(mtcars) - metrics <- cluster_metric_set(sse_total, sse_within) + metrics <- cluster_metric_set(sse_total, sse_within_total) res <- tune_cluster(wflow, resamples = folds, grid = grid, metrics = metrics) @@ -123,7 +123,7 @@ test_that("tune model only (with formula)", { expect_equal(nrow(res_est), nrow(grid) * 2) expect_equal(sum(res_est$.metric == "sse_total"), nrow(grid)) - expect_equal(sum(res_est$.metric == "sse_within"), nrow(grid)) + expect_equal(sum(res_est$.metric == "sse_within_total"), nrow(grid)) expect_equal(res_est$n, rep(10, nrow(grid) * 2)) }) @@ -140,7 +140,7 @@ test_that("tune model and recipe", { grid$num_clusters <- grid$num_clusters + 1 folds <- rsample::vfold_cv(mtcars) control <- tune::control_grid(extract = identity) - metrics <- cluster_metric_set(sse_within, sse_total) + metrics <- cluster_metric_set(sse_within_total, sse_total) res <- tune_cluster( wflow, @@ -167,7 +167,7 @@ test_that("tune model and recipe", { ) expect_equal(nrow(res_est), nrow(grid) * 2) expect_equal(sum(res_est$.metric == "sse_total"), nrow(grid)) - expect_equal(sum(res_est$.metric == "sse_within"), nrow(grid)) + expect_equal(sum(res_est$.metric == "sse_within_total"), nrow(grid)) expect_equal(res_est$n, rep(10, nrow(grid) * 2)) expect_false(identical(num_clusters, expr(tune()))) expect_false(identical(num_comp, expr(tune()))) @@ -187,7 +187,7 @@ test_that("verbose argument works", { grid$num_clusters <- grid$num_clusters + 1 folds <- rsample::vfold_cv(mtcars) control <- tune::control_grid(extract = identity, verbose = TRUE) - metrics <- cluster_metric_set(sse_within, sse_total) + metrics <- cluster_metric_set(sse_within_total, sse_total) expect_snapshot( res <- tune_cluster( @@ -216,7 +216,7 @@ test_that('tune model and recipe (parallel_over = "everything")', { extract = identity, parallel_over = "everything" ) - metrics <- cluster_metric_set(sse_within, sse_total) + metrics <- cluster_metric_set(sse_within_total, sse_total) res <- tune_cluster( wflow, @@ -235,7 +235,7 @@ test_that('tune model and recipe (parallel_over = "everything")', { ) expect_equal(nrow(res_est), nrow(grid) * 2) expect_equal(sum(res_est$.metric == "sse_total"), nrow(grid)) - expect_equal(sum(res_est$.metric == "sse_within"), nrow(grid)) + expect_equal(sum(res_est$.metric == "sse_within_total"), nrow(grid)) expect_equal(res_est$n, rep(10, nrow(grid) * 2)) }) @@ -309,7 +309,7 @@ test_that("metrics can be NULL", { grid <- dials::grid_regular(pset, levels = 3) folds <- rsample::vfold_cv(mtcars) control <- tune::control_grid(extract = identity) - metrics <- cluster_metric_set(sse_within, sse_total) + metrics <- cluster_metric_set(sse_within_total, sse_total) set.seed(4400) res <- tune_cluster( @@ -343,7 +343,7 @@ test_that("tune recipe only", { grid <- dials::grid_regular(pset, levels = 3) folds <- rsample::vfold_cv(mtcars) control <- tune::control_grid(extract = identity) - metrics <- cluster_metric_set(sse_within) + metrics <- cluster_metric_set(sse_within_total) res <- tune_cluster( wflow, @@ -360,7 +360,7 @@ test_that("tune recipe only", { expect_equal(res$id, folds$id) expect_equal(nrow(res_est), nrow(grid)) - expect_equal(sum(res_est$.metric == "sse_within"), nrow(grid)) + expect_equal(sum(res_est$.metric == "sse_within_total"), nrow(grid)) expect_equal(res_est$n, rep(10, nrow(grid))) expect_false(identical(num_comp, expr(tune()))) expect_true(res_workflow$trained) @@ -399,7 +399,7 @@ test_that("retain extra attributes", { grid <- dials::grid_regular(pset, levels = 3) grid$num_clusters <- grid$num_clusters + 1 folds <- rsample::vfold_cv(mtcars) - metrics <- cluster_metric_set(sse_within, sse_total) + metrics <- cluster_metric_set(sse_within_total, sse_total) res <- tune_cluster(wflow, resamples = folds, grid = grid, metrics = metrics) att <- attributes(res) @@ -423,7 +423,7 @@ test_that("select_best() and show_best() works", { grid$num_clusters <- grid$num_clusters + 1 folds <- rsample::vfold_cv(mtcars) control <- tune::control_grid(extract = identity) - metrics <- cluster_metric_set(sse_within, sse_total) + metrics <- cluster_metric_set(sse_within_total, sse_total) res <- tune_cluster( wflow, @@ -436,9 +436,9 @@ test_that("select_best() and show_best() works", { expect_snapshot(tmp <- tune::show_best(res)) expect_equal( - tune::show_best(res, metric = "sse_within"), + tune::show_best(res, metric = "sse_within_total"), tune::collect_metrics(res) %>% - dplyr::filter(.metric == "sse_within") %>% + dplyr::filter(.metric == "sse_within_total") %>% dplyr::slice_min(mean, n = 5, with_ties = FALSE) ) @@ -452,9 +452,9 @@ test_that("select_best() and show_best() works", { expect_snapshot(tmp <- tune::select_best(res)) expect_equal( - tune::select_best(res, metric = "sse_within"), + tune::select_best(res, metric = "sse_within_total"), tune::collect_metrics(res) %>% - dplyr::filter(.metric == "sse_within") %>% + dplyr::filter(.metric == "sse_within_total") %>% dplyr::slice_min(mean, n = 1, with_ties = FALSE) %>% dplyr::select(num_clusters, .config) ) diff --git a/vignettes/articles/k_means.Rmd b/vignettes/articles/k_means.Rmd index 6b240a6a..6e5ef660 100644 --- a/vignettes/articles/k_means.Rmd +++ b/vignettes/articles/k_means.Rmd @@ -240,10 +240,10 @@ The WSS and TSS come "for free" with the model fit summary, or they can be acces directly from the model fit: ```{r} -kmeans_summary$sse_within_total +kmeans_summary$sse_within_total_total kmeans_summary$sse_total -kmeans_fit %>% sse_within() +kmeans_fit %>% sse_within_total() kmeans_fit %>% sse_total() kmeans_fit %>% sse_ratio() diff --git a/vignettes/articles/tuning_and_metrics.Rmd b/vignettes/articles/tuning_and_metrics.Rmd index d6c23d12..3fde7ba2 100644 --- a/vignettes/articles/tuning_and_metrics.Rmd +++ b/vignettes/articles/tuning_and_metrics.Rmd @@ -104,7 +104,7 @@ res <- tune_cluster( resamples = penguins_cv, grid = clust_num_grid, control = control_grid(save_pred = TRUE, extract = identity), - metrics = cluster_metric_set(sse_within, sse_total, sse_ratio) + metrics = cluster_metric_set(sse_within_total, sse_total, sse_ratio) ) res