Open
Description
Hi Emil,
Thanks for your detailed description about the lower speed of tglkmeans
( #62 ).
The issue about the speed is something that can be corrected when using tglkmeans
in a parallelized way.
But the relevant aspect of tglkmeans
with respect to kmeans
is that it finds better cluster centers. tglkmeans
is initialized in a different way than kmeans
and it gets the right centers better than kmeans
.
Please consider this code:
library(tidymodels)
library(tidyclust)
library(tglkmeans)
library(recipes)
library(tibble)
# Simulate five 2-D Gaussian blobs of 50 points each (sd = 0.3), centred on
# (1, 1), (2, 2), ..., (5, 5). The fixed seed makes the draw reproducible.
set.seed(1234)
data <- do.call(
  rbind,
  lapply(
    c(1, 2, 3, 4, 5),
    function(centre) matrix(rnorm(100, mean = centre, sd = 0.3), ncol = 2)
  )
)
colnames(data) <- c("x", "y")
# Downstream code works on a data frame rather than a matrix.
data <- as.data.frame(data)
#------------------ SMALL --------------------
# Cluster the same data with tglkmeans and with stats::kmeans (k = 5), then
# report the percentage of rows whose cluster numbers differ between the two.
km <- TGL_kmeans_tidy(data, 5)
kmstd <- kmeans(data, 5)
# Store an id -> kmeans-label lookup on the kmeans result. The ids are the row
# numbers as character, presumably to match the rowname-based ids that
# TGL_kmeans_tidy falls back to (see its warning in the output below).
kmstd$clust <- tibble(id = as.character(1:nrow(data)), clustkmstd = kmstd$cluster)
# The join key is `id` (see the "Joining with" message in the output below).
# compa is 1 where both algorithms gave the row the same cluster number.
# NOTE(review): cluster numbers are arbitrary labels assigned independently by
# each algorithm, so two identical partitions can still score as a 100 % "error"
# if they are numbered differently. A permutation-invariant comparison
# (e.g. a contingency table of clust vs clustkmstd) is needed to measure
# agreement. kmeans() also starts from random centers, so the value changes
# from run to run.
d <- left_join(km$cluster, kmstd$clust) %>%
mutate( compa = ifelse(clust == clustkmstd, 1, 0))
# Percentage of rows with equal labels, and its complement.
right_val <- sum(d$compa) * 100 / nrow(d)
error_val <- 100 - right_val
# NOTE(review): with the 250-row simulated data this value must be a multiple
# of 0.4; the reported 67.84983 is not, so that run presumably used a different
# `data` (e.g. one left over in the session) - confirm.
error_val
#------------------ MEDIUM --------------------
# Build an all-numeric version of `ames`: dummy-encode the nominal predictors,
# drop zero-variance columns, and normalize every predictor.
# NOTE(review): `ames` is not defined in this script; presumably it is the
# dataset made available by library(tidymodels) - confirm.
rec <- recipe(~., data = ames) |>
step_dummy(all_nominal_predictors()) |>
step_zv(all_predictors()) |>
step_normalize(all_predictors())
# Estimate the recipe and return the processed training data.
ames_num <- prep(rec) |>
bake(new_data = NULL)
data <- ames_num
# Same comparison as in the other sections, here with k = 4.
km <- TGL_kmeans_tidy(data, 4)
kmstd <- kmeans(data, 4)
# id -> kmeans-label lookup; ids are row numbers as character.
kmstd$clust <- tibble(id = as.character(1:nrow(data)), clustkmstd = kmstd$cluster)
# compa is 1 where both algorithms gave the row the same cluster number.
# NOTE(review): cluster numbers are arbitrary per algorithm and per run, so
# `clust == clustkmstd` does not measure whether the partitions agree; a
# permutation-invariant comparison is needed.
d <- left_join(km$cluster, kmstd$clust) %>%
mutate( compa = ifelse(clust == clustkmstd, 1, 0))
# Percentage of rows with equal labels, and its complement.
right_val <- sum(d$compa) * 100 / nrow(d)
error_val <- 100 - right_val
error_val
#------------------ LARGE --------------------
# Intended to build a 1,000,000-row dataset by sampling rows of ames_num.
# NOTE(review): slice_sample() is called without replace = TRUE, so it cannot
# return more rows than ames_num contains. The reported result (95.93857)
# equals 100 - 119 * 100 / 2930, which suggests only 2930 rows were used -
# confirm, and add replace = TRUE if a truly large sample is wanted.
ames_num_big <- ames_num |>
slice_sample(n = 1000000)
data <- ames_num_big
# Same comparison as in the other sections, with k = 4.
km <- TGL_kmeans_tidy(data, 4)
kmstd <- kmeans(data, 4)
# id -> kmeans-label lookup; ids are row numbers as character.
kmstd$clust <- tibble(id = as.character(1:nrow(data)), clustkmstd = kmstd$cluster)
# compa is 1 where both algorithms gave the row the same cluster number.
# NOTE(review): cluster numbers are arbitrary per algorithm and per run, so
# `clust == clustkmstd` does not measure whether the partitions agree; a
# permutation-invariant comparison is needed.
d <- left_join(km$cluster, kmstd$clust) %>%
mutate( compa = ifelse(clust == clustkmstd, 1, 0))
# Percentage of rows with equal labels, and its complement.
right_val <- sum(d$compa) * 100 / nrow(d)
error_val <- 100 - right_val
error_val
Which produces these results:
> #------------------ SMALL --------------------
> km <- TGL_kmeans_tidy(data, 5)
Warning message:
In TGL_kmeans_tidy(data, 5) :
Input doesn't have a column named "id". Using rownames instead.
> kmstd <- kmeans(data, 5)
> kmstd$clust <- tibble(id = as.character(1:nrow(data)), clustkmstd = kmstd$cluster)
>
> d <- left_join(km$cluster, kmstd$clust) %>%
+ mutate( compa = ifelse(clust == clustkmstd, 1, 0))
Joining with `by = join_by(id)`
>
> right_val <- sum(d$compa) * 100 / nrow(d)
> error_val <- 100 - right_val
> error_val
[1] 67.84983
>
>
>
> #------------------ MEDIUM --------------------
> rec <- recipe(~., data = ames) |>
+ step_dummy(all_nominal_predictors()) |>
+ step_zv(all_predictors()) |>
+ step_normalize(all_predictors())
>
> ames_num <- prep(rec) |>
+ bake(new_data = NULL)
>
> data <- ames_num
>
> km <- TGL_kmeans_tidy(data, 4)
Warning message:
In TGL_kmeans_tidy(data, 4) :
Input doesn't have a column named "id". Using rownames instead.
> kmstd <- kmeans(data, 4)
> kmstd$clust <- tibble(id = as.character(1:nrow(data)), clustkmstd = kmstd$cluster)
>
> d <- left_join(km$cluster, kmstd$clust) %>%
+ mutate( compa = ifelse(clust == clustkmstd, 1, 0))
Joining with `by = join_by(id)`
>
> right_val <- sum(d$compa) * 100 / nrow(d)
> error_val <- 100 - right_val
> error_val
[1] 24.57338
>
>
>
> #------------------ LARGE --------------------
> ames_num_big <- ames_num |>
+ slice_sample(n = 1000000)
>
> data <- ames_num_big
>
> km <- TGL_kmeans_tidy(data, 4)
Warning message:
In TGL_kmeans_tidy(data, 4) :
Input doesn't have a column named "id". Using rownames instead.
> kmstd <- kmeans(data, 4)
> kmstd$clust <- tibble(id = as.character(1:nrow(data)), clustkmstd = kmstd$cluster)
>
> d <- left_join(km$cluster, kmstd$clust) %>%
+ mutate( compa = ifelse(clust == clustkmstd, 1, 0))
Joining with `by = join_by(id)`
>
> right_val <- sum(d$compa) * 100 / nrow(d)
> error_val <- 100 - right_val
> error_val
[1] 95.93857
>
Thanks again,
Carlos.